Schuhgröße – Geschlecht
import pandas as pd
# Path of the CSV file that holds the data set.
datei = "synthetische_daten_3bis40_mit_BMI_Kategorien.csv"
# Use the "ID" column as the row index of the DataFrame.
df = pd.read_csv(datei, index_col="ID")
# Preview the first rows.
df.head()
| Alter_Jahre | Geschlecht | Körpergröße_cm | Gewicht_kg | BMI | BMI_Kategorie | Schuhgröße_EU | |
|---|---|---|---|---|---|---|---|
| ID | |||||||
| 1 | 13 | w | 160.2 | 51.1 | 19.9 | Normalgewicht | 38 |
| 2 | 7 | m | 128.1 | 23.6 | 14.4 | Untergewichtig | 31 |
| 3 | 4 | w | 100.7 | 16.5 | 16.3 | Normalgewicht | 25 |
| 4 | 11 | w | 148.9 | 39.5 | 17.8 | Normalgewicht | 36 |
| 5 | 9 | w | 129.7 | 30.1 | 17.9 | Normalgewicht | 31 |
from sklearn.model_selection import train_test_split
# Target variable: the "Geschlecht" column (the preview shows 'm'/'w' labels).
y = df["Geschlecht"]
# Selecting a single column yields a pandas Series.
type(y)
pandas.core.series.Series
# List all column names of the DataFrame.
df.columns
Index(['Alter_Jahre', 'Geschlecht', 'Körpergröße_cm', 'Gewicht_kg', 'BMI',
'BMI_Kategorie', 'Schuhgröße_EU'],
dtype='object')
# Feature matrix: every column except the target ("Geschlecht") and the
# text-valued "BMI_Kategorie" column.
X = df.drop(columns=["Geschlecht", "BMI_Kategorie"])
# Display the resulting features.
X
| Alter_Jahre | Körpergröße_cm | Gewicht_kg | BMI | Schuhgröße_EU | |
|---|---|---|---|---|---|
| ID | |||||
| 1 | 13 | 160.2 | 51.1 | 19.9 | 38 |
| 2 | 7 | 128.1 | 23.6 | 14.4 | 31 |
| 3 | 4 | 100.7 | 16.5 | 16.3 | 25 |
| 4 | 11 | 148.9 | 39.5 | 17.8 | 36 |
| 5 | 9 | 129.7 | 30.1 | 17.9 | 31 |
| ... | ... | ... | ... | ... | ... |
| 1496 | 25 | 170.6 | 57.0 | 19.6 | 41 |
| 1497 | 30 | 164.3 | 54.3 | 20.1 | 39 |
| 1498 | 35 | 167.0 | 69.4 | 24.9 | 40 |
| 1499 | 36 | 168.8 | 62.1 | 21.8 | 40 |
| 1500 | 33 | 179.3 | 81.7 | 25.4 | 43 |
1500 rows × 5 columns
# Summary statistics (count, mean, std, min, quartiles, max) per feature column.
X.describe()
| Alter_Jahre | Körpergröße_cm | Gewicht_kg | BMI | Schuhgröße_EU | |
|---|---|---|---|---|---|
| count | 1500.000000 | 1500.000000 | 1500.000000 | 1500.000000 | 1500.000000 |
| mean | 18.227333 | 153.753067 | 52.699200 | 20.692067 | 36.854667 |
| std | 10.560587 | 25.734906 | 23.107748 | 4.288830 | 5.804711 |
| min | 3.000000 | 76.200000 | 7.800000 | 8.500000 | 19.000000 |
| 25% | 9.000000 | 135.700000 | 30.900000 | 17.100000 | 33.000000 |
| 50% | 17.000000 | 162.950000 | 57.700000 | 20.900000 | 39.000000 |
| 75% | 27.000000 | 173.000000 | 71.200000 | 24.000000 | 41.000000 |
| max | 40.000000 | 197.300000 | 111.500000 | 33.100000 | 47.000000 |
from sklearn.model_selection import train_test_split
# Split features and labels into a training and a validation part.
# random_state=0 fixes the shuffle so the split is reproducible; no test_size
# is passed, so scikit-learn's default split ratio applies.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)
# Number of training rows and feature columns.
train_X.shape
(1125, 5)
# Number of validation rows and feature columns.
val_X.shape
(375, 5)
from sklearn.linear_model import LogisticRegression
# Create the classifier with its default parameters.
reg = LogisticRegression() # regression in the broader sense: classes as well
# Fit on the training split only; the validation split stays unseen.
reg.fit(train_X, train_y)
LogisticRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Parameters
| penalty | 'l2' | |
| dual | False | |
| tol | 0.0001 | |
| C | 1.0 | |
| fit_intercept | True | |
| intercept_scaling | 1 | |
| class_weight | None | |
| random_state | None | |
| solver | 'lbfgs' | |
| max_iter | 100 | |
| multi_class | 'deprecated' | |
| verbose | 0 | |
| warm_start | False | |
| n_jobs | None | |
| l1_ratio | None |
# Display the validation features.
val_X
| Alter_Jahre | Körpergröße_cm | Gewicht_kg | BMI | Schuhgröße_EU | |
|---|---|---|---|---|---|
| ID | |||||
| 472 | 16 | 162.0 | 55.6 | 21.2 | 39 |
| 10 | 13 | 156.3 | 46.9 | 19.2 | 37 |
| 1500 | 33 | 179.3 | 81.7 | 25.4 | 43 |
| 55 | 10 | 147.6 | 31.2 | 14.3 | 35 |
| 1412 | 27 | 184.0 | 75.2 | 22.2 | 44 |
| ... | ... | ... | ... | ... | ... |
| 76 | 5 | 105.4 | 16.3 | 14.7 | 26 |
| 482 | 13 | 159.5 | 56.2 | 22.1 | 38 |
| 1351 | 38 | 169.7 | 68.8 | 23.9 | 40 |
| 749 | 10 | 136.8 | 34.2 | 18.3 | 33 |
| 1363 | 35 | 167.0 | 77.3 | 27.7 | 40 |
375 rows × 5 columns
# Predict a label for every row of the validation features.
predict_y = reg.predict( val_X )
# Sanity check: there should be exactly one prediction per true label.
len(predict_y), len(val_y)
(375, 375)
# First ten predicted labels.
predict_y[:10]
array(['w', 'w', 'm', 'm', 'm', 'm', 'm', 'w', 'm', 'm'], dtype=object)
# First ten true labels, as a plain list for comparison with the predictions.
list(val_y[:10])
['w', 'm', 'm', 'm', 'm', 'm', 'm', 'w', 'w', 'm']
# NOTE(review): disabled draft of a mean-absolute-error check, kept as a bare
# string literal so that it does not execute. The labels in this notebook are
# the strings 'm'/'w', so abs(v - p) would raise TypeError; presumably that is
# why it was switched off -- confirm before deleting or reviving it.
"""
differenz = []
for i in range(0,10):
v = list(val_y)[i]
p = list(predict_y)[i]
# print( f"val: {v}, predict: {p}")
differenz.append( abs(v-p) )
differenz
MAE = sum( differenz ) / len( differenz )
MAE
"""
'\ndifferenz = []\nfor i in range(0,10):\n v = list(val_y)[i]\n p = list(predict_y)[i]\n\n # print( f"val: {v}, predict: {p}")\n differenz.append( abs(v-p) )\ndifferenz\nMAE = sum( differenz ) / len( differenz )\nMAE\n'
# Confusion matrix as nested counts, indexed cm[actual][predicted], for the
# two labels "m" and "w". Any other label would raise KeyError.
cm = {
    "m": {"m": 0, "w": 0},
    "w": {"m": 0, "w": 0},
}
# Walk true and predicted labels in lockstep in a single pass; converting both
# sequences to lists for every index would make the count quadratic.
for v, p in zip(val_y, predict_y):
    cm[v][p] += 1
# Echo the finished counts as the cell result.
cm
{'m': {'m': 105, 'w': 72}, 'w': {'m': 49, 'w': 149}}
Diskussion siehe https://en.wikipedia.org/wiki/Confusion_matrix