Schuhgröße – Geschlecht

Schuhgröße – Geschlecht

import pandas as pd
# Path of the CSV file that holds the data set.
datei = "synthetische_daten_3bis40_mit_BMI_Kategorien.csv"
# Load the table and use its "ID" column as the row index.
df = pd.read_csv(datei, index_col="ID")
# Show the first rows as a quick check of the loaded columns.
df.head()
Alter_Jahre Geschlecht Körpergröße_cm Gewicht_kg BMI BMI_Kategorie Schuhgröße_EU
ID
1 13 w 160.2 51.1 19.9 Normalgewicht 38
2 7 m 128.1 23.6 14.4 Untergewichtig 31
3 4 w 100.7 16.5 16.3 Normalgewicht 25
4 11 w 148.9 39.5 17.8 Normalgewicht 36
5 9 w 129.7 30.1 17.9 Normalgewicht 31
from sklearn.model_selection import train_test_split
# Target variable: the "Geschlecht" column, later passed to the model as labels.
y = df["Geschlecht"]
# Check which container type the column selection returned.
type(y)
pandas.core.series.Series
# List all column names of the loaded table.
df.columns
Index(['Alter_Jahre', 'Geschlecht', 'Körpergröße_cm', 'Gewicht_kg', 'BMI',
       'BMI_Kategorie', 'Schuhgröße_EU'],
      dtype='object')
# Feature matrix: every column except the target "Geschlecht" and the
# text-valued "BMI_Kategorie" (axis=1 drops columns, not rows).
X = df.drop(["Geschlecht" , "BMI_Kategorie" ] , axis=1)
# Display the remaining feature columns.
X
Alter_Jahre Körpergröße_cm Gewicht_kg BMI Schuhgröße_EU
ID
1 13 160.2 51.1 19.9 38
2 7 128.1 23.6 14.4 31
3 4 100.7 16.5 16.3 25
4 11 148.9 39.5 17.8 36
5 9 129.7 30.1 17.9 31
... ... ... ... ... ...
1496 25 170.6 57.0 19.6 41
1497 30 164.3 54.3 20.1 39
1498 35 167.0 69.4 24.9 40
1499 36 168.8 62.1 21.8 40
1500 33 179.3 81.7 25.4 43

1500 rows × 5 columns

# Summary statistics (count, mean, std, min, quartiles, max) per feature column.
X.describe()
Alter_Jahre Körpergröße_cm Gewicht_kg BMI Schuhgröße_EU
count 1500.000000 1500.000000 1500.000000 1500.000000 1500.000000
mean 18.227333 153.753067 52.699200 20.692067 36.854667
std 10.560587 25.734906 23.107748 4.288830 5.804711
min 3.000000 76.200000 7.800000 8.500000 19.000000
25% 9.000000 135.700000 30.900000 17.100000 33.000000
50% 17.000000 162.950000 57.700000 20.900000 39.000000
75% 27.000000 173.000000 71.200000 24.000000 41.000000
max 40.000000 197.300000 111.500000 33.100000 47.000000
from sklearn.model_selection import train_test_split

# Split features and target into a training part and a validation part;
# random_state=0 fixes the shuffle so the split is reproducible.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)
# Size of the training part as (rows, columns).
train_X.shape
(1125, 5)
# Size of the validation part as (rows, columns).
val_X.shape
(375, 5)
from sklearn.linear_model import LogisticRegression
# Create the classifier with scikit-learn's default settings.
reg = LogisticRegression() # "regression" in the broader sense: it also handles classes
# Train on the training split only; the validation split is not passed in here.
reg.fit(train_X, train_y)
LogisticRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# Show the validation features that are passed to reg.predict below.
val_X
Alter_Jahre Körpergröße_cm Gewicht_kg BMI Schuhgröße_EU
ID
472 16 162.0 55.6 21.2 39
10 13 156.3 46.9 19.2 37
1500 33 179.3 81.7 25.4 43
55 10 147.6 31.2 14.3 35
1412 27 184.0 75.2 22.2 44
... ... ... ... ... ...
76 5 105.4 16.3 14.7 26
482 13 159.5 56.2 22.1 38
1351 38 169.7 68.8 23.9 40
749 10 136.8 34.2 18.3 33
1363 35 167.0 77.3 27.7 40

375 rows × 5 columns

# Predict a class label for every row of the validation features.
predict_y = reg.predict( val_X ) 
# Compare the number of predictions with the number of validation labels.
len(predict_y), len(val_y)
(375, 375)
# First ten predicted labels.
predict_y[:10]
array(['w', 'w', 'm', 'm', 'm', 'm', 'm', 'w', 'm', 'm'], dtype=object)
# First ten true labels, for comparison with the first ten predictions.
list(val_y[:10])
['w', 'm', 'm', 'm', 'm', 'm', 'm', 'w', 'w', 'm']
"""
differenz = []
for i in range(0,10):
    v = list(val_y)[i]
    p = list(predict_y)[i]
    
    # print( f"val: {v}, predict: {p}")
    differenz.append( abs(v-p) )
differenz
MAE = sum( differenz ) / len( differenz )
MAE
"""
'\ndifferenz = []\nfor i in range(0,10):\n    v = list(val_y)[i]\n    p = list(predict_y)[i]\n\n    # print( f"val: {v}, predict: {p}")\n    differenz.append( abs(v-p) )\ndifferenz\nMAE = sum( differenz ) / len( differenz )\nMAE\n'
# Confusion matrix as a nested dict of counts:
# outer key = true label (taken from val_y),
# inner key = predicted label (taken from predict_y).
cm = {
    "m": {"m": 0, "w": 0},
    "w": {"m": 0, "w": 0},
}
# Walk both sequences in lockstep. The previous version rebuilt
# list(val_y) and list(predict_y) on every iteration, which made the
# loop quadratic in the number of validation rows.
for true_label, predicted_label in zip(val_y, predict_y):
    cm[true_label][predicted_label] += 1
# Display the finished counts.
cm
{'m': {'m': 105, 'w': 72}, 'w': {'m': 49, 'w': 149}}

Diskussion siehe https://en.wikipedia.org/wiki/Confusion_matrix