Schuhgröße – Geschlecht

Schuhgröße – Geschlecht

import pandas as pd
# Path of the CSV file holding the data set (synthetic data, per its file name).
datei = "synthetische_daten_3bis40_mit_BMI_Kategorien.csv"
# Load the file and use its "ID" column as the row index.
df = pd.read_csv(datei, index_col="ID")
# Preview the first rows to check that the columns were parsed as expected.
df.head()
Alter_Jahre Geschlecht Körpergröße_cm Gewicht_kg BMI BMI_Kategorie Schuhgröße_EU
ID
1 20 w 165.6 58.1 21.2 Normalgewicht 40
2 4 m 101.1 11.8 11.5 Untergewichtig 25
3 12 m 147.5 35.7 16.4 Normalgewicht 35
4 19 m 177.4 68.0 21.6 Normalgewicht 42
5 6 m 122.0 22.8 15.3 Normalgewicht 30
from sklearn.model_selection import train_test_split
# Target variable: the "Geschlecht" column, which holds the class labels
# the model is supposed to predict.
y = df["Geschlecht"]
# Check which type a single-column selection returns.
type(y)
pandas.core.series.Series
# List the available column names.
df.columns
Index(['Alter_Jahre', 'Geschlecht', 'Körpergröße_cm', 'Gewicht_kg', 'BMI',
       'BMI_Kategorie', 'Schuhgröße_EU'],
      dtype='object')
# Feature matrix: all columns except the target ("Geschlecht") and the
# text-valued "BMI_Kategorie" column; only numeric columns remain.
X = df.drop(columns=["Geschlecht", "BMI_Kategorie"])
X
Alter_Jahre Körpergröße_cm Gewicht_kg BMI Schuhgröße_EU
ID
1 20 165.6 58.1 21.2 40
2 4 101.1 11.8 11.5 25
3 12 147.5 35.7 16.4 35
4 19 177.4 68.0 21.6 42
5 6 122.0 22.8 15.3 30
... ... ... ... ... ...
1496 22 189.3 88.9 24.8 45
1497 35 166.1 62.6 22.7 40
1498 28 165.2 65.5 24.0 39
1499 39 172.3 68.0 22.9 41
1500 32 180.6 82.8 25.4 43

1500 rows × 5 columns

# Summary statistics (count, mean, std, min, quartiles, max) per feature column.
X.describe()
Alter_Jahre Körpergröße_cm Gewicht_kg BMI Schuhgröße_EU
count 1500.000000 1500.000000 1500.000000 1500.000000 1500.000000
mean 18.564667 154.977533 53.709733 20.777133 37.106667
std 10.557531 25.751946 23.149581 4.299159 5.803138
min 3.000000 81.300000 7.000000 9.500000 21.000000
25% 10.000000 138.500000 31.975000 17.400000 33.000000
50% 17.000000 164.700000 58.900000 21.100000 39.000000
75% 27.000000 173.800000 71.150000 23.800000 41.000000
max 40.000000 194.400000 110.300000 33.500000 46.000000
from sklearn.model_selection import train_test_split

# Split features and target into a training part and a validation part.
# random_state=0 fixes the shuffling so the split is the same on every run.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)
# Size (rows, columns) of the training features.
train_X.shape
(1125, 5)
# Size (rows, columns) of the validation features.
val_X.shape
(375, 5)
from sklearn.linear_model import LogisticRegression
# Create the classifier with scikit-learn's default settings.
reg = LogisticRegression() # "regression" in the broader sense: it also covers classes
# Fit the model on the training features and their labels.
reg.fit(train_X, train_y)
LogisticRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# Inspect the validation features that are passed to the model for prediction.
val_X
Alter_Jahre Körpergröße_cm Gewicht_kg BMI Schuhgröße_EU
ID
472 15 170.9 65.1 22.3 41
10 8 125.7 21.0 13.3 31
1500 32 180.6 82.8 25.4 43
55 3 102.8 16.1 15.2 25
1412 37 165.5 63.5 23.2 39
... ... ... ... ... ...
76 4 105.5 20.3 18.2 26
482 6 107.9 18.9 16.2 27
1351 22 179.9 85.1 26.3 43
749 7 122.4 20.7 13.8 30
1363 22 187.7 88.4 25.1 44

375 rows × 5 columns

# Predict a label for every row of the validation features.
predict_y = reg.predict( val_X ) 
# Sanity check: there should be exactly one prediction per true validation label.
len(predict_y), len(val_y)
(375, 375)
# First ten predicted labels.
predict_y[:10]
array(['m', 'm', 'm', 'm', 'w', 'm', 'm', 'w', 'm', 'w'], dtype=object)
# First ten true labels as a plain list, for a side-by-side comparison
# with the predictions.
list(val_y[:10])
['m', 'm', 'm', 'w', 'w', 'm', 'm', 'w', 'm', 'w']
# Disabled draft of a mean-absolute-error (MAE) computation, wrapped in a bare
# string literal so that it is not executed.
# NOTE(review): abs(v - p) is not defined for the string labels 'm' / 'w' that
# predict_y and val_y hold, which is presumably why the draft was disabled;
# the confusion-matrix count is the applicable evaluation for class labels.
"""
differenz = []
for i in range(0,10):
    v = list(val_y)[i]
    p = list(predict_y)[i]
    
    # print( f"val: {v}, predict: {p}")
    differenz.append( abs(v-p) )
differenz
MAE = sum( differenz ) / len( differenz )
MAE
"""
'\ndifferenz = []\nfor i in range(0,10):\n    v = list(val_y)[i]\n    p = list(predict_y)[i]\n\n    # print( f"val: {v}, predict: {p}")\n    differenz.append( abs(v-p) )\ndifferenz\nMAE = sum( differenz ) / len( differenz )\nMAE\n'
# Confusion matrix as a nested dict: the outer key is the true label (from
# val_y), the inner key is the label predicted by the model (from predict_y),
# and the value counts how often that (true, predicted) pair occurs.
cm = {"m": {"m": 0, "w": 0},
      "w": {"m": 0, "w": 0}}

# zip pairs each true label with the prediction at the same position in a
# single pass, so neither sequence has to be copied into a list per iteration.
for v, p in zip(val_y, predict_y):
    cm[v][p] += 1
cm
{'m': {'m': 136, 'w': 70}, 'w': {'m': 48, 'w': 121}}

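Zur Diskussion der Konfusionsmatrix (äußerer Schlüssel = tatsächliche Klasse, innerer Schlüssel = vorhergesagte Klasse) siehe https://en.wikipedia.org/wiki/Confusion_matrix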