Schuhgröße: MAE “zu Fuß”

Schuhgröße: MAE “zu Fuß”

import pandas as pd
# CSV file with the synthetic body-measurement data (incl. BMI categories).
datei = "synthetische_daten_3bis40_mit_BMI_Kategorien.csv"
# Reading the file by hand (open() / read() / split("\n")) would also work,
# but pandas parses the CSV straight into a DataFrame.
df = pd.read_csv(filepath_or_buffer=datei)
# Preview the first five rows.
df.head(n=5)
ID Alter_Jahre Geschlecht Körpergröße_cm Gewicht_kg BMI BMI_Kategorie Schuhgröße_EU
0 1 20 w 165.6 58.1 21.2 Normalgewicht 40
1 2 4 m 101.1 11.8 11.5 Untergewichtig 25
2 3 12 m 147.5 35.7 16.4 Normalgewicht 35
3 4 19 m 177.4 68.0 21.6 Normalgewicht 42
4 5 6 m 122.0 22.8 15.3 Normalgewicht 30
from sklearn.model_selection import train_test_split
# Target variable: the EU shoe size column.
y = df["Schuhgröße_EU"]
# Check which type the single-column selection has.
type(y)
pandas.core.series.Series
# Show all column names of the DataFrame.
df.columns
Index(['ID', 'Alter_Jahre', 'Geschlecht', 'Körpergröße_cm', 'Gewicht_kg',
       'BMI', 'BMI_Kategorie', 'Schuhgröße_EU'],
      dtype='object')
# Feature matrix: every column except the target ("Schuhgröße_EU") and the two
# non-numeric columns ("Geschlecht", "BMI_Kategorie").
# NOTE(review): "ID" stays in X although it looks like a running row number -
# confirm whether it should really be used as a feature.
X = df.drop(columns=["Geschlecht", "Schuhgröße_EU", "BMI_Kategorie"])
X
ID Alter_Jahre Körpergröße_cm Gewicht_kg BMI
0 1 20 165.6 58.1 21.2
1 2 4 101.1 11.8 11.5
2 3 12 147.5 35.7 16.4
3 4 19 177.4 68.0 21.6
4 5 6 122.0 22.8 15.3
... ... ... ... ... ...
1495 1496 22 189.3 88.9 24.8
1496 1497 35 166.1 62.6 22.7
1497 1498 28 165.2 65.5 24.0
1498 1499 39 172.3 68.0 22.9
1499 1500 32 180.6 82.8 25.4

1500 rows × 5 columns

# Summary statistics (count, mean, std, min, quartiles, max) per feature column.
X.describe()
ID Alter_Jahre Körpergröße_cm Gewicht_kg BMI
count 1500.000000 1500.000000 1500.000000 1500.000000 1500.000000
mean 750.500000 18.564667 154.977533 53.709733 20.777133
std 433.157015 10.557531 25.751946 23.149581 4.299159
min 1.000000 3.000000 81.300000 7.000000 9.500000
25% 375.750000 10.000000 138.500000 31.975000 17.400000
50% 750.500000 17.000000 164.700000 58.900000 21.100000
75% 1125.250000 27.000000 173.800000 71.150000 23.800000
max 1500.000000 40.000000 194.400000 110.300000 33.500000
from sklearn.model_selection import train_test_split

# Hold out 25 % of the rows for validation (the train_test_split default,
# spelled out here); the fixed seed keeps the split reproducible.
train_X, val_X, train_y, val_y = train_test_split(
    X, y, test_size=0.25, random_state=0
)
train_X.shape
(1125, 5)
# Size of the validation features as (rows, columns).
val_X.shape
(375, 5)
from sklearn.linear_model import LinearRegression
# Create a linear regression model and fit it on the training split.
# fit() returns the estimator itself, so construction and fitting are chained.
reg = LinearRegression().fit(train_X, train_y)
# Display the fitted estimator as the cell result.
reg
LinearRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# Display the validation features.
val_X
ID Alter_Jahre Körpergröße_cm Gewicht_kg BMI
471 472 15 170.9 65.1 22.3
9 10 8 125.7 21.0 13.3
1499 1500 32 180.6 82.8 25.4
54 55 3 102.8 16.1 15.2
1411 1412 37 165.5 63.5 23.2
... ... ... ... ... ...
75 76 4 105.5 20.3 18.2
481 482 6 107.9 18.9 16.2
1350 1351 22 179.9 85.1 26.3
748 749 7 122.4 20.7 13.8
1362 1363 22 187.7 88.4 25.1

375 rows × 5 columns

# Predict shoe sizes for the validation features, then compare the number of
# predictions with the number of true values.
predict_y = reg.predict(val_X)
len(predict_y), len(val_y)
(375, 375)
#predict_y[:10]
#val_y[:10]
# Absolute errors |true value - prediction| for the first 10 validation rows.
# NOTE: only this 10-row sample is evaluated, not the whole validation set.
# Both sequences are converted to lists once, instead of rebuilding them on
# every loop iteration.
wahre_werte = list(val_y)
vorhersagen = list(predict_y)
differenz = [abs(v - p) for v, p in zip(wahre_werte[:10], vorhersagen[:10])]
differenz
[np.float64(0.3022913002060861),
 np.float64(0.4896670825697669),
 np.float64(0.1178661698626513),
 np.float64(0.36005438749023),
 np.float64(0.46754436728781457),
 np.float64(0.1530543092586072),
 np.float64(0.30683828995378093),
 np.float64(0.4532737267499307),
 np.float64(0.2105452874775935),
 np.float64(0.04441925830533222)]
# Mean absolute error: the average of the absolute errors in `differenz`.
MAE = sum(differenz) / len(differenz)
MAE
np.float64(0.29055541791617934)