Schuhgröße: MAE “zu Fuß”
import pandas as pd
# Name of the CSV file with the synthetic data set to load.
datei = "synthetische_daten_3bis40_mit_BMI_Kategorien.csv"
# Earlier manual approach, kept for reference: read the raw text and split it
# into lines by hand.
#with open(datei) as f:
#    mein_text = f.read()
#mein_text.split("\n")
# Let pandas parse the CSV into a DataFrame instead.
df = pd.read_csv(datei)
# Preview the first rows.
df.head()
| | ID | Alter_Jahre | Geschlecht | Körpergröße_cm | Gewicht_kg | BMI | BMI_Kategorie | Schuhgröße_EU |
|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 13 | w | 160.2 | 51.1 | 19.9 | Normalgewicht | 38 |
| 1 | 2 | 7 | m | 128.1 | 23.6 | 14.4 | Untergewichtig | 31 |
| 2 | 3 | 4 | w | 100.7 | 16.5 | 16.3 | Normalgewicht | 25 |
| 3 | 4 | 11 | w | 148.9 | 39.5 | 17.8 | Normalgewicht | 36 |
| 4 | 5 | 9 | w | 129.7 | 30.1 | 17.9 | Normalgewicht | 31 |
from sklearn.model_selection import train_test_split
# Regression target: the EU shoe size column.
y = df["Schuhgröße_EU"]
# Selecting a single column yields a pandas Series (see the output below).
type(y)
pandas.core.series.Series
# List the column names available in the DataFrame.
df.columns
Index(['ID', 'Alter_Jahre', 'Geschlecht', 'Körpergröße_cm', 'Gewicht_kg',
'BMI', 'BMI_Kategorie', 'Schuhgröße_EU'],
dtype='object')
# Feature matrix: every column except the two text-valued ones
# ("Geschlecht", "BMI_Kategorie") and the target ("Schuhgröße_EU").
# NOTE(review): "ID" stays in X; it looks like a running row number
# (1..1500) -- confirm whether it should really be a model input.
X = df.drop(columns=["Geschlecht", "Schuhgröße_EU", "BMI_Kategorie"])
X
| | ID | Alter_Jahre | Körpergröße_cm | Gewicht_kg | BMI |
|---|---|---|---|---|---|
| 0 | 1 | 13 | 160.2 | 51.1 | 19.9 |
| 1 | 2 | 7 | 128.1 | 23.6 | 14.4 |
| 2 | 3 | 4 | 100.7 | 16.5 | 16.3 |
| 3 | 4 | 11 | 148.9 | 39.5 | 17.8 |
| 4 | 5 | 9 | 129.7 | 30.1 | 17.9 |
| ... | ... | ... | ... | ... | ... |
| 1495 | 1496 | 25 | 170.6 | 57.0 | 19.6 |
| 1496 | 1497 | 30 | 164.3 | 54.3 | 20.1 |
| 1497 | 1498 | 35 | 167.0 | 69.4 | 24.9 |
| 1498 | 1499 | 36 | 168.8 | 62.1 | 21.8 |
| 1499 | 1500 | 33 | 179.3 | 81.7 | 25.4 |
1500 rows × 5 columns
# Summary statistics (count, mean, std, min, quartiles, max) per feature column.
X.describe()
| | ID | Alter_Jahre | Körpergröße_cm | Gewicht_kg | BMI |
|---|---|---|---|---|---|
| count | 1500.000000 | 1500.000000 | 1500.000000 | 1500.000000 | 1500.000000 |
| mean | 750.500000 | 18.227333 | 153.753067 | 52.699200 | 20.692067 |
| std | 433.157015 | 10.560587 | 25.734906 | 23.107748 | 4.288830 |
| min | 1.000000 | 3.000000 | 76.200000 | 7.800000 | 8.500000 |
| 25% | 375.750000 | 9.000000 | 135.700000 | 30.900000 | 17.100000 |
| 50% | 750.500000 | 17.000000 | 162.950000 | 57.700000 | 20.900000 |
| 75% | 1125.250000 | 27.000000 | 173.000000 | 71.200000 | 24.000000 |
| max | 1500.000000 | 40.000000 | 197.300000 | 111.500000 | 33.100000 |
from sklearn.model_selection import train_test_split
# Split features and target into a training and a validation part.
# random_state=0 fixes the shuffle so the split is reproducible; no test_size
# is passed, so scikit-learn's default proportion is used (the shapes printed
# below show 1125 training vs. 375 validation rows).
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)
train_X.shape
(1125, 5)
# Shape of the validation features, for comparison with the training split.
val_X.shape
(375, 5)
from sklearn.linear_model import LinearRegression
# Create a linear regression model with default parameters and fit it on the
# training split only; the validation rows are not passed to fit().
reg = LinearRegression()
reg.fit(train_X, train_y)
LinearRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Parameters
| fit_intercept | True | |
| copy_X | True | |
| tol | 1e-06 | |
| n_jobs | None | |
| positive | False |
# Inspect the validation features that are passed to the model for prediction.
val_X
| | ID | Alter_Jahre | Körpergröße_cm | Gewicht_kg | BMI |
|---|---|---|---|---|---|
| 471 | 472 | 16 | 162.0 | 55.6 | 21.2 |
| 9 | 10 | 13 | 156.3 | 46.9 | 19.2 |
| 1499 | 1500 | 33 | 179.3 | 81.7 | 25.4 |
| 54 | 55 | 10 | 147.6 | 31.2 | 14.3 |
| 1411 | 1412 | 27 | 184.0 | 75.2 | 22.2 |
| ... | ... | ... | ... | ... | ... |
| 75 | 76 | 5 | 105.4 | 16.3 | 14.7 |
| 481 | 482 | 13 | 159.5 | 56.2 | 22.1 |
| 1350 | 1351 | 38 | 169.7 | 68.8 | 23.9 |
| 748 | 749 | 10 | 136.8 | 34.2 | 18.3 |
| 1362 | 1363 | 35 | 167.0 | 77.3 | 27.7 |
375 rows × 5 columns
# Predict shoe sizes for the validation features.
predict_y = reg.predict( val_X )
# Sanity check: there should be exactly one prediction per validation target.
len(predict_y), len(val_y)
(375, 375)
#predict_y[:10]
#val_y[:10]
# Absolute error |true - predicted| for the first 10 validation samples.
# list() is used to pair values by position: val_y comes out of the shuffled
# split, so label-based access such as val_y[i] would not line up with the
# prediction array. Each sequence is materialised once (not on every loop
# pass) and sliced, so fewer than 10 samples no longer raise an IndexError.
differenz = [
    abs(v - p)
    for v, p in zip(list(val_y)[:10], list(predict_y)[:10])
]
differenz
[np.float64(0.2780887017681408),
np.float64(0.44371682153970227),
np.float64(0.4054127305193447),
np.float64(0.5014711798481528),
np.float64(0.31780106007500564),
np.float64(0.010436596802485099),
np.float64(0.2955949140213576),
np.float64(0.06754289132536684),
np.float64(0.36750307699773543),
np.float64(0.09606540023445831)]
# Mean absolute error "by hand": average of the collected absolute differences.
# NOTE: differenz holds only the first 10 validation samples, so this is the
# MAE of that subset, not of all 375 validation rows.
MAE = sum( differenz ) / len( differenz )
MAE
np.float64(0.278363337313175)