Skizze Wollknäuel#
Mathilda Musterfrau
s-mmuster@haw…
MatNr: 12 34 567
Problemstellungen#
Beschrieben unter http://jbusse.de/dsci-ml_ws2022/Studienarbeit-SS-2023.html:
abschätzen Körpergröße
abschätzen Geschlecht
# Relative path to the CSV data set that is loaded with pd.read_csv below.
meine_Datei = "../data/MaennerFrauenKnaeuel.csv"
EDA Explorative Datenanalyse#
import pandas as pd
# Load the semicolon-separated CSV into a DataFrame and preview the first rows.
# NOTE(review): in the rows shown below, the 'Unnamed: 0' column repeats the
# row index -- presumably an index that was written into the file; confirm
# before using it as a feature.
df = pd.read_csv(meine_Datei, sep=";")
df.head()
| | Unnamed: 0 | age | height | spezies |
|---|---|---|---|---|
| 0 | 0 | 0.0 | 60 | b |
| 1 | 1 | 5.5 | 88 | b |
| 2 | 2 | 13.8 | 0 | b |
| 3 | 3 | 4.1 | 91 | b |
| 4 | 4 | 13.8 | 165 | b |
# Summary statistics of the numeric columns.
# NOTE(review): the output below reports min(height) = -6 and the preview
# contains height = 0 -- these look like implausible measurements; check
# whether they need cleaning before modelling.
df.describe()
| | Unnamed: 0 | age | height |
|---|---|---|---|
| count | 1260.000000 | 1260.000000 | 1260.000000 |
| mean | 629.500000 | 25.519444 | 105.180952 |
| std | 363.874979 | 23.836270 | 70.534624 |
| min | 0.000000 | 0.000000 | -6.000000 |
| 25% | 314.750000 | 2.900000 | 30.000000 |
| 50% | 629.500000 | 17.800000 | 127.000000 |
| 75% | 944.250000 | 44.600000 | 169.000000 |
| max | 1259.000000 | 80.000000 | 208.000000 |
# Dimensions of the DataFrame as (rows, columns).
df.shape
(1260, 4)
# Column labels of the DataFrame.
df.columns
Index(['Unnamed: 0', 'age', 'height', 'spezies'], dtype='object')
# Distinct labels in the 'spezies' column.
# NOTE(review): the result mixes codes such as 'm'/'M' and 'w'/'F'; these may
# be different spellings of the same group -- confirm against the assignment
# description before using the column.
df.spezies.unique()
array(['b', 'g', 'm', 'M', 'w', 'F', 'K'], dtype=object)
Problem 1: Abschätzen Körpergröße#
# Target variable: pop() removes the 'height' column from df in place and
# returns it, so df no longer contains 'height' afterwards (re-running this
# cell without reloading df raises KeyError).
y = df.pop("height")
y
0 60
1 88
2 0
3 91
4 165
...
1255 104
1256 208
1257 112
1258 0
1259 140
Name: height, Length: 1260, dtype: int64
# Feature matrix: X is another name for the same df object (no copy is made).
# It still holds 'Unnamed: 0', 'age' and the string column 'spezies'.
X = df
X
| | Unnamed: 0 | age | spezies |
|---|---|---|---|
| 0 | 0 | 0.0 | b |
| 1 | 1 | 5.5 | b |
| 2 | 2 | 13.8 | b |
| 3 | 3 | 4.1 | b |
| 4 | 4 | 13.8 | b |
| ... | ... | ... | ... |
| 1255 | 1255 | 28.1 | K |
| 1256 | 1256 | 3.2 | K |
| 1257 | 1257 | 0.0 | K |
| 1258 | 1258 | 29.9 | K |
| 1259 | 1259 | 12.9 | K |
1260 rows × 3 columns
from sklearn.tree import DecisionTreeRegressor
# Define model. Specify a number for random_state to ensure same results each run
melbourne_model = DecisionTreeRegressor(random_state=1)
# Fit model
melbourne_model.fit(X, y)
# NOTE: the fit call above fails with
#   ValueError: could not convert string to float: 'b'
# (see the traceback below). X still contains the string column 'spezies',
# and the estimator converts X to a float array during input validation.
# TODO(review): encode 'spezies' numerically (e.g. pd.get_dummies(X,
# columns=["spezies"])) or drop it before fitting; also decide whether
# 'Unnamed: 0' should be a feature at all -- it appears to be a row counter.
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/tmp/ipykernel_13285/839580878.py in ?()
3 # Define model. Specify a number for random_state to ensure same results each run
4 melbourne_model = DecisionTreeRegressor(random_state=1)
5
6 # Fit model
----> 7 melbourne_model.fit(X, y)
~/miniconda3/lib/python3.13/site-packages/sklearn/base.py in ?(estimator, *args, **kwargs)
1361 skip_parameter_validation=(
1362 prefer_skip_nested_validation or global_skip_validation
1363 )
1364 ):
-> 1365 return fit_method(estimator, *args, **kwargs)
~/miniconda3/lib/python3.13/site-packages/sklearn/tree/_classes.py in ?(self, X, y, sample_weight, check_input)
1400 self : DecisionTreeRegressor
1401 Fitted estimator.
1402 """
1403
-> 1404 super()._fit(
1405 X,
1406 y,
1407 sample_weight=sample_weight,
~/miniconda3/lib/python3.13/site-packages/sklearn/tree/_classes.py in ?(self, X, y, sample_weight, check_input, missing_values_in_feature_mask)
248 check_X_params = dict(
249 dtype=DTYPE, accept_sparse="csc", ensure_all_finite=False
250 )
251 check_y_params = dict(ensure_2d=False, dtype=None)
--> 252 X, y = validate_data(
253 self, X, y, validate_separately=(check_X_params, check_y_params)
254 )
255
~/miniconda3/lib/python3.13/site-packages/sklearn/utils/validation.py in ?(_estimator, X, y, reset, validate_separately, skip_check_array, **check_params)
2962 # :(
2963 check_X_params, check_y_params = validate_separately
2964 if "estimator" not in check_X_params:
2965 check_X_params = {**default_check_params, **check_X_params}
-> 2966 X = check_array(X, input_name="X", **check_X_params)
2967 if "estimator" not in check_y_params:
2968 check_y_params = {**default_check_params, **check_y_params}
2969 y = check_array(y, input_name="y", **check_y_params)
~/miniconda3/lib/python3.13/site-packages/sklearn/utils/validation.py in ?(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_writeable, force_all_finite, ensure_all_finite, ensure_non_negative, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)
1050 )
1051 array = xp.astype(array, dtype, copy=False)
1052 else:
1053 array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
-> 1054 except ComplexWarning as complex_warning:
1055 raise ValueError(
1056 "Complex data not supported\n{}\n".format(array)
1057 ) from complex_warning
~/miniconda3/lib/python3.13/site-packages/sklearn/utils/_array_api.py in ?(array, dtype, order, copy, xp, device)
753 # Use NumPy API to support order
754 if copy is True:
755 array = numpy.array(array, order=order, dtype=dtype)
756 else:
--> 757 array = numpy.asarray(array, order=order, dtype=dtype)
758
759 # At this point array is a NumPy ndarray. We convert it to an array
760 # container that is consistent with the input's namespace.
~/miniconda3/lib/python3.13/site-packages/pandas/core/generic.py in ?(self, dtype, copy)
2164 )
2165 values = self._values
2166 if copy is None:
2167 # Note: branch avoids `copy=None` for NumPy 1.x support
-> 2168 arr = np.asarray(values, dtype=dtype)
2169 else:
2170 arr = np.array(values, dtype=dtype, copy=copy)
2171
ValueError: could not convert string to float: 'b'