Skizze Wollknäuel

  • Mathilda Musterfrau

  • s-mmuster@haw…

  • MatNr: 12 34 567

Problemstellungen

Beschrieben unter http://jbusse.de/dsci-ml_ws2022/Studienarbeit-SS-2023.html:

  • Körpergröße abschätzen

  • Geschlecht abschätzen

# Location of the CSV input; the relative path is resolved against the current working directory.
meine_Datei = "../data/MaennerFrauenKnaeuel.csv"

EDA Explorative Datenanalyse

import pandas as pd
# The file is semicolon-separated, hence the explicit sep.
df = pd.read_csv(meine_Datei, sep=";")
# Preview the first rows.
# NOTE(review): the "Unnamed: 0" column looks like a row index that was saved
# with the CSV - confirm, then consider reading with index_col=0.
df.head()
Unnamed: 0 age height spezies
0 0 0.0 60 b
1 1 5.5 88 b
2 2 13.8 0 b
3 3 4.1 91 b
4 4 13.8 165 b
# Summary statistics of the numeric columns.
# NOTE(review): height has a negative minimum in this output - presumably
# invalid measurements; verify before modelling.
df.describe()
Unnamed: 0 age height
count 1260.000000 1260.000000 1260.000000
mean 629.500000 25.519444 105.180952
std 363.874979 23.836270 70.534624
min 0.000000 0.000000 -6.000000
25% 314.750000 2.900000 30.000000
50% 629.500000 17.800000 127.000000
75% 944.250000 44.600000 169.000000
max 1259.000000 80.000000 208.000000
# Dimensions of the frame as (rows, columns).
df.shape
(1260, 4)
# Column labels of the frame.
df.columns
Index(['Unnamed: 0', 'age', 'height', 'spezies'], dtype='object')
# Distinct labels in the "spezies" column.
# NOTE(review): both 'm' and 'M' occur - check whether upper- and lower-case
# codes denote the same class before using this column as a target or feature.
df.spezies.unique()
array(['b', 'g', 'm', 'M', 'w', 'F', 'K'], dtype=object)

Problem 1: Körpergröße abschätzen

# Regression target. pop() returns the "height" column and removes it from df
# in place, so df no longer contains "height" after this cell.
y = df.pop("height")
y
0        60
1        88
2         0
3        91
4       165
       ... 
1255    104
1256    208
1257    112
1258      0
1259    140
Name: height, Length: 1260, dtype: int64
# Feature frame: whatever columns are left in df. This binds X to the same
# object as df (no copy is made).
X = df
X
Unnamed: 0 age spezies
0 0 0.0 b
1 1 5.5 b
2 2 13.8 b
3 3 4.1 b
4 4 13.8 b
... ... ... ...
1255 1255 28.1 K
1256 1256 3.2 K
1257 1257 0.0 K
1258 1258 29.9 K
1259 1259 12.9 K

1260 rows × 3 columns

from sklearn.tree import DecisionTreeRegressor

# DecisionTreeRegressor needs purely numeric features, so fit on a numeric
# copy of X instead of the raw frame:
#   * "Unnamed: 0" appears to be the row counter written out with the CSV
#     (0 .. 1259 for 1260 rows); it describes the file, not a person, so it
#     is dropped.
#   * "spezies" holds string labels ('b', 'g', 'm', ...). Passing them to fit()
#     raises "ValueError: could not convert string to float: 'b'", so the
#     column is one-hot encoded.
# X itself is left untouched so other cells can still use the raw columns.
X_numeric = pd.get_dummies(
    X.drop(columns=["Unnamed: 0"]), columns=["spezies"], dtype=int
)

# Define model. Specify a number for random_state to ensure same results each run
melbourne_model = DecisionTreeRegressor(random_state=1)

# Fit model
melbourne_model.fit(X_numeric, y)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
/tmp/ipykernel_19461/839580878.py in ?()
      3 # Define model. Specify a number for random_state to ensure same results each run
      4 melbourne_model = DecisionTreeRegressor(random_state=1)
      5 
      6 # Fit model
----> 7 melbourne_model.fit(X, y)

~/miniconda3/lib/python3.12/site-packages/sklearn/base.py in ?(estimator, *args, **kwargs)
   1469                 skip_parameter_validation=(
   1470                     prefer_skip_nested_validation or global_skip_validation
   1471                 )
   1472             ):
-> 1473                 return fit_method(estimator, *args, **kwargs)

~/miniconda3/lib/python3.12/site-packages/sklearn/tree/_classes.py in ?(self, X, y, sample_weight, check_input)
   1373         self : DecisionTreeRegressor
   1374             Fitted estimator.
   1375         """
   1376 
-> 1377         super()._fit(
   1378             X,
   1379             y,
   1380             sample_weight=sample_weight,

~/miniconda3/lib/python3.12/site-packages/sklearn/tree/_classes.py in ?(self, X, y, sample_weight, check_input, missing_values_in_feature_mask)
    248             check_X_params = dict(
    249                 dtype=DTYPE, accept_sparse="csc", force_all_finite=False
    250             )
    251             check_y_params = dict(ensure_2d=False, dtype=None)
--> 252             X, y = self._validate_data(
    253                 X, y, validate_separately=(check_X_params, check_y_params)
    254             )
    255 

~/miniconda3/lib/python3.12/site-packages/sklearn/base.py in ?(self, X, y, reset, validate_separately, cast_to_ndarray, **check_params)
    641                 # :(
    642                 check_X_params, check_y_params = validate_separately
    643                 if "estimator" not in check_X_params:
    644                     check_X_params = {**default_check_params, **check_X_params}
--> 645                 X = check_array(X, input_name="X", **check_X_params)
    646                 if "estimator" not in check_y_params:
    647                     check_y_params = {**default_check_params, **check_y_params}
    648                 y = check_array(y, input_name="y", **check_y_params)

~/miniconda3/lib/python3.12/site-packages/sklearn/utils/validation.py in ?(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_writeable, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)
   1009                         )
   1010                     array = xp.astype(array, dtype, copy=False)
   1011                 else:
   1012                     array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
-> 1013             except ComplexWarning as complex_warning:
   1014                 raise ValueError(
   1015                     "Complex data not supported\n{}\n".format(array)
   1016                 ) from complex_warning

~/miniconda3/lib/python3.12/site-packages/sklearn/utils/_array_api.py in ?(array, dtype, order, copy, xp, device)
    741         # Use NumPy API to support order
    742         if copy is True:
    743             array = numpy.array(array, order=order, dtype=dtype)
    744         else:
--> 745             array = numpy.asarray(array, order=order, dtype=dtype)
    746 
    747         # At this point array is a NumPy ndarray. We convert it to an array
    748         # container that is consistent with the input's namespace.

~/miniconda3/lib/python3.12/site-packages/pandas/core/generic.py in ?(self, dtype, copy)
   2149     def __array__(
   2150         self, dtype: npt.DTypeLike | None = None, copy: bool_t | None = None
   2151     ) -> np.ndarray:
   2152         values = self._values
-> 2153         arr = np.asarray(values, dtype=dtype)
   2154         if (
   2155             astype_is_view(values.dtype, arr.dtype)
   2156             and using_copy_on_write()

ValueError: could not convert string to float: 'b'

Problem 2: Geschlecht abschätzen