s-jbusse_protokoll-demo_2025-11-03_2
import os
# Show the current working directory of the running Python process.
os.getcwd()
'/media/sf_abc123/l/LA_2025_ws/dsci-ml/md'
# Absolute, machine-specific path to the Melbourne housing snapshot CSV.
# NOTE(review): this path only exists on the author's machine; adjust it before re-running.
melbourne_file_path = '/media/sf_abc123/b/KaggleLearn/input/melbourne-housing-snapshot/melb_data.csv'
import pandas as pd
# Load the CSV into a DataFrame.
melbourne_data = pd.read_csv(melbourne_file_path)
# Preview the first two rows to check that the file was parsed as expected.
melbourne_data.head(2)
| Suburb | Address | Rooms | Type | Price | Method | SellerG | Date | Distance | Postcode | ... | Bathroom | Car | Landsize | BuildingArea | YearBuilt | CouncilArea | Lattitude | Longtitude | Regionname | Propertycount | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Abbotsford | 85 Turner St | 2 | h | 1480000.0 | S | Biggin | 3/12/2016 | 2.5 | 3067.0 | ... | 1.0 | 1.0 | 202.0 | NaN | NaN | Yarra | -37.7996 | 144.9984 | Northern Metropolitan | 4019.0 |
| 1 | Abbotsford | 25 Bloomburg St | 2 | h | 1035000.0 | S | Biggin | 4/02/2016 | 2.5 | 3067.0 | ... | 1.0 | 0.0 | 156.0 | 79.0 | 1900.0 | Yarra | -37.8079 | 144.9934 | Northern Metropolitan | 4019.0 |
2 rows × 21 columns
# Display the whole DataFrame; the rendered view is truncated to the first and last rows.
melbourne_data
| Suburb | Address | Rooms | Type | Price | Method | SellerG | Date | Distance | Postcode | ... | Bathroom | Car | Landsize | BuildingArea | YearBuilt | CouncilArea | Lattitude | Longtitude | Regionname | Propertycount | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Abbotsford | 85 Turner St | 2 | h | 1480000.0 | S | Biggin | 3/12/2016 | 2.5 | 3067.0 | ... | 1.0 | 1.0 | 202.0 | NaN | NaN | Yarra | -37.79960 | 144.99840 | Northern Metropolitan | 4019.0 |
| 1 | Abbotsford | 25 Bloomburg St | 2 | h | 1035000.0 | S | Biggin | 4/02/2016 | 2.5 | 3067.0 | ... | 1.0 | 0.0 | 156.0 | 79.0 | 1900.0 | Yarra | -37.80790 | 144.99340 | Northern Metropolitan | 4019.0 |
| 2 | Abbotsford | 5 Charles St | 3 | h | 1465000.0 | SP | Biggin | 4/03/2017 | 2.5 | 3067.0 | ... | 2.0 | 0.0 | 134.0 | 150.0 | 1900.0 | Yarra | -37.80930 | 144.99440 | Northern Metropolitan | 4019.0 |
| 3 | Abbotsford | 40 Federation La | 3 | h | 850000.0 | PI | Biggin | 4/03/2017 | 2.5 | 3067.0 | ... | 2.0 | 1.0 | 94.0 | NaN | NaN | Yarra | -37.79690 | 144.99690 | Northern Metropolitan | 4019.0 |
| 4 | Abbotsford | 55a Park St | 4 | h | 1600000.0 | VB | Nelson | 4/06/2016 | 2.5 | 3067.0 | ... | 1.0 | 2.0 | 120.0 | 142.0 | 2014.0 | Yarra | -37.80720 | 144.99410 | Northern Metropolitan | 4019.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 13575 | Wheelers Hill | 12 Strada Cr | 4 | h | 1245000.0 | S | Barry | 26/08/2017 | 16.7 | 3150.0 | ... | 2.0 | 2.0 | 652.0 | NaN | 1981.0 | NaN | -37.90562 | 145.16761 | South-Eastern Metropolitan | 7392.0 |
| 13576 | Williamstown | 77 Merrett Dr | 3 | h | 1031000.0 | SP | Williams | 26/08/2017 | 6.8 | 3016.0 | ... | 2.0 | 2.0 | 333.0 | 133.0 | 1995.0 | NaN | -37.85927 | 144.87904 | Western Metropolitan | 6380.0 |
| 13577 | Williamstown | 83 Power St | 3 | h | 1170000.0 | S | Raine | 26/08/2017 | 6.8 | 3016.0 | ... | 2.0 | 4.0 | 436.0 | NaN | 1997.0 | NaN | -37.85274 | 144.88738 | Western Metropolitan | 6380.0 |
| 13578 | Williamstown | 96 Verdon St | 4 | h | 2500000.0 | PI | Sweeney | 26/08/2017 | 6.8 | 3016.0 | ... | 1.0 | 5.0 | 866.0 | 157.0 | 1920.0 | NaN | -37.85908 | 144.89299 | Western Metropolitan | 6380.0 |
| 13579 | Yarraville | 6 Agnes St | 4 | h | 1285000.0 | SP | Village | 26/08/2017 | 6.3 | 3013.0 | ... | 1.0 | 1.0 | 362.0 | 112.0 | 1920.0 | NaN | -37.81188 | 144.88449 | Western Metropolitan | 6543.0 |
13580 rows × 21 columns
# Summary statistics for every column: include="all" adds the non-numeric columns,
# which get unique/top/freq instead of mean/std/quantiles.
melbourne_data.describe(include="all")
| Suburb | Address | Rooms | Type | Price | Method | SellerG | Date | Distance | Postcode | ... | Bathroom | Car | Landsize | BuildingArea | YearBuilt | CouncilArea | Lattitude | Longtitude | Regionname | Propertycount | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 13580 | 13580 | 13580.000000 | 13580 | 1.358000e+04 | 13580 | 13580 | 13580 | 13580.000000 | 13580.000000 | ... | 13580.000000 | 13518.000000 | 13580.000000 | 7130.000000 | 8205.000000 | 12211 | 13580.000000 | 13580.000000 | 13580 | 13580.000000 |
| unique | 314 | 13378 | NaN | 3 | NaN | 5 | 268 | 58 | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | 33 | NaN | NaN | 8 | NaN |
| top | Reservoir | 5 Margaret St | NaN | h | NaN | S | Nelson | 27/05/2017 | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | Moreland | NaN | NaN | Southern Metropolitan | NaN |
| freq | 359 | 3 | NaN | 9449 | NaN | 9022 | 1565 | 473 | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | 1163 | NaN | NaN | 4695 | NaN |
| mean | NaN | NaN | 2.937997 | NaN | 1.075684e+06 | NaN | NaN | NaN | 10.137776 | 3105.301915 | ... | 1.534242 | 1.610075 | 558.416127 | 151.967650 | 1964.684217 | NaN | -37.809203 | 144.995216 | NaN | 7454.417378 |
| std | NaN | NaN | 0.955748 | NaN | 6.393107e+05 | NaN | NaN | NaN | 5.868725 | 90.676964 | ... | 0.691712 | 0.962634 | 3990.669241 | 541.014538 | 37.273762 | NaN | 0.079260 | 0.103916 | NaN | 4378.581772 |
| min | NaN | NaN | 1.000000 | NaN | 8.500000e+04 | NaN | NaN | NaN | 0.000000 | 3000.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1196.000000 | NaN | -38.182550 | 144.431810 | NaN | 249.000000 |
| 25% | NaN | NaN | 2.000000 | NaN | 6.500000e+05 | NaN | NaN | NaN | 6.100000 | 3044.000000 | ... | 1.000000 | 1.000000 | 177.000000 | 93.000000 | 1940.000000 | NaN | -37.856822 | 144.929600 | NaN | 4380.000000 |
| 50% | NaN | NaN | 3.000000 | NaN | 9.030000e+05 | NaN | NaN | NaN | 9.200000 | 3084.000000 | ... | 1.000000 | 2.000000 | 440.000000 | 126.000000 | 1970.000000 | NaN | -37.802355 | 145.000100 | NaN | 6555.000000 |
| 75% | NaN | NaN | 3.000000 | NaN | 1.330000e+06 | NaN | NaN | NaN | 13.000000 | 3148.000000 | ... | 2.000000 | 2.000000 | 651.000000 | 174.000000 | 1999.000000 | NaN | -37.756400 | 145.058305 | NaN | 10331.000000 |
| max | NaN | NaN | 10.000000 | NaN | 9.000000e+06 | NaN | NaN | NaN | 48.100000 | 3977.000000 | ... | 8.000000 | 10.000000 | 433014.000000 | 44515.000000 | 2018.000000 | NaN | -37.408530 | 145.526350 | NaN | 21650.000000 |
11 rows × 21 columns
# Drop every row that has a missing value in any column.
# This is a drastic filter: the table shrinks from 13580 to 6196 rows.
melbourne_data = melbourne_data.dropna(axis=0)
# Numeric summary of the remaining rows (the count row confirms the new size).
melbourne_data.describe()
| Rooms | Price | Distance | Postcode | Bedroom2 | Bathroom | Car | Landsize | BuildingArea | YearBuilt | Lattitude | Longtitude | Propertycount | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 6196.000000 | 6.196000e+03 | 6196.000000 | 6196.000000 | 6196.000000 | 6196.000000 | 6196.000000 | 6196.000000 | 6196.000000 | 6196.000000 | 6196.000000 | 6196.000000 | 6196.000000 |
| mean | 2.931407 | 1.068828e+06 | 9.751097 | 3101.947708 | 2.902034 | 1.576340 | 1.573596 | 471.006940 | 141.568645 | 1964.081988 | -37.807904 | 144.990201 | 7435.489509 |
| std | 0.971079 | 6.751564e+05 | 5.612065 | 86.421604 | 0.970055 | 0.711362 | 0.929947 | 897.449881 | 90.834824 | 38.105673 | 0.075850 | 0.099165 | 4337.698917 |
| min | 1.000000 | 1.310000e+05 | 0.000000 | 3000.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 1196.000000 | -38.164920 | 144.542370 | 389.000000 |
| 25% | 2.000000 | 6.200000e+05 | 5.900000 | 3044.000000 | 2.000000 | 1.000000 | 1.000000 | 152.000000 | 91.000000 | 1940.000000 | -37.855438 | 144.926198 | 4383.750000 |
| 50% | 3.000000 | 8.800000e+05 | 9.000000 | 3081.000000 | 3.000000 | 1.000000 | 1.000000 | 373.000000 | 124.000000 | 1970.000000 | -37.802250 | 144.995800 | 6567.000000 |
| 75% | 4.000000 | 1.325000e+06 | 12.400000 | 3147.000000 | 3.000000 | 2.000000 | 2.000000 | 628.000000 | 170.000000 | 2000.000000 | -37.758200 | 145.052700 | 10175.000000 |
| max | 8.000000 | 9.000000e+06 | 47.400000 | 3977.000000 | 9.000000 | 8.000000 | 10.000000 | 37000.000000 | 3112.000000 | 2018.000000 | -37.457090 | 145.526350 | 21650.000000 |
# Prediction target: the Price column of the cleaned data.
y = melbourne_data.Price
# Check what kind of object the column selection returns.
type(y)
pandas.core.series.Series
#list(y)
# Columns used as model inputs (spelling follows the column names in the CSV).
melbourne_features = ['Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude']
# Feature matrix: only the selected columns, same rows as y.
X = melbourne_data[melbourne_features]
# Summary statistics of the selected features.
X.describe()
| Rooms | Bathroom | Landsize | Lattitude | Longtitude | |
|---|---|---|---|---|---|
| count | 6196.000000 | 6196.000000 | 6196.000000 | 6196.000000 | 6196.000000 |
| mean | 2.931407 | 1.576340 | 471.006940 | -37.807904 | 144.990201 |
| std | 0.971079 | 0.711362 | 897.449881 | 0.075850 | 0.099165 |
| min | 1.000000 | 1.000000 | 0.000000 | -38.164920 | 144.542370 |
| 25% | 2.000000 | 1.000000 | 152.000000 | -37.855438 | 144.926198 |
| 50% | 3.000000 | 1.000000 | 373.000000 | -37.802250 | 144.995800 |
| 75% | 4.000000 | 2.000000 | 628.000000 | -37.758200 | 145.052700 |
| max | 8.000000 | 8.000000 | 37000.000000 | -37.457090 | 145.526350 |
# First five feature rows; the row labels have gaps because of the rows removed by dropna.
X.head()
| Rooms | Bathroom | Landsize | Lattitude | Longtitude | |
|---|---|---|---|---|---|
| 1 | 2 | 1.0 | 156.0 | -37.8079 | 144.9934 |
| 2 | 3 | 2.0 | 134.0 | -37.8093 | 144.9944 |
| 4 | 4 | 1.0 | 120.0 | -37.8072 | 144.9941 |
| 6 | 3 | 2.0 | 245.0 | -37.8024 | 144.9993 |
| 7 | 2 | 1.0 | 256.0 | -37.8060 | 144.9954 |
from sklearn.tree import DecisionTreeRegressor
# Define the model. A fixed random_state gives the same tree on every run.
melbourne_model = DecisionTreeRegressor(random_state=1)
# Fit the model on ALL cleaned rows; no data is held back for evaluation at this point.
melbourne_model.fit(X, y)
DecisionTreeRegressor(random_state=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Parameters
| criterion | 'squared_error' | |
| splitter | 'best' | |
| max_depth | None | |
| min_samples_split | 2 | |
| min_samples_leaf | 1 | |
| min_weight_fraction_leaf | 0.0 | |
| max_features | None | |
| random_state | 1 | |
| max_leaf_nodes | None | |
| min_impurity_decrease | 0.0 | |
| ccp_alpha | 0.0 | |
| monotonic_cst | None |
# Predict the first five houses. These rows were part of the training data,
# so this only demonstrates the predict() call, not how well the model generalizes.
print("Making predictions for the following 5 houses:")
print(X.head())
print("The predictions are")
print(melbourne_model.predict(X.head()))
Making predictions for the following 5 houses:
Rooms Bathroom Landsize Lattitude Longtitude
1 2 1.0 156.0 -37.8079 144.9934
2 3 2.0 134.0 -37.8093 144.9944
4 4 1.0 120.0 -37.8072 144.9941
6 3 2.0 245.0 -37.8024 144.9993
7 2 1.0 256.0 -37.8060 144.9954
The predictions are
[1035000. 1465000. 1600000. 1876000. 1636000.]
# Predict prices for every row the model was trained on (in-sample predictions).
predicted_home_prices = melbourne_model.predict(X)
# Number of predictions; should equal the number of target values.
len(predicted_home_prices)
6196
# Number of actual prices, for comparison with the number of predictions.
len(y)
6196
# First ten predicted prices.
predicted_home_prices[0:10]
array([1035000., 1465000., 1600000., 1876000., 1636000., 1097000.,
1350000., 750000., 1310000., 1200000.])
# First ten actual prices (sliced by position; the row labels shown have gaps after dropna).
y[0:10]
1 1035000.0
2 1465000.0
4 1600000.0
6 1876000.0
7 1636000.0
9 1097000.0
11 1350000.0
12 750000.0
15 1310000.0
16 1200000.0
Name: Price, dtype: float64
# Toy example: element-wise absolute differences between two short lists,
# as a warm-up for computing the mean absolute error (MAE) by hand.
a = [1, 3, 4]
b = [1.2, 3.1, 3.9]
# zip pairs the values directly, so no index bookkeeping is needed.
differenz = [abs(a_i - b_i) for a_i, b_i in zip(a, b)]
differenz
[0.19999999999999996, 0.10000000000000009, 0.10000000000000009]
# MAE of the toy example: the mean of the absolute differences.
mae = sum(differenz) / len(differenz)
# y still carries the original row labels (1, 2, 4, ...), so turn it into a plain
# list to pair its values with the predictions strictly by position.
yy = list(y)
# Absolute error per house, pairing each prediction with its actual price.
differenz = [
    abs(predicted - actual)
    for predicted, actual in zip(predicted_home_prices, yy)
]
# In-sample MAE: the model is scored on the same rows it was fitted on,
# so this figure is optimistic and says little about unseen houses.
mae = sum(differenz) / len(differenz)
mae
np.float64(1115.74671831289)
Train/test split
from sklearn.model_selection import train_test_split
# Split features and target into a training part and a validation part.
# random_state fixes the shuffle so the same split is produced on every run;
# no split size is given, and here it yields 4647 training and 1549 validation rows.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)
# Compare the feature distributions of both parts and check their sizes.
train_X.describe(), val_X.describe(), len(train_y), len(val_y)
( Rooms Bathroom Landsize Lattitude Longtitude
count 4647.000000 4647.000000 4647.000000 4647.000000 4647.000000
mean 2.942113 1.577147 478.649451 -37.808104 144.990970
std 0.972881 0.713032 963.944034 0.076552 0.099908
min 1.000000 1.000000 0.000000 -38.164920 144.542370
25% 2.000000 1.000000 158.000000 -37.855890 144.926290
50% 3.000000 1.000000 375.000000 -37.801300 144.996100
75% 4.000000 2.000000 629.000000 -37.758000 145.053850
max 8.000000 8.000000 37000.000000 -37.483810 145.526350,
Rooms Bathroom Landsize Lattitude Longtitude
count 1549.000000 1549.000000 1549.000000 1549.000000 1549.000000
mean 2.899290 1.573919 448.079406 -37.807306 144.987893
std 0.965252 0.706552 658.604496 0.073725 0.096896
min 1.000000 1.000000 0.000000 -38.161460 144.551060
25% 2.000000 1.000000 141.000000 -37.854000 144.925600
50% 3.000000 1.000000 367.000000 -37.804730 144.995300
75% 3.000000 2.000000 625.000000 -37.758700 145.050400
max 6.000000 5.000000 15100.000000 -37.457090 145.359510,
4647,
1549)
# Fresh tree that is trained on the training part only.
# NOTE(review): unlike melbourne_model, no random_state is set here, so the fitted tree
# (and the validation MAE computed from it) may differ between runs — consider fixing it.
mein_modell = DecisionTreeRegressor()
mein_modell.fit(train_X, train_y)
DecisionTreeRegressor()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Parameters
| criterion | 'squared_error' | |
| splitter | 'best' | |
| max_depth | None | |
| min_samples_split | 2 | |
| min_samples_leaf | 1 | |
| min_weight_fraction_leaf | 0.0 | |
| max_features | None | |
| random_state | None | |
| max_leaf_nodes | None | |
| min_impurity_decrease | 0.0 | |
| ccp_alpha | 0.0 | |
| monotonic_cst | None |
# Predict prices for the validation rows, which the model did not see during fitting.
predict_y = mein_modell.predict(val_X)
# Both lengths must match so predictions and actual prices can be paired one-to-one.
len(predict_y), len(val_y)
(1549, 1549)
# Plain lists make the pairing purely positional, independent of any row labels
# that val_y may carry over from the original DataFrame.
predict_yy = list(predict_y)
val_yy = list(val_y)
# Absolute error per validation house, pairing each prediction with its actual price.
differenz = [
    abs(predicted - actual)
    for predicted, actual in zip(predict_yy, val_yy)
]
# Out-of-sample MAE: measured on rows held back from training, so it reflects
# how the model performs on unseen houses (compare with the in-sample MAE).
mae = sum(differenz) / len(differenz)
mae
np.float64(271351.9610501399)