s-jbusse_protokoll-demo_2025-11-03_2

Contents

s-jbusse_protokoll-demo_2025-11-03_2#

import os
os.getcwd()
'/media/sf_abc123/l/LA_2025_ws/dsci-ml/md'
# Absolute, machine-specific location of the Melbourne housing snapshot CSV;
# adjust this path when running on another machine.
melbourne_file_path = '/media/sf_abc123/b/KaggleLearn/input/melbourne-housing-snapshot/melb_data.csv'
import pandas as pd
# Load the whole CSV into a DataFrame.
melbourne_data = pd.read_csv(melbourne_file_path) 
# Show the first two rows as a quick sanity check of the columns.
melbourne_data.head(2)
Suburb Address Rooms Type Price Method SellerG Date Distance Postcode ... Bathroom Car Landsize BuildingArea YearBuilt CouncilArea Lattitude Longtitude Regionname Propertycount
0 Abbotsford 85 Turner St 2 h 1480000.0 S Biggin 3/12/2016 2.5 3067.0 ... 1.0 1.0 202.0 NaN NaN Yarra -37.7996 144.9984 Northern Metropolitan 4019.0
1 Abbotsford 25 Bloomburg St 2 h 1035000.0 S Biggin 4/02/2016 2.5 3067.0 ... 1.0 0.0 156.0 79.0 1900.0 Yarra -37.8079 144.9934 Northern Metropolitan 4019.0

2 rows × 21 columns

melbourne_data
Suburb Address Rooms Type Price Method SellerG Date Distance Postcode ... Bathroom Car Landsize BuildingArea YearBuilt CouncilArea Lattitude Longtitude Regionname Propertycount
0 Abbotsford 85 Turner St 2 h 1480000.0 S Biggin 3/12/2016 2.5 3067.0 ... 1.0 1.0 202.0 NaN NaN Yarra -37.79960 144.99840 Northern Metropolitan 4019.0
1 Abbotsford 25 Bloomburg St 2 h 1035000.0 S Biggin 4/02/2016 2.5 3067.0 ... 1.0 0.0 156.0 79.0 1900.0 Yarra -37.80790 144.99340 Northern Metropolitan 4019.0
2 Abbotsford 5 Charles St 3 h 1465000.0 SP Biggin 4/03/2017 2.5 3067.0 ... 2.0 0.0 134.0 150.0 1900.0 Yarra -37.80930 144.99440 Northern Metropolitan 4019.0
3 Abbotsford 40 Federation La 3 h 850000.0 PI Biggin 4/03/2017 2.5 3067.0 ... 2.0 1.0 94.0 NaN NaN Yarra -37.79690 144.99690 Northern Metropolitan 4019.0
4 Abbotsford 55a Park St 4 h 1600000.0 VB Nelson 4/06/2016 2.5 3067.0 ... 1.0 2.0 120.0 142.0 2014.0 Yarra -37.80720 144.99410 Northern Metropolitan 4019.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
13575 Wheelers Hill 12 Strada Cr 4 h 1245000.0 S Barry 26/08/2017 16.7 3150.0 ... 2.0 2.0 652.0 NaN 1981.0 NaN -37.90562 145.16761 South-Eastern Metropolitan 7392.0
13576 Williamstown 77 Merrett Dr 3 h 1031000.0 SP Williams 26/08/2017 6.8 3016.0 ... 2.0 2.0 333.0 133.0 1995.0 NaN -37.85927 144.87904 Western Metropolitan 6380.0
13577 Williamstown 83 Power St 3 h 1170000.0 S Raine 26/08/2017 6.8 3016.0 ... 2.0 4.0 436.0 NaN 1997.0 NaN -37.85274 144.88738 Western Metropolitan 6380.0
13578 Williamstown 96 Verdon St 4 h 2500000.0 PI Sweeney 26/08/2017 6.8 3016.0 ... 1.0 5.0 866.0 157.0 1920.0 NaN -37.85908 144.89299 Western Metropolitan 6380.0
13579 Yarraville 6 Agnes St 4 h 1285000.0 SP Village 26/08/2017 6.3 3013.0 ... 1.0 1.0 362.0 112.0 1920.0 NaN -37.81188 144.88449 Western Metropolitan 6543.0

13580 rows × 21 columns

melbourne_data.describe(include="all")
Suburb Address Rooms Type Price Method SellerG Date Distance Postcode ... Bathroom Car Landsize BuildingArea YearBuilt CouncilArea Lattitude Longtitude Regionname Propertycount
count 13580 13580 13580.000000 13580 1.358000e+04 13580 13580 13580 13580.000000 13580.000000 ... 13580.000000 13518.000000 13580.000000 7130.000000 8205.000000 12211 13580.000000 13580.000000 13580 13580.000000
unique 314 13378 NaN 3 NaN 5 268 58 NaN NaN ... NaN NaN NaN NaN NaN 33 NaN NaN 8 NaN
top Reservoir 5 Margaret St NaN h NaN S Nelson 27/05/2017 NaN NaN ... NaN NaN NaN NaN NaN Moreland NaN NaN Southern Metropolitan NaN
freq 359 3 NaN 9449 NaN 9022 1565 473 NaN NaN ... NaN NaN NaN NaN NaN 1163 NaN NaN 4695 NaN
mean NaN NaN 2.937997 NaN 1.075684e+06 NaN NaN NaN 10.137776 3105.301915 ... 1.534242 1.610075 558.416127 151.967650 1964.684217 NaN -37.809203 144.995216 NaN 7454.417378
std NaN NaN 0.955748 NaN 6.393107e+05 NaN NaN NaN 5.868725 90.676964 ... 0.691712 0.962634 3990.669241 541.014538 37.273762 NaN 0.079260 0.103916 NaN 4378.581772
min NaN NaN 1.000000 NaN 8.500000e+04 NaN NaN NaN 0.000000 3000.000000 ... 0.000000 0.000000 0.000000 0.000000 1196.000000 NaN -38.182550 144.431810 NaN 249.000000
25% NaN NaN 2.000000 NaN 6.500000e+05 NaN NaN NaN 6.100000 3044.000000 ... 1.000000 1.000000 177.000000 93.000000 1940.000000 NaN -37.856822 144.929600 NaN 4380.000000
50% NaN NaN 3.000000 NaN 9.030000e+05 NaN NaN NaN 9.200000 3084.000000 ... 1.000000 2.000000 440.000000 126.000000 1970.000000 NaN -37.802355 145.000100 NaN 6555.000000
75% NaN NaN 3.000000 NaN 1.330000e+06 NaN NaN NaN 13.000000 3148.000000 ... 2.000000 2.000000 651.000000 174.000000 1999.000000 NaN -37.756400 145.058305 NaN 10331.000000
max NaN NaN 10.000000 NaN 9.000000e+06 NaN NaN NaN 48.100000 3977.000000 ... 8.000000 10.000000 433014.000000 44515.000000 2018.000000 NaN -37.408530 145.526350 NaN 21650.000000

11 rows × 21 columns

# Drop every row that has a missing value in any column. The outputs in this
# document show the effect: 13580 rows before, 6196 rows in the summary below.
# NOTE(review): per describe(include="all") the gaps sit in columns such as
# Car, BuildingArea, YearBuilt and CouncilArea; if only a few columns are
# needed, dropna(subset=[...]) would keep more rows - confirm the intent.
melbourne_data = melbourne_data.dropna(axis=0)
# Summary statistics of the remaining rows.
melbourne_data.describe()
Rooms Price Distance Postcode Bedroom2 Bathroom Car Landsize BuildingArea YearBuilt Lattitude Longtitude Propertycount
count 6196.000000 6.196000e+03 6196.000000 6196.000000 6196.000000 6196.000000 6196.000000 6196.000000 6196.000000 6196.000000 6196.000000 6196.000000 6196.000000
mean 2.931407 1.068828e+06 9.751097 3101.947708 2.902034 1.576340 1.573596 471.006940 141.568645 1964.081988 -37.807904 144.990201 7435.489509
std 0.971079 6.751564e+05 5.612065 86.421604 0.970055 0.711362 0.929947 897.449881 90.834824 38.105673 0.075850 0.099165 4337.698917
min 1.000000 1.310000e+05 0.000000 3000.000000 0.000000 1.000000 0.000000 0.000000 0.000000 1196.000000 -38.164920 144.542370 389.000000
25% 2.000000 6.200000e+05 5.900000 3044.000000 2.000000 1.000000 1.000000 152.000000 91.000000 1940.000000 -37.855438 144.926198 4383.750000
50% 3.000000 8.800000e+05 9.000000 3081.000000 3.000000 1.000000 1.000000 373.000000 124.000000 1970.000000 -37.802250 144.995800 6567.000000
75% 4.000000 1.325000e+06 12.400000 3147.000000 3.000000 2.000000 2.000000 628.000000 170.000000 2000.000000 -37.758200 145.052700 10175.000000
max 8.000000 9.000000e+06 47.400000 3977.000000 9.000000 8.000000 10.000000 37000.000000 3112.000000 2018.000000 -37.457090 145.526350 21650.000000
y = melbourne_data.Price
type(y)
pandas.core.series.Series
# Columns used as model inputs. 'Lattitude' and 'Longtitude' are the dataset's
# own (misspelled) column names, so they must not be "corrected" here.
melbourne_features = ['Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude']
# Feature matrix: the selected columns of the cleaned DataFrame.
X = melbourne_data[melbourne_features]
# Summary statistics of the selected features.
X.describe()
Rooms Bathroom Landsize Lattitude Longtitude
count 6196.000000 6196.000000 6196.000000 6196.000000 6196.000000
mean 2.931407 1.576340 471.006940 -37.807904 144.990201
std 0.971079 0.711362 897.449881 0.075850 0.099165
min 1.000000 1.000000 0.000000 -38.164920 144.542370
25% 2.000000 1.000000 152.000000 -37.855438 144.926198
50% 3.000000 1.000000 373.000000 -37.802250 144.995800
75% 4.000000 2.000000 628.000000 -37.758200 145.052700
max 8.000000 8.000000 37000.000000 -37.457090 145.526350
X.head()
Rooms Bathroom Landsize Lattitude Longtitude
1 2 1.0 156.0 -37.8079 144.9934
2 3 2.0 134.0 -37.8093 144.9944
4 4 1.0 120.0 -37.8072 144.9941
6 3 2.0 245.0 -37.8024 144.9993
7 2 1.0 256.0 -37.8060 144.9954
from sklearn.tree import DecisionTreeRegressor

# Define model. Specify a number for random_state to ensure same results each run
melbourne_model = DecisionTreeRegressor(random_state=1)

# Fit model on the complete feature matrix X and target y; no rows are held
# back at this point.
melbourne_model.fit(X, y)
DecisionTreeRegressor(random_state=1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# Show the first five feature rows together with the model's predictions
# for exactly those rows.
first_houses = X.head()
print("Making predictions for the following 5 houses:")
print(first_houses)
print("The predictions are")
print(melbourne_model.predict(first_houses))
Making predictions for the following 5 houses:
   Rooms  Bathroom  Landsize  Lattitude  Longtitude
1      2       1.0     156.0   -37.8079    144.9934
2      3       2.0     134.0   -37.8093    144.9944
4      4       1.0     120.0   -37.8072    144.9941
6      3       2.0     245.0   -37.8024    144.9993
7      2       1.0     256.0   -37.8060    144.9954
The predictions are
[1035000. 1465000. 1600000. 1876000. 1636000.]
predicted_home_prices = melbourne_model.predict(X)
len(predicted_home_prices)
6196
len(y)
6196
predicted_home_prices[0:10]
array([1035000., 1465000., 1600000., 1876000., 1636000., 1097000.,
       1350000.,  750000., 1310000., 1200000.])
y[0:10]
1     1035000.0
2     1465000.0
4     1600000.0
6     1876000.0
7     1636000.0
9     1097000.0
11    1350000.0
12     750000.0
15    1310000.0
16    1200000.0
Name: Price, dtype: float64
# Toy example: element-wise absolute difference of two equally long lists.
a = [ 1,   3,   4]
b = [ 1.2, 3.1, 3.9 ]

# zip pairs the values positionally, so no manual index bookkeeping is needed.
differenz = [abs(left - right) for left, right in zip(a, b)]
differenz
[0.19999999999999996, 0.10000000000000009, 0.10000000000000009]
# Mean absolute error of the toy example: average of the absolute differences.
mae = sum(differenz) / len(differenz)

# Same computation for the fitted model. The predictions were made on the
# rows the model was fitted on, so this is an in-sample error.
# list(y) turns the Series into a plain list so that values are matched by
# position; the Series index has gaps after dropna (labels 1, 2, 4, 6, ...).
yy = list(y)
differenz = [abs(predicted - actual) for predicted, actual in zip(predicted_home_prices, yy)]
mae = sum(differenz) / len(differenz)
mae
np.float64(1115.74671831289)

train test split#

from sklearn.model_selection import train_test_split
# Split features and target into a training part and a validation part.
# random_state is fixed so the split is the same on every run.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)
# Compare summary statistics and sizes of both parts; the output below shows
# 4647 training rows and 1549 validation rows.
train_X.describe(), val_X.describe(), len(train_y), len(val_y)
(             Rooms     Bathroom      Landsize    Lattitude   Longtitude
 count  4647.000000  4647.000000   4647.000000  4647.000000  4647.000000
 mean      2.942113     1.577147    478.649451   -37.808104   144.990970
 std       0.972881     0.713032    963.944034     0.076552     0.099908
 min       1.000000     1.000000      0.000000   -38.164920   144.542370
 25%       2.000000     1.000000    158.000000   -37.855890   144.926290
 50%       3.000000     1.000000    375.000000   -37.801300   144.996100
 75%       4.000000     2.000000    629.000000   -37.758000   145.053850
 max       8.000000     8.000000  37000.000000   -37.483810   145.526350,
              Rooms     Bathroom      Landsize    Lattitude   Longtitude
 count  1549.000000  1549.000000   1549.000000  1549.000000  1549.000000
 mean      2.899290     1.573919    448.079406   -37.807306   144.987893
 std       0.965252     0.706552    658.604496     0.073725     0.096896
 min       1.000000     1.000000      0.000000   -38.161460   144.551060
 25%       2.000000     1.000000    141.000000   -37.854000   144.925600
 50%       3.000000     1.000000    367.000000   -37.804730   144.995300
 75%       3.000000     2.000000    625.000000   -37.758700   145.050400
 max       6.000000     5.000000  15100.000000   -37.457090   145.359510,
 4647,
 1549)
# Seed the tree so the validation result is reproducible from run to run,
# consistent with the seeded model defined earlier in this document.
mein_modell = DecisionTreeRegressor(random_state=1)
# Fit on the training part only; the validation part stays unseen.
mein_modell.fit(train_X, train_y)
DecisionTreeRegressor()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
predict_y = mein_modell.predict(val_X)
len(predict_y), len(val_y)
(1549, 1549)
# Mean absolute error on the validation part.
# Both sides are converted to plain lists so values are paired by position
# rather than by the Series' index labels.
predict_yy = list(predict_y)
val_yy = list(val_y)

differenz = [abs(predicted - actual) for predicted, actual in zip(predict_yy, val_yy)]
mae = sum(differenz) / len(differenz)
mae
np.float64(271351.9610501399)