House-Sales
Sat 17 May 2025
# !pip install kaggle
# https://medium.com/@pumaline/easy-analysis-of-your-data-and-ml-model-using-evidently-ai-830ef0c1c4fd
# https://xgboost.readthedocs.io/en/latest/python/python_api.html
import pandas as pd
df = pd.read_csv('~/datasets/kaggle/kc_house_data.csv')
df.head()
|  | id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | ... | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | long | sqft_living15 | sqft_lot15 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7129300520 | 20141013T000000 | 221900.0 | 3 | 1.00 | 1180 | 5650 | 1.0 | 0 | 0 | ... | 7 | 1180 | 0 | 1955 | 0 | 98178 | 47.5112 | -122.257 | 1340 | 5650 |
| 1 | 6414100192 | 20141209T000000 | 538000.0 | 3 | 2.25 | 2570 | 7242 | 2.0 | 0 | 0 | ... | 7 | 2170 | 400 | 1951 | 1991 | 98125 | 47.7210 | -122.319 | 1690 | 7639 |
| 2 | 5631500400 | 20150225T000000 | 180000.0 | 2 | 1.00 | 770 | 10000 | 1.0 | 0 | 0 | ... | 6 | 770 | 0 | 1933 | 0 | 98028 | 47.7379 | -122.233 | 2720 | 8062 |
| 3 | 2487200875 | 20141209T000000 | 604000.0 | 4 | 3.00 | 1960 | 5000 | 1.0 | 0 | 0 | ... | 7 | 1050 | 910 | 1965 | 0 | 98136 | 47.5208 | -122.393 | 1360 | 5000 |
| 4 | 1954400510 | 20150218T000000 | 510000.0 | 3 | 2.00 | 1680 | 8080 | 1.0 | 0 | 0 | ... | 8 | 1680 | 0 | 1987 | 0 | 98074 | 47.6168 | -122.045 | 1800 | 7503 |
5 rows × 21 columns
df[['grade','view','waterfront']] = df[['grade', 'view', 'waterfront']].astype('object')
df.head()
|  | id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | ... | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | long | sqft_living15 | sqft_lot15 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7129300520 | 20141013T000000 | 221900.0 | 3 | 1.00 | 1180 | 5650 | 1.0 | 0 | 0 | ... | 7 | 1180 | 0 | 1955 | 0 | 98178 | 47.5112 | -122.257 | 1340 | 5650 |
| 1 | 6414100192 | 20141209T000000 | 538000.0 | 3 | 2.25 | 2570 | 7242 | 2.0 | 0 | 0 | ... | 7 | 2170 | 400 | 1951 | 1991 | 98125 | 47.7210 | -122.319 | 1690 | 7639 |
| 2 | 5631500400 | 20150225T000000 | 180000.0 | 2 | 1.00 | 770 | 10000 | 1.0 | 0 | 0 | ... | 6 | 770 | 0 | 1933 | 0 | 98028 | 47.7379 | -122.233 | 2720 | 8062 |
| 3 | 2487200875 | 20141209T000000 | 604000.0 | 4 | 3.00 | 1960 | 5000 | 1.0 | 0 | 0 | ... | 7 | 1050 | 910 | 1965 | 0 | 98136 | 47.5208 | -122.393 | 1360 | 5000 |
| 4 | 1954400510 | 20150218T000000 | 510000.0 | 3 | 2.00 | 1680 | 8080 | 1.0 | 0 | 0 | ... | 8 | 1680 | 0 | 1987 | 0 | 98074 | 47.6168 | -122.045 | 1800 | 7503 |
5 rows × 21 columns
features = ['sqft_living','grade', 'sqft_above', 'sqft_living15', 'bathrooms','view','sqft_basement','lat','long','waterfront', 'yr_built','bedrooms']
ref_data = df[:15000]
prod_data = df[15000:]
from sklearn import model_selection
# Delete entry with 33 bedrooms
df = df[df["bedrooms"] != 33]
# Create training and validation set
X_train, X_val, y_train, y_val = model_selection.train_test_split(ref_data[features], ref_data['price'],
test_size=0.2, shuffle=True, random_state=42)
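Note that `ref_data` and `prod_data` were sliced from `df` before this filter, so the 33-bedroom outlier is only removed from `df` itself, not from the reference and production frames used below. A minimal sketch of an ordering that would exclude the outlier everywhere, and that takes explicit copies to avoid chained-assignment warnings later:

```python
# Sketch: drop the outlier first, then slice reference and production
# sets as independent copies of the filtered frame.
df = df[df["bedrooms"] != 33]

ref_data = df[:15000].copy()    # reference window used for training
prod_data = df[15000:].copy()   # "production" window used for drift checks
```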
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# one-hot encode categorical variables
categorical = ['grade', 'view', 'waterfront']
ohe = OneHotEncoder(handle_unknown = 'ignore')
ohe = ohe.fit(X_train[categorical])
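The fitted encoder determines how many one-hot columns the model will see. An optional sanity check of the learned categories and the resulting column count (using `get_feature_names_out`, available in recent scikit-learn versions) should line up with the 27 features reported by `preprocessing` below (one-hot columns plus 9 numerical columns):

```python
# Optional: inspect the categories learned from X_train and count the
# one-hot columns they produce.
for col, cats in zip(categorical, ohe.categories_):
    print(col, list(cats))

print(len(ohe.get_feature_names_out(categorical)), 'one-hot columns')
```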
def preprocessing(X, y, ohe):
    # Convert grade, view, waterfront to type object
    X[['grade','view','waterfront']] = X[['grade','view','waterfront']].astype('object')
    # log transform the target variable
    y = np.log1p(y)
    # define categorical and numerical variables
    categorical = ['grade', 'view', 'waterfront']
    numerical = ['sqft_living', 'sqft_above', 'sqft_living15',
                 'bathrooms', 'sqft_basement', 'lat', 'long',
                 'yr_built', 'bedrooms']
    # one-hot encode categorical variables
    X_cat = ohe.transform(X[categorical]).toarray()
    # define numerical columns
    X_num = np.array(X[numerical])
    # concatenate numerical and categorical variables
    X = np.concatenate([X_cat, X_num], axis=1)
    print('Shape after one-hot encoding')
    print(f'X shape: {X.shape}')
    return X, y
import numpy as np
X_train, y_train = preprocessing(X_train, y_train, ohe)
X_val, y_val = preprocessing(X_val, y_val, ohe)
X_prod, y_prod = preprocessing(prod_data[features], prod_data['price'], ohe)
Shape after one-hot encoding
X shape: (12000, 27)
Shape after one-hot encoding
X shape: (3000, 27)
Shape after one-hot encoding
X shape: (6613, 27)
/tmp/ipykernel_993006/2150309139.py:4: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X[['grade','view','waterfront']] = X[['grade','view','waterfront']].astype('object')
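The SettingWithCopyWarning comes from mutating a slice of the original DataFrame inside `preprocessing`. One way to silence it, sketched below, is to work on an explicit copy at the top of the function (and, likewise, to create `ref_data`/`prod_data` with `.copy()`); the computed results are unchanged.

```python
# Sketch: take an explicit copy so the astype() assignment never touches
# a view of the original DataFrame.
def preprocessing(X, y, ohe):
    X = X.copy()
    X[['grade','view','waterfront']] = X[['grade','view','waterfront']].astype('object')
    ...  # rest of the function unchanged
```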
# !pip install xgboost
import xgboost as xgb
from sklearn.metrics import mean_squared_error
# Initialize XGB with objective function
parameters = {"objective": 'reg:squarederror',
"n_estimators": 100,
"verbosity": 0}
model = xgb.XGBRegressor(**parameters)
model.fit(X_train, y_train)
# generate predictions
y_pred_train = model.predict(X_train).reshape(-1,1)
y_pred = model.predict(X_val).reshape(-1,1)
# calculate errors
rmse_train = mean_squared_error(y_pred_train, y_train, squared=False)
rmse_val = mean_squared_error(y_pred, y_val, squared=False)
print(f"rmse training: {rmse_train:.3f}\t rmse validation: {rmse_val:.3f}")
rmse training: 0.114 rmse validation: 0.185
/home/rajaraman/miniconda3/envs/ml3105/lib/python3.10/site-packages/sklearn/metrics/_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.
warnings.warn(
/home/rajaraman/miniconda3/envs/ml3105/lib/python3.10/site-packages/sklearn/metrics/_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.
warnings.warn(
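The FutureWarning is raised because `squared=False` is deprecated as of scikit-learn 1.4. The same value is available directly, without the warning, via `root_mean_squared_error`:

```python
# Equivalent, warning-free RMSE with scikit-learn >= 1.4
from sklearn.metrics import root_mean_squared_error

rmse_train = root_mean_squared_error(y_train, y_pred_train)
rmse_val = root_mean_squared_error(y_val, y_pred)
print(f"rmse training: {rmse_train:.3f}\t rmse validation: {rmse_val:.3f}")
```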
X_train_full, y_train_full = preprocessing(ref_data[features], ref_data['price'], ohe)
ref_data['prediction'] = model.predict(X_train_full)
prod_data['prediction'] = model.predict(X_prod)
ref_data['price_log'] = np.log1p(ref_data['price'])
prod_data['price_log'] = np.log1p(prod_data['price'])
Shape after one-hot encoding
X shape: (15000, 27)
/tmp/ipykernel_993006/2150309139.py:4: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X[['grade','view','waterfront']] = X[['grade','view','waterfront']].astype('object')
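Both `prediction` and `price_log` are on the log scale, so they can be compared directly in a regression-performance report. A sketch of the column mapping one would hand to Evidently once the import issue shown below is resolved (field names assume the `ColumnMapping` class imported further down; the feature lists mirror those in `preprocessing`):

```python
# Sketch: tell Evidently which columns hold the (log-scale) target,
# the model prediction, and the raw feature columns.
from evidently.pipeline.column_mapping import ColumnMapping

column_mapping = ColumnMapping()
column_mapping.target = 'price_log'
column_mapping.prediction = 'prediction'
column_mapping.numerical_features = ['sqft_living', 'sqft_above', 'sqft_living15',
                                     'bathrooms', 'sqft_basement', 'lat', 'long',
                                     'yr_built', 'bedrooms']
column_mapping.categorical_features = ['grade', 'view', 'waterfront']
```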
# !pip install evidently==0.1.50.dev0
import evidently
evidently.__version__
'0.1.50.dev0'
# !conda uninstall -y numpy
# !pip install numpy==1.26.3 --force
import numpy
numpy.__version__
'1.26.3'
from evidently.dashboard import Dashboard
from evidently.pipeline.column_mapping import ColumnMapping
# packages for interactive dashboards
from evidently.dashboard.tabs import (
DataDriftTab,
DataQualityTab,
NumTargetDriftTab,
RegressionPerformanceTab
)
# packages for json-profiles
from evidently.model_profile import Profile
from evidently.model_profile.sections import (
DataDriftProfileSection,
DataQualityProfileSection,
NumTargetDriftProfileSection,
RegressionPerformanceProfileSection
)
/home/rajaraman/miniconda3/envs/ml3105/lib/python3.10/site-packages/evidently/utils/numpy_encoder.py:14: FutureWarning: In the future `np.bool` will be defined as the corresponding NumPy scalar.
((np.bool, np.bool_), bool),
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
Cell In[23], line 1
----> 1 from evidently.dashboard import Dashboard
2 from evidently.pipeline.column_mapping import ColumnMapping
3 # packages for interactive dashboards
File ~/miniconda3/envs/ml3105/lib/python3.10/site-packages/evidently/dashboard/__init__.py:4
1 #!/usr/bin/env python
2 # coding: utf-8
----> 4 from .dashboard import Dashboard
File ~/miniconda3/envs/ml3105/lib/python3.10/site-packages/evidently/dashboard/dashboard.py:19
17 from evidently.pipeline.column_mapping import ColumnMapping
18 from evidently.dashboard.tabs.base_tab import Tab
---> 19 from evidently.utils import NumpyEncoder
22 @dataclasses.dataclass()
23 class TemplateParams:
24 dashboard_id: str
File ~/miniconda3/envs/ml3105/lib/python3.10/site-packages/evidently/utils/__init__.py:1
----> 1 from .numpy_encoder import NumpyEncoder
File ~/miniconda3/envs/ml3105/lib/python3.10/site-packages/evidently/utils/numpy_encoder.py:14
3 import numpy as np
4 import pandas as pd
7 _TYPES_MAPPING = (
8 (
9 (np.int_, np.intc, np.intp, np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint16, np.uint32, np.uint64),
10 int,
11 ),
12 ((np.float_, np.float16, np.float32, np.float64), float),
13 ((np.ndarray,), lambda obj: obj.tolist()),
---> 14 ((np.bool, np.bool_), bool),
15 ((pd.Timestamp, pd.Timedelta), str),
16 ((np.void, type(pd.NaT)), lambda obj: None),
17 )
20 class NumpyEncoder(json.JSONEncoder):
21 """Numpy and Pandas data types to JSON types encoder"""
File ~/miniconda3/envs/ml3105/lib/python3.10/site-packages/numpy/__init__.py:324, in __getattr__(attr)
319 warnings.warn(
320 f"In the future `np.{attr}` will be defined as the "
321 "corresponding NumPy scalar.", FutureWarning, stacklevel=2)
323 if attr in __former_attrs__:
--> 324 raise AttributeError(__former_attrs__[attr])
326 if attr == 'testing':
327 import numpy.testing as testing
AttributeError: module 'numpy' has no attribute 'bool'.
`np.bool` was a deprecated alias for the builtin `bool`. To avoid this error in existing code, use `bool` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.bool_` here.
The aliases was originally deprecated in NumPy 1.20; for more details and guidance see the original release note at:
https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
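The import fails because `np.bool` was removed in NumPy 1.24, while evidently 0.1.50.dev0 still references it. Two possible ways out, both sketches rather than tested fixes: pin NumPy to a pre-1.24 release so the old `Dashboard` API keeps working, or move to a newer evidently release that ships the `Report` API instead.

```python
# Option 1 (assumption: the old Dashboard/Profile API is required):
# pin NumPy below 1.24 so the np.bool alias still exists, e.g.
#   pip install "numpy<1.24" "evidently==0.1.50.dev0"

# Option 2 (assumption: a recent evidently, e.g. 0.4.x, is acceptable):
# Dashboard tabs and Profile sections are replaced by Report presets.
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset, RegressionPreset

report = Report(metrics=[DataDriftPreset(), RegressionPreset()])
report.run(reference_data=ref_data, current_data=prod_data,
           column_mapping=column_mapping)   # column_mapping from the sketch above
report.save_html('house_sales_report.html')
```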
Category: evidentlyai