ML Notes - evidentlyai

House-Sales

Fri 14 November 2025

# !pip install kaggle
# https://medium.com/@pumaline/easy-analysis-of-your-data-and-ml-model-using-evidently-ai-830ef0c1c4fd
# https://xgboost.readthedocs.io/en/latest/python/python_api.html

import pandas as pd

df = pd.read_csv('~/datasets/kaggle/kc_house_data.csv')

df.head()

	id	date	price	bedrooms	bathrooms	sqft_living	sqft_lot	floors	...	grade	sqft_above	sqft_basement	yr_built	yr_renovated	zipcode	lat	long	sqft_living15	sqft_lot15
0	7129300520	20141013T000000	221900.0	3	1.00	1180	5650	1.0	...	7	1180	0	1955	0	98178	47.5112	-122.257	1340	5650
1	6414100192	20141209T000000	538000.0	3	2.25	2570	7242	2.0	...	7	2170	400	1951	1991	98125	47.7210	-122.319	1690	7639
2	5631500400	20150225T000000	180000.0	2	1.00	770	10000	1.0	...	6	770	0	1933	0	98028	47.7379	-122.233	2720	8062
3	2487200875	20141209T000000	604000.0	4	3.00	1960	5000	1.0	...	7	1050	910	1965	0	98136	47.5208	-122.393	1360	5000
4	1954400510	20150218T000000	510000.0	3	2.00	1680	8080	1.0	...	8	1680	0	1987	0	98074	47.6168	-122.045	1800	7503

5 rows × 21 columns

df[['grade','view','waterfront']] = df[['grade', 'view', 'waterfront']].astype('object')
df.head()

	id	date	price	bedrooms	bathrooms	sqft_living	sqft_lot	floors	...	grade	sqft_above	sqft_basement	yr_built	yr_renovated	zipcode	lat	long	sqft_living15	sqft_lot15
0	7129300520	20141013T000000	221900.0	3	1.00	1180	5650	1.0	...	7	1180	0	1955	0	98178	47.5112	-122.257	1340	5650
1	6414100192	20141209T000000	538000.0	3	2.25	2570	7242	2.0	...	7	2170	400	1951	1991	98125	47.7210	-122.319	1690	7639
2	5631500400	20150225T000000	180000.0	2	1.00	770	10000	1.0	...	6	770	0	1933	0	98028	47.7379	-122.233	2720	8062
3	2487200875	20141209T000000	604000.0	4	3.00	1960	5000	1.0	...	7	1050	910	1965	0	98136	47.5208	-122.393	1360	5000
4	1954400510	20150218T000000	510000.0	3	2.00	1680	8080	1.0	...	8	1680	0	1987	0	98074	47.6168	-122.045	1800	7503

5 rows × 21 columns

features = ['sqft_living','grade', 'sqft_above', 'sqft_living15', 'bathrooms','view','sqft_basement','lat','long','waterfront', 'yr_built','bedrooms']

ref_data = df[:15000]
prod_data = df[15000:]

from sklearn import model_selection

# Delete entry with 33 bedrooms
df = df[df["bedrooms"] != 33]

# Create training and validation set
X_train, X_val, y_train, y_val = model_selection.train_test_split(ref_data[features], ref_data['price'],
                 test_size=0.2, shuffle=True, random_state=42)

from sklearn.preprocessing import StandardScaler, OneHotEncoder

# one-hot encode categorical variables
categorical = ['grade', 'view', 'waterfront']
ohe = OneHotEncoder(handle_unknown = 'ignore')
ohe = ohe.fit(X_train[categorical])

def preprocessing(X, y, ohe):

    # Convert grade, view, waterfront to type object
    X[['grade','view','waterfront']] = X[['grade','view','waterfront']].astype('object')

    # log transform the target varibale 
    y = np.log1p(y)

    # define categorical and numerical varibales 
    categorical = ['grade', 'view', 'waterfront']
    numerical = ['sqft_living', 'sqft_above', 'sqft_living15',
           'bathrooms','sqft_basement','lat','long',
           'yr_built','bedrooms']

    # one-hot encode categorical variables
    X_cat = ohe.transform(X[categorical]).toarray()

    # define numerical columns 
    X_num = np.array(X[numerical])

    # concatenate numerical and categorical variables
    X = np.concatenate([X_cat, X_num], axis=1)

    print('Shape after one-hot encoding')
    print(f'X shape: {X.shape}')

    return X, y

import numpy as np

X_train, y_train = preprocessing(X_train, y_train, ohe)
X_val, y_val = preprocessing(X_val, y_val, ohe)
X_prod, y_prod = preprocessing(prod_data[features],  prod_data['price'], ohe)

Shape after one-hot encoding
X shape: (12000, 27)
Shape after one-hot encoding
X shape: (3000, 27)
Shape after one-hot encoding
X shape: (6613, 27)


/tmp/ipykernel_993006/2150309139.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[['grade','view','waterfront']] = X[['grade','view','waterfront']].astype('object')

# !pip install xgboost

import xgboost as xgb
from sklearn.metrics import mean_squared_error

# Initialize XGB with objective function
parameters = {"objective": 'reg:squarederror',
              "n_estimators": 100,
              "verbosity": 0}

model = xgb.XGBRegressor(**parameters)
model.fit(X_train, y_train)

# generate predictions
y_pred_train = model.predict(X_train).reshape(-1,1)
y_pred = model.predict(X_val).reshape(-1,1)

# calculate errors
rmse_train = mean_squared_error(y_pred_train, y_train, squared=False)
rmse_val = mean_squared_error(y_pred, y_val, squared=False)
print(f"rmse training: {rmse_train:.3f}\t rmse validation: {rmse_val:.3f}")

rmse training: 0.114     rmse validation: 0.185


/home/rajaraman/miniconda3/envs/ml3105/lib/python3.10/site-packages/sklearn/metrics/_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.
  warnings.warn(
/home/rajaraman/miniconda3/envs/ml3105/lib/python3.10/site-packages/sklearn/metrics/_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.
  warnings.warn(

X_train_full, y_train_full = preprocessing(ref_data[features], ref_data['price'], ohe)
ref_data['prediction'] = model.predict(X_train_full)
prod_data['prediction'] = model.predict(X_prod)
ref_data['price_log'] = np.log1p(ref_data['price'])
prod_data['price_log'] = np.log1p(prod_data['price'])

Shape after one-hot encoding
X shape: (15000, 27)


/tmp/ipykernel_993006/2150309139.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[['grade','view','waterfront']] = X[['grade','view','waterfront']].astype('object')

# !pip install evidently==0.1.50.dev0

import evidently

evidently.__version__

'0.1.50.dev0'

# !conda uninstall -y numpy

# !pip install numpy==1.26.3 --force
import numpy
numpy.__version__

'1.26.3'

from evidently.dashboard import Dashboard
from evidently.pipeline.column_mapping import ColumnMapping
# packages for interactive dashboards
from evidently.dashboard.tabs import (
     DataDriftTab, 
     DataQualityTab, 
     NumTargetDriftTab, 
     RegressionPerformanceTab
    )
# packages for json-profiles
from evidently.model_profile import Profile
from evidently.model_profile.sections import (  
     DataDriftProfileSection, 
     DataQualityProfileSection, 
     NumTargetDriftProfileSection,
     RegressionPerformanceProfileSection
)

/home/rajaraman/miniconda3/envs/ml3105/lib/python3.10/site-packages/evidently/utils/numpy_encoder.py:14: FutureWarning: In the future `np.bool` will be defined as the corresponding NumPy scalar.
  ((np.bool, np.bool_), bool),



---------------------------------------------------------------------------

AttributeError                            Traceback (most recent call last)

Cell In[23], line 1
----> 1 from evidently.dashboard import Dashboard
      2 from evidently.pipeline.column_mapping import ColumnMapping
      3 # packages for interactive dashboards


File ~/miniconda3/envs/ml3105/lib/python3.10/site-packages/evidently/dashboard/__init__.py:4
      1 #!/usr/bin/env python
      2 # coding: utf-8
----> 4 from .dashboard import Dashboard


File ~/miniconda3/envs/ml3105/lib/python3.10/site-packages/evidently/dashboard/dashboard.py:19
     17 from evidently.pipeline.column_mapping import ColumnMapping
     18 from evidently.dashboard.tabs.base_tab import Tab
---> 19 from evidently.utils import NumpyEncoder
     22 @dataclasses.dataclass()
     23 class TemplateParams:
     24     dashboard_id: str


File ~/miniconda3/envs/ml3105/lib/python3.10/site-packages/evidently/utils/__init__.py:1
----> 1 from .numpy_encoder import NumpyEncoder


File ~/miniconda3/envs/ml3105/lib/python3.10/site-packages/evidently/utils/numpy_encoder.py:14
      3 import numpy as np
      4 import pandas as pd
      7 _TYPES_MAPPING = (
      8     (
      9         (np.int_, np.intc, np.intp, np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint16, np.uint32, np.uint64),
     10         int,
     11     ),
     12     ((np.float_, np.float16, np.float32, np.float64), float),
     13     ((np.ndarray,), lambda obj: obj.tolist()),
---> 14     ((np.bool, np.bool_), bool),
     15     ((pd.Timestamp, pd.Timedelta), str),
     16     ((np.void, type(pd.NaT)), lambda obj: None),
     17 )
     20 class NumpyEncoder(json.JSONEncoder):
     21     """Numpy and Pandas data types to JSON types encoder"""


File ~/miniconda3/envs/ml3105/lib/python3.10/site-packages/numpy/__init__.py:324, in __getattr__(attr)
    319     warnings.warn(
    320         f"In the future `np.{attr}` will be defined as the "
    321         "corresponding NumPy scalar.", FutureWarning, stacklevel=2)
    323 if attr in __former_attrs__:
--> 324     raise AttributeError(__former_attrs__[attr])
    326 if attr == 'testing':
    327     import numpy.testing as testing


AttributeError: module 'numpy' has no attribute 'bool'.
`np.bool` was a deprecated alias for the builtin `bool`. To avoid this error in existing code, use `bool` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.bool_` here.
The aliases was originally deprecated in NumPy 1.20; for more details and guidance see the original release note at:
    https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations

Score: 25

Category: evidentlyai

Titnaic-Evi

Fri 14 November 2025

# https://github.com/datasciencedojo/datasets/blob/master/titanic.csv

import pandas as pd

!ls ../titanic.csv

../titanic.csv

# Load the dataset
data = pd.read_csv('../titanic.csv')

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0 …

Category: evidentlyai

House-Sales

Fri 14 November 2025

Titnaic-Evi

Fri 14 November 2025

Page 1 of 1