House-Sales

Sat 17 May 2025
# !pip install kaggle
# https://medium.com/@pumaline/easy-analysis-of-your-data-and-ml-model-using-evidently-ai-830ef0c1c4fd
# https://xgboost.readthedocs.io/en/latest/python/python_api.html
import pandas as pd
# Load the Kaggle kc_house_data CSV into a DataFrame; every later cell works from `df`.
df = pd.read_csv('~/datasets/kaggle/kc_house_data.csv')
# Preview the first rows to sanity-check the load (displayed as the cell output).
df.head()
id date price bedrooms bathrooms sqft_living sqft_lot floors waterfront view ... grade sqft_above sqft_basement yr_built yr_renovated zipcode lat long sqft_living15 sqft_lot15
0 7129300520 20141013T000000 221900.0 3 1.00 1180 5650 1.0 0 0 ... 7 1180 0 1955 0 98178 47.5112 -122.257 1340 5650
1 6414100192 20141209T000000 538000.0 3 2.25 2570 7242 2.0 0 0 ... 7 2170 400 1951 1991 98125 47.7210 -122.319 1690 7639
2 5631500400 20150225T000000 180000.0 2 1.00 770 10000 1.0 0 0 ... 6 770 0 1933 0 98028 47.7379 -122.233 2720 8062
3 2487200875 20141209T000000 604000.0 4 3.00 1960 5000 1.0 0 0 ... 7 1050 910 1965 0 98136 47.5208 -122.393 1360 5000
4 1954400510 20150218T000000 510000.0 3 2.00 1680 8080 1.0 0 0 ... 8 1680 0 1987 0 98074 47.6168 -122.045 1800 7503

5 rows × 21 columns

# Treat grade / view / waterfront as categorical labels rather than numbers
# by storing them with object dtype; all other columns keep their dtype.
df = df.astype({col: 'object' for col in ('grade', 'view', 'waterfront')})
df.head()
id date price bedrooms bathrooms sqft_living sqft_lot floors waterfront view ... grade sqft_above sqft_basement yr_built yr_renovated zipcode lat long sqft_living15 sqft_lot15
0 7129300520 20141013T000000 221900.0 3 1.00 1180 5650 1.0 0 0 ... 7 1180 0 1955 0 98178 47.5112 -122.257 1340 5650
1 6414100192 20141209T000000 538000.0 3 2.25 2570 7242 2.0 0 0 ... 7 2170 400 1951 1991 98125 47.7210 -122.319 1690 7639
2 5631500400 20150225T000000 180000.0 2 1.00 770 10000 1.0 0 0 ... 6 770 0 1933 0 98028 47.7379 -122.233 2720 8062
3 2487200875 20141209T000000 604000.0 4 3.00 1960 5000 1.0 0 0 ... 7 1050 910 1965 0 98136 47.5208 -122.393 1360 5000
4 1954400510 20150218T000000 510000.0 3 2.00 1680 8080 1.0 0 0 ... 8 1680 0 1987 0 98074 47.6168 -122.045 1800 7503

5 rows × 21 columns

from sklearn import model_selection

# Columns used as model inputs (three categorical, nine numerical).
features = ['sqft_living', 'grade', 'sqft_above', 'sqft_living15', 'bathrooms',
            'view', 'sqft_basement', 'lat', 'long', 'waterfront', 'yr_built',
            'bedrooms']

# Delete entry with 33 bedrooms.
# This has to happen BEFORE the reference/production split: the slices below
# are taken from `df` at the moment they are created, so filtering `df`
# afterwards would leave the outlier in whichever slice contains it.
df = df[df["bedrooms"] != 33]

# Reference data = first 15,000 rows (used for training/validation);
# production data = the remaining rows (used to simulate new data).
# `.copy()` makes both independent frames, so adding columns to them later
# does not write into a slice of `df` (no SettingWithCopyWarning).
ref_data = df[:15000].copy()
prod_data = df[15000:].copy()

# Create training and validation set
X_train, X_val, y_train, y_val = model_selection.train_test_split(
    ref_data[features], ref_data['price'],
    test_size=0.2, shuffle=True, random_state=42)
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# Fit the one-hot encoder on the training rows only, so validation and
# production data are encoded with the categories seen during training.
# handle_unknown='ignore' keeps transform() from raising on unseen categories.
categorical = ['grade', 'view', 'waterfront']
ohe = OneHotEncoder(handle_unknown='ignore').fit(X_train[categorical])
def preprocessing(X, y, ohe):
    """Build the model-ready feature matrix and the log-transformed target.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame containing the categorical columns ``grade``, ``view``
        and ``waterfront`` plus the numerical columns listed in ``numerical``
        below. It is NOT modified by this function.
    y : array-like
        Target values; returned as ``np.log1p(y)``.
    ohe : fitted encoder
        Object whose ``transform(frame)`` returns a result exposing
        ``.toarray()`` (e.g. a fitted ``OneHotEncoder`` with sparse output).

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is a 2-D NumPy array with the one-hot encoded
        categorical columns first, followed by the numerical columns, and
        ``y`` is the log1p-transformed target.
    """
    # define categorical and numerical variables
    categorical = ['grade', 'view', 'waterfront']
    numerical = ['sqft_living', 'sqft_above', 'sqft_living15',
                 'bathrooms', 'sqft_basement', 'lat', 'long',
                 'yr_built', 'bedrooms']

    # Convert grade, view, waterfront to type object on a fresh selection.
    # Assigning the cast columns back into X (as before) mutated the caller's
    # frame and raised SettingWithCopyWarning whenever X was a slice.
    X_cat_input = X[categorical].astype('object')

    # log transform the target variable
    y = np.log1p(y)

    # one-hot encode categorical variables with the already-fitted encoder
    X_cat = ohe.transform(X_cat_input).toarray()

    # numerical columns are passed through unchanged
    X_num = np.array(X[numerical])

    # concatenate categorical (first) and numerical (last) variables
    X = np.concatenate([X_cat, X_num], axis=1)

    print('Shape after one-hot encoding')
    print(f'X shape: {X.shape}')

    return X, y
import numpy as np
# Run the same preprocessing (shared fitted encoder) over the training,
# validation and production sets, in that order.
X_train, y_train = preprocessing(X_train, y_train, ohe)
X_val, y_val = preprocessing(X_val, y_val, ohe)
prod_features, prod_target = prod_data[features], prod_data['price']
X_prod, y_prod = preprocessing(prod_features, prod_target, ohe)
Shape after one-hot encoding
X shape: (12000, 27)
Shape after one-hot encoding
X shape: (3000, 27)
Shape after one-hot encoding
X shape: (6613, 27)


/tmp/ipykernel_993006/2150309139.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[['grade','view','waterfront']] = X[['grade','view','waterfront']].astype('object')
# !pip install xgboost
import xgboost as xgb
from sklearn.metrics import mean_squared_error
# Initialize XGB with objective function
parameters = {
    "objective": 'reg:squarederror',
    "n_estimators": 100,
    "verbosity": 0,
}

model = xgb.XGBRegressor(**parameters)
model.fit(X_train, y_train)

# generate predictions (on the log1p(price) scale the model was trained on)
y_pred_train = model.predict(X_train).reshape(-1, 1)
y_pred = model.predict(X_val).reshape(-1, 1)

# calculate errors
# RMSE is taken as the square root of the MSE instead of passing
# `squared=False`: that argument is deprecated since scikit-learn 1.4 and
# removed in 1.6, so this form works on old and new versions alike.
# Arguments follow the documented (y_true, y_pred) order.
rmse_train = mean_squared_error(y_train, y_pred_train) ** 0.5
rmse_val = mean_squared_error(y_val, y_pred) ** 0.5
print(f"rmse training: {rmse_train:.3f}\t rmse validation: {rmse_val:.3f}")
rmse training: 0.114     rmse validation: 0.185


/home/rajaraman/miniconda3/envs/ml3105/lib/python3.10/site-packages/sklearn/metrics/_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.
  warnings.warn(
/home/rajaraman/miniconda3/envs/ml3105/lib/python3.10/site-packages/sklearn/metrics/_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.
  warnings.warn(
# Preprocess the whole reference set (train + validation rows together) so the
# model can score every reference row. y_train_full is not used below.
X_train_full, y_train_full = preprocessing(ref_data[features], ref_data['price'], ohe)
# Store model predictions next to the raw data. The model was fitted on
# log1p(price), so these predictions are on the log scale.
ref_data['prediction'] = model.predict(X_train_full)
prod_data['prediction'] = model.predict(X_prod)
# Add the actual target on the same log1p scale as the predictions.
ref_data['price_log'] = np.log1p(ref_data['price'])
prod_data['price_log'] = np.log1p(prod_data['price'])
Shape after one-hot encoding
X shape: (15000, 27)


/tmp/ipykernel_993006/2150309139.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[['grade','view','waterfront']] = X[['grade','view','waterfront']].astype('object')
# !pip install evidently==0.1.50.dev0
import evidently

evidently.__version__
'0.1.50.dev0'
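# !conda uninstall -y numpy
# !pip install numpy==1.26.3 --force
# NOTE: evidently 0.1.50.dev0 still references the `np.bool` alias, which NumPy
# removed in 1.24. With numpy 1.26.3 the evidently imports below fail with
# AttributeError; pin numpy<1.24 (e.g. 1.23.5) or upgrade evidently to avoid it.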
import numpy
numpy.__version__
'1.26.3'


from evidently.dashboard import Dashboard
from evidently.pipeline.column_mapping import ColumnMapping
# packages for interactive dashboards
from evidently.dashboard.tabs import (
     DataDriftTab, 
     DataQualityTab, 
     NumTargetDriftTab, 
     RegressionPerformanceTab
    )
# packages for json-profiles
from evidently.model_profile import Profile
from evidently.model_profile.sections import (  
     DataDriftProfileSection, 
     DataQualityProfileSection, 
     NumTargetDriftProfileSection,
     RegressionPerformanceProfileSection
)
/home/rajaraman/miniconda3/envs/ml3105/lib/python3.10/site-packages/evidently/utils/numpy_encoder.py:14: FutureWarning: In the future `np.bool` will be defined as the corresponding NumPy scalar.
  ((np.bool, np.bool_), bool),



---------------------------------------------------------------------------

AttributeError                            Traceback (most recent call last)

Cell In[23], line 1
----> 1 from evidently.dashboard import Dashboard
      2 from evidently.pipeline.column_mapping import ColumnMapping
      3 # packages for interactive dashboards


File ~/miniconda3/envs/ml3105/lib/python3.10/site-packages/evidently/dashboard/__init__.py:4
      1 #!/usr/bin/env python
      2 # coding: utf-8
----> 4 from .dashboard import Dashboard


File ~/miniconda3/envs/ml3105/lib/python3.10/site-packages/evidently/dashboard/dashboard.py:19
     17 from evidently.pipeline.column_mapping import ColumnMapping
     18 from evidently.dashboard.tabs.base_tab import Tab
---> 19 from evidently.utils import NumpyEncoder
     22 @dataclasses.dataclass()
     23 class TemplateParams:
     24     dashboard_id: str


File ~/miniconda3/envs/ml3105/lib/python3.10/site-packages/evidently/utils/__init__.py:1
----> 1 from .numpy_encoder import NumpyEncoder


File ~/miniconda3/envs/ml3105/lib/python3.10/site-packages/evidently/utils/numpy_encoder.py:14
      3 import numpy as np
      4 import pandas as pd
      7 _TYPES_MAPPING = (
      8     (
      9         (np.int_, np.intc, np.intp, np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint16, np.uint32, np.uint64),
     10         int,
     11     ),
     12     ((np.float_, np.float16, np.float32, np.float64), float),
     13     ((np.ndarray,), lambda obj: obj.tolist()),
---> 14     ((np.bool, np.bool_), bool),
     15     ((pd.Timestamp, pd.Timedelta), str),
     16     ((np.void, type(pd.NaT)), lambda obj: None),
     17 )
     20 class NumpyEncoder(json.JSONEncoder):
     21     """Numpy and Pandas data types to JSON types encoder"""


File ~/miniconda3/envs/ml3105/lib/python3.10/site-packages/numpy/__init__.py:324, in __getattr__(attr)
    319     warnings.warn(
    320         f"In the future `np.{attr}` will be defined as the "
    321         "corresponding NumPy scalar.", FutureWarning, stacklevel=2)
    323 if attr in __former_attrs__:
--> 324     raise AttributeError(__former_attrs__[attr])
    326 if attr == 'testing':
    327     import numpy.testing as testing


AttributeError: module 'numpy' has no attribute 'bool'.
`np.bool` was a deprecated alias for the builtin `bool`. To avoid this error in existing code, use `bool` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.bool_` here.
The aliases was originally deprecated in NumPy 1.20; for more details and guidance see the original release note at:
    https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations




Score: 25

Category: evidentlyai


Titanic-Evi

Sat 17 May 2025

# https://github.com/datasciencedojo/datasets/blob/master/titanic.csv
import pandas as pd
!ls ../titanic.csv
../titanic.csv
# Load the dataset
data = pd.read_csv('../titanic.csv')
# Print the column names, non-null counts and dtypes to check for missing data.
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0 …

Category: evidentlyai

Read More
Page 1 of 1