Titanic-Evi

Sat 17 May 2025

# https://github.com/datasciencedojo/datasets/blob/master/titanic.csv
import pandas as pd
!ls ../titanic.csv
../titanic.csv
# Read the Titanic passenger CSV into a DataFrame, then summarise its columns
csv_path = '../titanic.csv'
data = pd.read_csv(csv_path)
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
# Preview the first rows of the DataFrame
data.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris 0 22.0 1 0 A/5 21171 7.2500 Unknown S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... 1 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina 1 26.0 0 0 STON/O2. 3101282 7.9250 Unknown S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) 1 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry 0 35.0 0 0 373450 8.0500 Unknown S
# Fill missing ages with the median age so 'Age' contains no NaN afterwards.
data['Age'] = data['Age'].fillna(data['Age'].median())
# Encode 'Sex' as an integer: male -> 0, female -> 1.
# Series.map turns every value that is not a key of the dict into NaN, so
# re-running this cell on the already-encoded column would wipe it out.
# Only encode while the column still holds the original string labels.
if not pd.api.types.is_numeric_dtype(data['Sex']):
    data['Sex'] = data['Sex'].map({'male': 0, 'female': 1})

from evidently.report import Report
from evidently.metric_preset import DataDriftPreset
# Initialize the report with the Data Drift preset.
# Restrict the drift checks to real feature columns: PassengerId and Name are
# unique per row and Ticket/Cabin are high-cardinality, so they carry no
# meaningful drift signal. Their unseen categories are also the likely cause
# of scipy's "divide by zero" RuntimeWarning in the chi-square test.
drift_columns = ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
report = Report(metrics=[DataDriftPreset(columns=drift_columns)])
# Count the missing values in each column
data.isnull().sum()
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns
data.describe()
PassengerId Survived Pclass Sex Age SibSp Parch Fare
count 891.000000 891.000000 891.000000 891.000000 891.000000 891.000000 891.000000 891.000000
mean 446.000000 0.383838 2.308642 0.352413 29.361582 0.523008 0.381594 32.204208
std 257.353842 0.486592 0.836071 0.477990 13.019697 1.102743 0.806057 49.693429
min 1.000000 0.000000 1.000000 0.000000 0.420000 0.000000 0.000000 0.000000
25% 223.500000 0.000000 2.000000 0.000000 22.000000 0.000000 0.000000 7.910400
50% 446.000000 0.000000 3.000000 0.000000 28.000000 0.000000 0.000000 14.454200
75% 668.500000 1.000000 3.000000 1.000000 35.000000 1.000000 0.000000 31.000000
max 891.000000 1.000000 3.000000 1.000000 80.000000 8.000000 6.000000 512.329200
# Replace missing cabins with a placeholder label and missing embarkation
# ports with the most frequent port, then re-count nulls per column
most_common_port = data['Embarked'].mode()[0]
data['Cabin'] = data['Cabin'].fillna('Unknown')
data['Embarked'] = data['Embarked'].fillna(most_common_port)
data.isnull().sum()
PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64
# Count the distinct values in each column
data.nunique()
PassengerId    891
Survived         2
Pclass           3
Name           891
Sex              2
Age             88
SibSp            7
Parch            7
Ticket         681
Fare           248
Cabin          148
Embarked         3
dtype: int64
# Randomly pick 70% of the rows (fixed seed) as the reference set; the rows
# not selected (the remaining 30%) form the current set
reference_data = data.sample(frac=0.7, random_state=42)
in_reference = data.index.isin(reference_data.index)
current_data = data[~in_reference]
reference_data.describe(include='all')
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
count 624.000000 624.000000 624.000000 624 624.000000 624.000000 624.000000 624.000000 624 624.000000 624 624
unique NaN NaN NaN 624 NaN NaN NaN NaN 509 NaN 113 3
top NaN NaN NaN Greenberg, Mr. Samuel NaN NaN NaN NaN 382652 NaN Unknown S
freq NaN NaN NaN 1 NaN NaN NaN NaN 5 NaN 488 440
mean 438.080128 0.391026 2.314103 NaN 0.360577 29.239984 0.504808 0.400641 NaN 31.846708 NaN NaN
std 258.247396 0.488372 0.831209 NaN 0.480553 13.146780 1.018280 0.815027 NaN 49.422675 NaN NaN
min 1.000000 0.000000 1.000000 NaN 0.000000 0.750000 0.000000 0.000000 NaN 0.000000 NaN NaN
25% 213.750000 0.000000 2.000000 NaN 0.000000 22.000000 0.000000 0.000000 NaN 7.925000 NaN NaN
50% 434.500000 0.000000 3.000000 NaN 0.000000 28.000000 0.000000 0.000000 NaN 14.500000 NaN NaN
75% 658.250000 1.000000 3.000000 NaN 1.000000 35.000000 1.000000 1.000000 NaN 30.178100 NaN NaN
max 891.000000 1.000000 3.000000 NaN 1.000000 80.000000 8.000000 6.000000 NaN 512.329200 NaN NaN
# Count the missing values per column in the current split
current_data.isnull().sum()
PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

# Count the missing values per column in the reference split
reference_data.isnull().sum()
PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64
# Compute the report metrics, comparing the current split against the reference split
report.run(
    reference_data=reference_data,
    current_data=current_data,
)
/home/rajaraman/miniconda3/envs/ml312/lib/python3.12/site-packages/scipy/stats/_stats_py.py:8064: RuntimeWarning: divide by zero encountered in divide
  terms = (f_obs_float - f_exp)**2 / f_exp
/home/rajaraman/miniconda3/envs/ml312/lib/python3.12/site-packages/scipy/stats/_stats_py.py:8064: RuntimeWarning: divide by zero encountered in divide
  terms = (f_obs_float - f_exp)**2 / f_exp
/home/rajaraman/miniconda3/envs/ml312/lib/python3.12/site-packages/scipy/stats/_stats_py.py:8064: RuntimeWarning: divide by zero encountered in divide
  terms = (f_obs_float - f_exp)**2 / f_exp
/home/rajaraman/miniconda3/envs/ml312/lib/python3.12/site-packages/scipy/stats/_stats_py.py:8064: RuntimeWarning: divide by zero encountered in divide
  terms = (f_obs_float - f_exp)**2 / f_exp
/home/rajaraman/miniconda3/envs/ml312/lib/python3.12/site-packages/scipy/stats/_stats_py.py:8064: RuntimeWarning: divide by zero encountered in divide
  terms = (f_obs_float - f_exp)**2 / f_exp
/home/rajaraman/miniconda3/envs/ml312/lib/python3.12/site-packages/scipy/stats/_stats_py.py:8064: RuntimeWarning: divide by zero encountered in divide
  terms = (f_obs_float - f_exp)**2 / f_exp
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset
# Initialize the report with the Data Drift preset.
# Restrict the drift checks to real feature columns: PassengerId and Name are
# unique per row and Ticket/Cabin are high-cardinality, so they carry no
# meaningful drift signal. Their unseen categories are also the likely cause
# of scipy's "divide by zero" RuntimeWarning in the chi-square test.
drift_columns = ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
report = Report(metrics=[DataDriftPreset(columns=drift_columns)])
# Run the report: compare the current split against the reference split
report.run(reference_data=reference_data, current_data=current_data)
/home/rajaraman/miniconda3/envs/ml312/lib/python3.12/site-packages/scipy/stats/_stats_py.py:8064: RuntimeWarning: divide by zero encountered in divide
  terms = (f_obs_float - f_exp)**2 / f_exp
/home/rajaraman/miniconda3/envs/ml312/lib/python3.12/site-packages/scipy/stats/_stats_py.py:8064: RuntimeWarning: divide by zero encountered in divide
  terms = (f_obs_float - f_exp)**2 / f_exp
/home/rajaraman/miniconda3/envs/ml312/lib/python3.12/site-packages/scipy/stats/_stats_py.py:8064: RuntimeWarning: divide by zero encountered in divide
  terms = (f_obs_float - f_exp)**2 / f_exp
/home/rajaraman/miniconda3/envs/ml312/lib/python3.12/site-packages/scipy/stats/_stats_py.py:8064: RuntimeWarning: divide by zero encountered in divide
  terms = (f_obs_float - f_exp)**2 / f_exp
/home/rajaraman/miniconda3/envs/ml312/lib/python3.12/site-packages/scipy/stats/_stats_py.py:8064: RuntimeWarning: divide by zero encountered in divide
  terms = (f_obs_float - f_exp)**2 / f_exp
/home/rajaraman/miniconda3/envs/ml312/lib/python3.12/site-packages/scipy/stats/_stats_py.py:8064: RuntimeWarning: divide by zero encountered in divide
  terms = (f_obs_float - f_exp)**2 / f_exp
# Write the rendered report to an HTML file in the working directory
report_path = 'titanic_data_drift_report.html'
report.save_html(report_path)

# Show column dtypes and non-null counts for the reference split
reference_data.info()
<class 'pandas.core.frame.DataFrame'>
Index: 624 entries, 709 to 714
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  624 non-null    int64  
 1   Survived     624 non-null    int64  
 2   Pclass       624 non-null    int64  
 3   Name         624 non-null    object 
 4   Sex          624 non-null    int64  
 5   Age          624 non-null    float64
 6   SibSp        624 non-null    int64  
 7   Parch        624 non-null    int64  
 8   Ticket       624 non-null    object 
 9   Fare         624 non-null    float64
 10  Cabin        624 non-null    object 
 11  Embarked     624 non-null    object 
dtypes: float64(2), int64(6), object(4)
memory usage: 63.4+ KB
# Show column dtypes and non-null counts for the current split
current_data.info()
<class 'pandas.core.frame.DataFrame'>
Index: 267 entries, 1 to 887
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  267 non-null    int64  
 1   Survived     267 non-null    int64  
 2   Pclass       267 non-null    int64  
 3   Name         267 non-null    object 
 4   Sex          267 non-null    int64  
 5   Age          267 non-null    float64
 6   SibSp        267 non-null    int64  
 7   Parch        267 non-null    int64  
 8   Ticket       267 non-null    object 
 9   Fare         267 non-null    float64
 10  Cabin        267 non-null    object 
 11  Embarked     267 non-null    object 
dtypes: float64(2), int64(6), object(4)
memory usage: 27.1+ KB



Score: 30

Category: evidentlyai