# https://github.com/datasciencedojo/datasets/blob/master/titanic.csv
# Load the Titanic passenger dataset into a DataFrame.
# `pd` is used here and below, but no pandas import is visible in this file,
# so import it at the point of first use; the statement is a harmless no-op
# if pandas has already been imported elsewhere.
import pandas as pd

data = pd.read_csv('../titanic.csv')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 PassengerId 891 non-null int64
1 Survived 891 non-null int64
2 Pclass 891 non-null int64
3 Name 891 non-null object
4 Sex 891 non-null object
5 Age 714 non-null float64
6 SibSp 891 non-null int64
7 Parch 891 non-null int64
8 Ticket 891 non-null object
9 Fare 891 non-null float64
10 Cabin 204 non-null object
11 Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
|
PassengerId |
Survived |
Pclass |
Name |
Sex |
Age |
SibSp |
Parch |
Ticket |
Fare |
Cabin |
Embarked |
| 0 |
1 |
0 |
3 |
Braund, Mr. Owen Harris |
0 |
22.0 |
1 |
0 |
A/5 21171 |
7.2500 |
Unknown |
S |
| 1 |
2 |
1 |
1 |
Cumings, Mrs. John Bradley (Florence Briggs Th... |
1 |
38.0 |
1 |
0 |
PC 17599 |
71.2833 |
C85 |
C |
| 2 |
3 |
1 |
3 |
Heikkinen, Miss. Laina |
1 |
26.0 |
0 |
0 |
STON/O2. 3101282 |
7.9250 |
Unknown |
S |
| 3 |
4 |
1 |
1 |
Futrelle, Mrs. Jacques Heath (Lily May Peel) |
1 |
35.0 |
1 |
0 |
113803 |
53.1000 |
C123 |
S |
| 4 |
5 |
0 |
3 |
Allen, Mr. William Henry |
0 |
35.0 |
0 |
0 |
373450 |
8.0500 |
Unknown |
S |
# Impute missing ages with the median age.
# The filled Series is assigned back to the column rather than calling
# fillna(..., inplace=True) on data['Age']: modifying a column selection in
# place is chained assignment, which recent pandas versions warn about and
# which does not update `data` when Copy-on-Write is enabled.
data['Age'] = data['Age'].fillna(data['Age'].median())

# Encode 'Sex' as integers: male -> 0, female -> 1.
# Values that are not keys of the mapping are passed through unchanged, so
# re-running this cell keeps the existing 0/1 codes. A plain Series.map(dict)
# would replace the already-encoded values with NaN on a second run.
sex_codes = {'male': 0, 'female': 1}
data['Sex'] = data['Sex'].map(lambda value: sex_codes.get(value, value))
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset
# Build the Evidently report from a single metric preset: data drift.
report = Report(
    metrics=[
        DataDriftPreset(),
    ],
)
PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 0
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 687
Embarked 2
dtype: int64
|
PassengerId |
Survived |
Pclass |
Sex |
Age |
SibSp |
Parch |
Fare |
| count |
891.000000 |
891.000000 |
891.000000 |
891.000000 |
891.000000 |
891.000000 |
891.000000 |
891.000000 |
| mean |
446.000000 |
0.383838 |
2.308642 |
0.352413 |
29.361582 |
0.523008 |
0.381594 |
32.204208 |
| std |
257.353842 |
0.486592 |
0.836071 |
0.477990 |
13.019697 |
1.102743 |
0.806057 |
49.693429 |
| min |
1.000000 |
0.000000 |
1.000000 |
0.000000 |
0.420000 |
0.000000 |
0.000000 |
0.000000 |
| 25% |
223.500000 |
0.000000 |
2.000000 |
0.000000 |
22.000000 |
0.000000 |
0.000000 |
7.910400 |
| 50% |
446.000000 |
0.000000 |
3.000000 |
0.000000 |
28.000000 |
0.000000 |
0.000000 |
14.454200 |
| 75% |
668.500000 |
1.000000 |
3.000000 |
1.000000 |
35.000000 |
1.000000 |
0.000000 |
31.000000 |
| max |
891.000000 |
1.000000 |
3.000000 |
1.000000 |
80.000000 |
8.000000 |
6.000000 |
512.329200 |
# Replace missing cabin labels with the explicit placeholder 'Unknown'.
cabin_filled = data['Cabin'].fillna('Unknown')
data['Cabin'] = cabin_filled

# Replace missing embarkation ports with the most frequent port.
# mode() returns a Series of the most common value(s); take the first one.
most_frequent_port = data['Embarked'].mode()[0]
data['Embarked'] = data['Embarked'].fillna(most_frequent_port)
PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 0
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 0
Embarked 0
dtype: int64
PassengerId 891
Survived 2
Pclass 3
Name 891
Sex 2
Age 88
SibSp 7
Parch 7
Ticket 681
Fare 248
Cabin 148
Embarked 3
dtype: int64
# Partition the rows into a reference set and a current set.
# A seeded random draw selects 70% of the rows as the reference set, so the
# split is reproducible across runs.
reference_fraction = 0.7
split_seed = 42
reference_data = data.sample(frac=reference_fraction, random_state=split_seed)

# Rows whose index labels were not drawn form the current set (remaining 30%).
current_data = data.drop(index=reference_data.index)

# Summary statistics for every reference column, numeric and non-numeric.
reference_data.describe(include='all')
|
PassengerId |
Survived |
Pclass |
Name |
Sex |
Age |
SibSp |
Parch |
Ticket |
Fare |
Cabin |
Embarked |
| count |
624.000000 |
624.000000 |
624.000000 |
624 |
624.000000 |
624.000000 |
624.000000 |
624.000000 |
624 |
624.000000 |
624 |
624 |
| unique |
NaN |
NaN |
NaN |
624 |
NaN |
NaN |
NaN |
NaN |
509 |
NaN |
113 |
3 |
| top |
NaN |
NaN |
NaN |
Greenberg, Mr. Samuel |
NaN |
NaN |
NaN |
NaN |
382652 |
NaN |
Unknown |
S |
| freq |
NaN |
NaN |
NaN |
1 |
NaN |
NaN |
NaN |
NaN |
5 |
NaN |
488 |
440 |
| mean |
438.080128 |
0.391026 |
2.314103 |
NaN |
0.360577 |
29.239984 |
0.504808 |
0.400641 |
NaN |
31.846708 |
NaN |
NaN |
| std |
258.247396 |
0.488372 |
0.831209 |
NaN |
0.480553 |
13.146780 |
1.018280 |
0.815027 |
NaN |
49.422675 |
NaN |
NaN |
| min |
1.000000 |
0.000000 |
1.000000 |
NaN |
0.000000 |
0.750000 |
0.000000 |
0.000000 |
NaN |
0.000000 |
NaN |
NaN |
| 25% |
213.750000 |
0.000000 |
2.000000 |
NaN |
0.000000 |
22.000000 |
0.000000 |
0.000000 |
NaN |
7.925000 |
NaN |
NaN |
| 50% |
434.500000 |
0.000000 |
3.000000 |
NaN |
0.000000 |
28.000000 |
0.000000 |
0.000000 |
NaN |
14.500000 |
NaN |
NaN |
| 75% |
658.250000 |
1.000000 |
3.000000 |
NaN |
1.000000 |
35.000000 |
1.000000 |
1.000000 |
NaN |
30.178100 |
NaN |
NaN |
| max |
891.000000 |
1.000000 |
3.000000 |
NaN |
1.000000 |
80.000000 |
8.000000 |
6.000000 |
NaN |
512.329200 |
NaN |
NaN |
# Count missing values per column in the current set.
current_data.isna().sum()
PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 0
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 0
Embarked 0
dtype: int64
# Count missing values per column in the reference set.
reference_data.isna().sum()
PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 0
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 0
Embarked 0
dtype: int64
# Run the report on feature columns only.
# PassengerId, Name, Ticket and Cabin are identifiers or near-unique labels
# (891, 891, 681 and 148 distinct values across 891 rows), so most of their
# values in the current split never occur in the reference split. Comparing
# their category frequencies gives expected counts of zero, which is the
# likely source of SciPy's "divide by zero encountered in divide" warnings,
# and drift in such columns carries no information about the data itself.
non_feature_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin']
drift_columns = [
    column for column in reference_data.columns
    if column not in non_feature_columns
]
report.run(
    reference_data=reference_data[drift_columns],
    current_data=current_data[drift_columns],
)
/home/rajaraman/miniconda3/envs/ml312/lib/python3.12/site-packages/scipy/stats/_stats_py.py:8064: RuntimeWarning: divide by zero encountered in divide
terms = (f_obs_float - f_exp)**2 / f_exp
/home/rajaraman/miniconda3/envs/ml312/lib/python3.12/site-packages/scipy/stats/_stats_py.py:8064: RuntimeWarning: divide by zero encountered in divide
terms = (f_obs_float - f_exp)**2 / f_exp
/home/rajaraman/miniconda3/envs/ml312/lib/python3.12/site-packages/scipy/stats/_stats_py.py:8064: RuntimeWarning: divide by zero encountered in divide
terms = (f_obs_float - f_exp)**2 / f_exp
/home/rajaraman/miniconda3/envs/ml312/lib/python3.12/site-packages/scipy/stats/_stats_py.py:8064: RuntimeWarning: divide by zero encountered in divide
terms = (f_obs_float - f_exp)**2 / f_exp
/home/rajaraman/miniconda3/envs/ml312/lib/python3.12/site-packages/scipy/stats/_stats_py.py:8064: RuntimeWarning: divide by zero encountered in divide
terms = (f_obs_float - f_exp)**2 / f_exp
/home/rajaraman/miniconda3/envs/ml312/lib/python3.12/site-packages/scipy/stats/_stats_py.py:8064: RuntimeWarning: divide by zero encountered in divide
terms = (f_obs_float - f_exp)**2 / f_exp
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset
# Initialize a fresh report with the Data Drift preset.
report = Report(metrics=[DataDriftPreset()])

# Run the report on feature columns only.
# PassengerId, Name, Ticket and Cabin are identifiers or near-unique labels
# (891, 891, 681 and 148 distinct values across 891 rows), so most of their
# values in the current split never occur in the reference split. Comparing
# their category frequencies gives expected counts of zero, which is the
# likely source of SciPy's "divide by zero encountered in divide" warnings,
# and drift in such columns carries no information about the data itself.
non_feature_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin']
drift_columns = [
    column for column in reference_data.columns
    if column not in non_feature_columns
]
report.run(
    reference_data=reference_data[drift_columns],
    current_data=current_data[drift_columns],
)
/home/rajaraman/miniconda3/envs/ml312/lib/python3.12/site-packages/scipy/stats/_stats_py.py:8064: RuntimeWarning: divide by zero encountered in divide
terms = (f_obs_float - f_exp)**2 / f_exp
/home/rajaraman/miniconda3/envs/ml312/lib/python3.12/site-packages/scipy/stats/_stats_py.py:8064: RuntimeWarning: divide by zero encountered in divide
terms = (f_obs_float - f_exp)**2 / f_exp
/home/rajaraman/miniconda3/envs/ml312/lib/python3.12/site-packages/scipy/stats/_stats_py.py:8064: RuntimeWarning: divide by zero encountered in divide
terms = (f_obs_float - f_exp)**2 / f_exp
/home/rajaraman/miniconda3/envs/ml312/lib/python3.12/site-packages/scipy/stats/_stats_py.py:8064: RuntimeWarning: divide by zero encountered in divide
terms = (f_obs_float - f_exp)**2 / f_exp
/home/rajaraman/miniconda3/envs/ml312/lib/python3.12/site-packages/scipy/stats/_stats_py.py:8064: RuntimeWarning: divide by zero encountered in divide
terms = (f_obs_float - f_exp)**2 / f_exp
/home/rajaraman/miniconda3/envs/ml312/lib/python3.12/site-packages/scipy/stats/_stats_py.py:8064: RuntimeWarning: divide by zero encountered in divide
terms = (f_obs_float - f_exp)**2 / f_exp
# Write the drift report to a standalone HTML file in the working directory.
report_path = 'titanic_data_drift_report.html'
report.save_html(report_path)
<class 'pandas.core.frame.DataFrame'>
Index: 624 entries, 709 to 714
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 PassengerId 624 non-null int64
1 Survived 624 non-null int64
2 Pclass 624 non-null int64
3 Name 624 non-null object
4 Sex 624 non-null int64
5 Age 624 non-null float64
6 SibSp 624 non-null int64
7 Parch 624 non-null int64
8 Ticket 624 non-null object
9 Fare 624 non-null float64
10 Cabin 624 non-null object
11 Embarked 624 non-null object
dtypes: float64(2), int64(6), object(4)
memory usage: 63.4+ KB
<class 'pandas.core.frame.DataFrame'>
Index: 267 entries, 1 to 887
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 PassengerId 267 non-null int64
1 Survived 267 non-null int64
2 Pclass 267 non-null int64
3 Name 267 non-null object
4 Sex 267 non-null int64
5 Age 267 non-null float64
6 SibSp 267 non-null int64
7 Parch 267 non-null int64
8 Ticket 267 non-null object
9 Fare 267 non-null float64
10 Cabin 267 non-null object
11 Embarked 267 non-null object
dtypes: float64(2), int64(6), object(4)
memory usage: 27.1+ KB
Score: 30