3-Bayesian-Optimization-1
Sat 17 May 2025
# Record which conda environment and Python build this notebook ran under
# (the captured output of the call is shown on the line that follows).
import pyutil as pyu
pyu.get_local_pyinfo()
'conda env: ml312-2024; pyv: 3.12.7 | packaged by Anaconda, Inc. | (main, Oct 4 2024, 13:27:36) [GCC 11.2.0]'
# !pip install optuna
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
import optuna
# Load the Titanic dataset
url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
data = pd.read_csv(url)

# Preprocessing
# Select features and target.
# Take an explicit copy of the feature columns: assigning into a bare
# `data[features]` selection is the chained-assignment pattern that raises
# pandas' SettingWithCopyWarning and leaves it ambiguous whether `data`
# itself gets modified.
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
X = data[features].copy()
y = data['Survived']

# Handle missing values: median for Age, most frequent value for Embarked.
# NOTE(review): these fill statistics are computed on the full dataset before
# the train/test split below, so the held-out rows contribute to them.
X.loc[:, 'Age'] = X['Age'].fillna(X['Age'].median())
X.loc[:, 'Embarked'] = X['Embarked'].fillna(X['Embarked'].mode()[0])

# Convert categorical variables to numerical (one-hot, dropping the first
# level of each column)
X = pd.get_dummies(X, columns=['Sex', 'Embarked'], drop_first=True)

# Train-test split: 20% held out, fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Define the objective function for Bayesian Optimization
def objective(trial):
    """Score one hyperparameter configuration proposed by an Optuna trial.

    Asks ``trial`` for ``n_estimators`` (100-500), ``max_depth`` (5-20) and
    ``min_samples_split`` (2-10), builds a ``RandomForestClassifier`` from
    them with ``random_state=42``, and evaluates it with 5-fold
    cross-validated accuracy on the module-level ``X_train`` / ``y_train``.

    Returns the mean accuracy over the five folds.
    """
    # Hyperparameters to optimize, requested in a fixed order
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 5, 20),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
    }

    # Model built from the suggested hyperparameters
    forest = RandomForestClassifier(random_state=42, **params)

    # 5-fold cross-validation on the training split
    fold_scores = cross_val_score(
        forest,
        X_train,
        y_train,
        cv=5,
        scoring='accuracy',
        n_jobs=-1,
    )

    # Mean accuracy across folds is the value the study optimizes
    return fold_scores.mean()
# Create the Optuna study.
# Seed the sampler explicitly: the model and the train/test split are pinned
# with random_state=42, but an unseeded sampler proposes a different sequence
# of trials on every run, so the best parameters would not be reproducible.
# TPESampler is the sampler Optuna uses by default for single-objective
# studies, so this only fixes the seed.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)
[I 2024-12-06 12:22:46,877] A new study created in memory with name: no-name-8d2628a4-bf66-4273-8f97-99dfa629bd7a
[I 2024-12-06 12:22:48,446] Trial 0 finished with value: 0.8229685807150595 and parameters: {'n_estimators': 253, 'max_depth': 9, 'min_samples_split': 10}. Best is trial 0 with value: 0.8229685807150595.
[I 2024-12-06 12:22:49,184] Trial 1 finished with value: 0.8243967300305328 and parameters: {'n_estimators': 166, 'max_depth': 7, 'min_samples_split': 6}. Best is trial 1 with value: 0.8243967300305328.
[I 2024-12-06 12:22:50,066] Trial 2 finished with value: 0.8104205653501427 and parameters: {'n_estimators': 247, 'max_depth': 16, 'min_samples_split': 3}. Best is trial 1 with value: 0.8243967300305328.
[I 2024-12-06 12:22:52,141] Trial 3 finished with value: 0.8187826258248794 and parameters: {'n_estimators': 363, 'max_depth': 8, 'min_samples_split': 6}. Best is trial 1 with value: 0.8243967300305328.
[I 2024-12-06 12:22:54,509] Trial 4 finished with value: 0.8160149709445484 and parameters: {'n_estimators': 388, 'max_depth': 15, 'min_samples_split': 6}. Best is trial 1 with value: 0.8243967300305328.
[I 2024-12-06 12:22:55,995] Trial 5 finished with value: 0.8104107160445189 and parameters: {'n_estimators': 264, 'max_depth': 18, 'min_samples_split': 3}. Best is trial 1 with value: 0.8243967300305328.
[I 2024-12-06 12:22:58,015] Trial 6 finished with value: 0.8160248202501723 and parameters: {'n_estimators': 383, 'max_depth': 20, 'min_samples_split': 6}. Best is trial 1 with value: 0.8243967300305328.
[I 2024-12-06 12:22:59,966] Trial 7 finished with value: 0.8229882793263075 and parameters: {'n_estimators': 373, 'max_depth': 9, 'min_samples_split': 9}. Best is trial 1 with value: 0.8243967300305328.
[I 2024-12-06 12:23:01,241] Trial 8 finished with value: 0.8188417216586231 and parameters: {'n_estimators': 259, 'max_depth': 17, 'min_samples_split': 4}. Best is trial 1 with value: 0.8243967300305328.
[I 2024-12-06 12:23:02,498] Trial 9 finished with value: 0.8146163695459471 and parameters: {'n_estimators': 257, 'max_depth': 16, 'min_samples_split': 7}. Best is trial 1 with value: 0.8243967300305328.
[I 2024-12-06 12:23:03,015] Trial 10 finished with value: 0.8286319314488327 and parameters: {'n_estimators': 113, 'max_depth': 5, 'min_samples_split': 8}. Best is trial 10 with value: 0.8286319314488327.
[I 2024-12-06 12:23:03,486] Trial 11 finished with value: 0.8300305328474341 and parameters: {'n_estimators': 104, 'max_depth': 5, 'min_samples_split': 8}. Best is trial 11 with value: 0.8300305328474341.
[I 2024-12-06 12:23:04,031] Trial 12 finished with value: 0.8272234807446075 and parameters: {'n_estimators': 125, 'max_depth': 5, 'min_samples_split': 8}. Best is trial 11 with value: 0.8300305328474341.
[I 2024-12-06 12:23:06,285] Trial 13 finished with value: 0.8188023244361272 and parameters: {'n_estimators': 472, 'max_depth': 12, 'min_samples_split': 8}. Best is trial 11 with value: 0.8300305328474341.
[I 2024-12-06 12:23:06,891] Trial 14 finished with value: 0.8300206835418104 and parameters: {'n_estimators': 100, 'max_depth': 5, 'min_samples_split': 10}. Best is trial 11 with value: 0.8300305328474341.
[I 2024-12-06 12:23:08,100] Trial 15 finished with value: 0.8286220821432089 and parameters: {'n_estimators': 180, 'max_depth': 12, 'min_samples_split': 10}. Best is trial 11 with value: 0.8300305328474341.
[I 2024-12-06 12:23:09,151] Trial 16 finished with value: 0.8342361863488623 and parameters: {'n_estimators': 182, 'max_depth': 6, 'min_samples_split': 9}. Best is trial 16 with value: 0.8342361863488623.
[I 2024-12-06 12:23:10,213] Trial 17 finished with value: 0.8229882793263075 and parameters: {'n_estimators': 193, 'max_depth': 10, 'min_samples_split': 9}. Best is trial 16 with value: 0.8342361863488623.
[I 2024-12-06 12:23:10,979] Trial 18 finished with value: 0.8243967300305328 and parameters: {'n_estimators': 146, 'max_depth': 7, 'min_samples_split': 7}. Best is trial 16 with value: 0.8342361863488623.
[I 2024-12-06 12:23:11,925] Trial 19 finished with value: 0.8174037230375258 and parameters: {'n_estimators': 208, 'max_depth': 11, 'min_samples_split': 5}. Best is trial 16 with value: 0.8342361863488623.
[I 2024-12-06 12:23:13,503] Trial 20 finished with value: 0.8188023244361272 and parameters: {'n_estimators': 323, 'max_depth': 14, 'min_samples_split': 9}. Best is trial 16 with value: 0.8342361863488623.
[I 2024-12-06 12:23:14,056] Trial 21 finished with value: 0.8314291342460358 and parameters: {'n_estimators': 107, 'max_depth': 6, 'min_samples_split': 10}. Best is trial 16 with value: 0.8342361863488623.
[I 2024-12-06 12:23:14,708] Trial 22 finished with value: 0.828612232837585 and parameters: {'n_estimators': 153, 'max_depth': 7, 'min_samples_split': 9}. Best is trial 16 with value: 0.8342361863488623.
[I 2024-12-06 12:23:15,619] Trial 23 finished with value: 0.8328277356446371 and parameters: {'n_estimators': 215, 'max_depth': 6, 'min_samples_split': 10}. Best is trial 16 with value: 0.8342361863488623.
[I 2024-12-06 12:23:16,571] Trial 24 finished with value: 0.824386880724909 and parameters: {'n_estimators': 203, 'max_depth': 7, 'min_samples_split': 10}. Best is trial 16 with value: 0.8342361863488623.
[I 2024-12-06 12:23:17,630] Trial 25 finished with value: 0.8314192849404117 and parameters: {'n_estimators': 219, 'max_depth': 6, 'min_samples_split': 10}. Best is trial 16 with value: 0.8342361863488623.
[I 2024-12-06 12:23:18,305] Trial 26 finished with value: 0.8257854821235103 and parameters: {'n_estimators': 142, 'max_depth': 9, 'min_samples_split': 9}. Best is trial 16 with value: 0.8342361863488623.
[I 2024-12-06 12:23:19,751] Trial 27 finished with value: 0.817374175120654 and parameters: {'n_estimators': 310, 'max_depth': 8, 'min_samples_split': 7}. Best is trial 16 with value: 0.8342361863488623.
[I 2024-12-06 12:23:20,816] Trial 28 finished with value: 0.8300206835418104 and parameters: {'n_estimators': 229, 'max_depth': 6, 'min_samples_split': 10}. Best is trial 16 with value: 0.8342361863488623.
[I 2024-12-06 12:23:21,666] Trial 29 finished with value: 0.8201516793066089 and parameters: {'n_estimators': 169, 'max_depth': 10, 'min_samples_split': 10}. Best is trial 16 with value: 0.8342361863488623.
[I 2024-12-06 12:23:23,388] Trial 30 finished with value: 0.8257854821235103 and parameters: {'n_estimators': 289, 'max_depth': 8, 'min_samples_split': 9}. Best is trial 16 with value: 0.8342361863488623.
[I 2024-12-06 12:23:24,766] Trial 31 finished with value: 0.8314192849404117 and parameters: {'n_estimators': 216, 'max_depth': 6, 'min_samples_split': 10}. Best is trial 16 with value: 0.8342361863488623.
[I 2024-12-06 12:23:26,185] Trial 32 finished with value: 0.828612232837585 and parameters: {'n_estimators': 234, 'max_depth': 6, 'min_samples_split': 2}. Best is trial 16 with value: 0.8342361863488623.
[I 2024-12-06 12:23:27,420] Trial 33 finished with value: 0.832827735644637 and parameters: {'n_estimators': 185, 'max_depth': 6, 'min_samples_split': 9}. Best is trial 16 with value: 0.8342361863488623.
[I 2024-12-06 12:23:28,428] Trial 34 finished with value: 0.8286023835319611 and parameters: {'n_estimators': 131, 'max_depth': 8, 'min_samples_split': 9}. Best is trial 16 with value: 0.8342361863488623.
[I 2024-12-06 12:23:29,163] Trial 35 finished with value: 0.8187727765192554 and parameters: {'n_estimators': 179, 'max_depth': 9, 'min_samples_split': 8}. Best is trial 16 with value: 0.8342361863488623.
[I 2024-12-06 12:23:29,887] Trial 36 finished with value: 0.8286220821432089 and parameters: {'n_estimators': 162, 'max_depth': 6, 'min_samples_split': 10}. Best is trial 16 with value: 0.8342361863488623.
[I 2024-12-06 12:23:31,531] Trial 37 finished with value: 0.8243770314192849 and parameters: {'n_estimators': 286, 'max_depth': 7, 'min_samples_split': 7}. Best is trial 16 with value: 0.8342361863488623.
[I 2024-12-06 12:23:32,902] Trial 38 finished with value: 0.8188417216586231 and parameters: {'n_estimators': 187, 'max_depth': 14, 'min_samples_split': 5}. Best is trial 16 with value: 0.8342361863488623.
[I 2024-12-06 12:23:34,416] Trial 39 finished with value: 0.8243967300305328 and parameters: {'n_estimators': 244, 'max_depth': 10, 'min_samples_split': 9}. Best is trial 16 with value: 0.8342361863488623.
[I 2024-12-06 12:23:36,765] Trial 40 finished with value: 0.8201812272234807 and parameters: {'n_estimators': 423, 'max_depth': 8, 'min_samples_split': 8}. Best is trial 16 with value: 0.8342361863488623.
[I 2024-12-06 12:23:37,907] Trial 41 finished with value: 0.8328277356446371 and parameters: {'n_estimators': 214, 'max_depth': 6, 'min_samples_split': 10}. Best is trial 16 with value: 0.8342361863488623.
[I 2024-12-06 12:23:39,198] Trial 42 finished with value: 0.8229981286319316 and parameters: {'n_estimators': 266, 'max_depth': 5, 'min_samples_split': 10}. Best is trial 16 with value: 0.8342361863488623.
[I 2024-12-06 12:23:39,820] Trial 43 finished with value: 0.8342361863488623 and parameters: {'n_estimators': 133, 'max_depth': 6, 'min_samples_split': 9}. Best is trial 16 with value: 0.8342361863488623.
[I 2024-12-06 12:23:40,532] Trial 44 finished with value: 0.8314291342460356 and parameters: {'n_estimators': 164, 'max_depth': 7, 'min_samples_split': 9}. Best is trial 16 with value: 0.8342361863488623.
[I 2024-12-06 12:23:41,415] Trial 45 finished with value: 0.8216290751502019 and parameters: {'n_estimators': 195, 'max_depth': 20, 'min_samples_split': 9}. Best is trial 16 with value: 0.8342361863488623.
[I 2024-12-06 12:23:42,066] Trial 46 finished with value: 0.8258150300403819 and parameters: {'n_estimators': 129, 'max_depth': 5, 'min_samples_split': 8}. Best is trial 16 with value: 0.8342361863488623.
[I 2024-12-06 12:23:43,580] Trial 47 finished with value: 0.8229685807150597 and parameters: {'n_estimators': 337, 'max_depth': 8, 'min_samples_split': 9}. Best is trial 16 with value: 0.8342361863488623.
[I 2024-12-06 12:23:44,815] Trial 48 finished with value: 0.8187727765192554 and parameters: {'n_estimators': 243, 'max_depth': 9, 'min_samples_split': 8}. Best is trial 16 with value: 0.8342361863488623.
[I 2024-12-06 12:23:45,514] Trial 49 finished with value: 0.8243967300305328 and parameters: {'n_estimators': 143, 'max_depth': 5, 'min_samples_split': 10}. Best is trial 16 with value: 0.8342361863488623.
# Report the winning trial: its hyperparameters and the objective value
# (mean cross-validated accuracy) it achieved
best_trial_params = study.best_params
best_trial_score = study.best_value
print("Best Parameters:", best_trial_params)
print("Best Score:", best_trial_score)
Best Parameters: {'n_estimators': 182, 'max_depth': 6, 'min_samples_split': 9}
Best Score: 0.8342361863488623
# Refit a forest on the whole training split using the best hyperparameters
# found by the study (same fixed seed as during the search)
best_params = study.best_params
best_model = RandomForestClassifier(
    random_state=42,
    **best_params,
)
best_model.fit(X_train, y_train)
RandomForestClassifier(max_depth=6, min_samples_split=9, n_estimators=182,
random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier(max_depth=6, min_samples_split=9, n_estimators=182,
random_state=42)
# Evaluate on the test set
# Predict on the held-out split and score the predictions against the labels
y_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", test_accuracy)
Test Accuracy: 0.8156424581005587
Score: 15
Category: hyperparam-tuning