In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

%matplotlib inline

## Scikit Pipeline Demo - organizing data processing workflow

In [2]:
X, y = make_classification(n_samples=100, n_features=20, n_classes=2)

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, train_size=0.75)

In [4]:
pipeline = Pipeline([
    ('scaler', StandardScaler(with_mean=True, with_std=True)),
    ('svc', SVC(C=1.0, kernel='rbf'))
])

pipeline = pipeline.fit(X_train, y_train)

score_train = pipeline.score(X_train, y_train)
score_test = pipeline.score(X_test, y_test)

print(score_train)
print(score_test)

1.0
0.76


In [5]:
pipeline = Pipeline([
    ('scaler', StandardScaler(with_mean=True, with_std=True)),
    ('svc', SVC())
])

pipeline = pipeline.set_params(svc__C=0.5)
pipeline = pipeline.set_params(svc__kernel='linear')

pipeline = pipeline.fit(X_train, y_train)

score_train = pipeline.score(X_train, y_train)
score_test = pipeline.score(X_test, y_test)

print(score_train)
print(score_test)

0.9466666666666667
0.88


## Scikit GridSearch Demo - optimizing hyperparameters with a simple grid search

In [6]:
gs_parameters = {
    'svc__C': [0.25, 0.50, 0.75, 1, 5, 10, 15, 20, 25],
    'svc__kernel': ('linear', 'rbf')
}

gs = GridSearchCV(pipeline, gs_parameters)

gs.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('svc', SVC(C=0.5, kernel='linear'))]),
             param_grid={'svc__C': [0.25, 0.5, 0.75, 1, 5, 10, 15, 20, 25],
                         'svc__kernel': ('linear', 'rbf')})

In [7]:
gs.best_params_

{'svc__C': 0.25, 'svc__kernel': 'linear'}

In [8]:
gs.best_estimator_

Pipeline(steps=[('scaler', StandardScaler()),
                ('svc', SVC(C=0.25, kernel='linear'))])

In [9]:
best_model = gs.best_estimator_

best_model = best_model.fit(X_train, y_train)

score_train = best_model.score(X_train, y_train)
score_test = best_model.score(X_test, y_test)

print(score_train)
print(score_test)

0.96
0.92


In [10]:
gs.cv_results_

{'mean_fit_time': array([0.0007431 , 0.0006875 , 0.00069399, 0.00069633, 0.00072637,
        0.00069613, 0.00073376, 0.00071063, 0.00078444, 0.00071974,
        0.00077457, 0.00075827, 0.00083036, 0.00071406, 0.00082779,
        0.00071979, 0.00082173, 0.00072775]),
 'std_fit_time': array([6.31983571e-05, 1.64122058e-05, 2.80497812e-05, 1.36500415e-05,
        4.10511101e-05, 1.59575961e-05, 5.09740037e-05, 6.83326115e-06,
        9.76290050e-05, 1.70674401e-05, 5.22756644e-05, 7.81114865e-05,
        1.29716483e-04, 1.95381080e-05, 1.25800813e-04, 1.36220284e-05,
        1.24708698e-04, 2.32624215e-05]),
 'mean_score_time': array([0.00024657, 0.0002728 , 0.00025783, 0.00026727, 0.00024786,
        0.00026088, 0.00024991, 0.00026245, 0.00024228, 0.00026612,
        0.00025239, 0.00026498, 0.0002521 , 0.0002737 , 0.00024614,
        0.00026884, 0.00025859, 0.0002717 ]),
 'std_score_time': array([8.86891477e-06, 1.54626167e-05, 1.58562516e-05, 1.50585446e-05,
        1.35589549e-05, 1.72

### Assignments:

1. Train all the models on the train dataset (`Pipeline.fit()`), choose the model with the best parameters setting using the validation dataset (`GridSearchCV.fit()`), and finally test the best model on the test dataset (`GridSearchCV.best_estimator_.fit()`). It requires refactoring the part of code with the `train_test_split()` function (it should be run twice: first, for splitting the entire dataset into (temporary) train and test datasets, and second, for splitting the (temporary) train dataset into train and validation datasets).

2. Try to use your own scoring function in the grid search (`GridSearchCV(scoring=my_own_scoring)`). As `Pipeline.score()` uses the `score()` method of the final estimator in the pipeline, being `SVC.score()` in our example, it computes the mean accuracy of the classifier. Our dataset may be imbalanced, i.e. one class may occur more frequently than the other, so it would be better to replace the mean accuracy with the F1 score.

## Optuna Demo - optimizing hyperparameters with an optimization framework

In [11]:
# pip install optuna
# conda install conda-forge::optuna

import optuna

In [12]:
def objective_function(trial):
    x = trial.suggest_float('x', -np.pi, np.pi)
    return (np.sin(x) - 0.25 * np.pi) ** 2

study = optuna.create_study()

study.optimize(objective_function, n_trials=100)

study.best_params

[32m[I 2025-02-13 19:59:47,099][0m A new study created in memory with name: no-name-53270093-68b8-427b-942c-7af3fb21076c[0m
[32m[I 2025-02-13 19:59:47,101][0m Trial 0 finished with value: 2.442209960716248 and parameters: {'x': -2.251136138122833}. Best is trial 0 with value: 2.442209960716248.[0m
[32m[I 2025-02-13 19:59:47,101][0m Trial 1 finished with value: 0.8840172242117759 and parameters: {'x': -0.15544903774629404}. Best is trial 1 with value: 0.8840172242117759.[0m
[32m[I 2025-02-13 19:59:47,102][0m Trial 2 finished with value: 0.5690662809736513 and parameters: {'x': 0.031038327213768202}. Best is trial 2 with value: 0.5690662809736513.[0m
[32m[I 2025-02-13 19:59:47,103][0m Trial 3 finished with value: 3.144941166427002 and parameters: {'x': -1.7258707798212531}. Best is trial 2 with value: 0.5690662809736513.[0m
[32m[I 2025-02-13 19:59:47,103][0m Trial 4 finished with value: 2.762684822531169 and parameters: {'x': -2.072562490929962}. Best is trial 2 with val

{'x': 0.9055632385366408}

In [13]:
def objective_function(trial):
    svc_C = trial.suggest_float('svc__C', 0, 25)
    svc_kernel = trial.suggest_categorical('svc__kernel', ['linear', 'rbf'])

    pipeline = Pipeline([
        ('scaler', StandardScaler(with_mean=True, with_std=True)),
        ('svc', SVC())
    ])

    pipeline = pipeline.set_params(svc__C=svc_C)
    pipeline = pipeline.set_params(svc__kernel=svc_kernel)

    pipeline = pipeline.fit(X_train, y_train)

    score_test = pipeline.score(X_test, y_test)

    return score_test

study = optuna.create_study(direction='maximize')

study.optimize(objective_function, n_trials=100)

study.best_params

[32m[I 2025-02-13 19:59:47,335][0m A new study created in memory with name: no-name-b3297593-9335-4197-bf97-eb3a54514cb0[0m
[32m[I 2025-02-13 19:59:47,338][0m Trial 0 finished with value: 0.88 and parameters: {'svc__C': 5.323579309369189, 'svc__kernel': 'linear'}. Best is trial 0 with value: 0.88.[0m
[32m[I 2025-02-13 19:59:47,340][0m Trial 1 finished with value: 0.88 and parameters: {'svc__C': 3.5098561831395676, 'svc__kernel': 'linear'}. Best is trial 0 with value: 0.88.[0m
[32m[I 2025-02-13 19:59:47,342][0m Trial 2 finished with value: 0.88 and parameters: {'svc__C': 23.063763663898822, 'svc__kernel': 'linear'}. Best is trial 0 with value: 0.88.[0m
[32m[I 2025-02-13 19:59:47,344][0m Trial 3 finished with value: 0.88 and parameters: {'svc__C': 7.5086287363431925, 'svc__kernel': 'linear'}. Best is trial 0 with value: 0.88.[0m
[32m[I 2025-02-13 19:59:47,347][0m Trial 4 finished with value: 0.88 and parameters: {'svc__C': 14.15566399202938, 'svc__kernel': 'linear'}. Bes

{'svc__C': 0.038870767685469064, 'svc__kernel': 'linear'}

In [14]:
# svc_C = study.best_params['svc__C']
# svc_kernel = study.best_params['svc__kernel']

# best_model = Pipeline([
#         ('scaler', StandardScaler(with_mean=True, with_std=True)),
#         ('svc', SVC(C=svc_C, kernel=svc_kernel))
#     ])

best_model = Pipeline([
        ('scaler', StandardScaler(with_mean=True, with_std=True)),
        ('svc', SVC())
    ])

best_model.set_params(**study.best_params)

best_model = best_model.fit(X_train, y_train)

score_train = best_model.score(X_train, y_train)
score_test = best_model.score(X_test, y_test)

print(score_train)
print(score_test)

0.9333333333333333
0.96
