
Classification with the Iris Dataset: Hyperparameter Tuning

Requirement

Using scikit-learn’s KFold class and cross_val_score function, determine the optimal value of k for a KNeighborsClassifier on the Iris samples.


Import Related Libraries

import numpy as np    # linear algebra
import pandas as pd   # data processing, CSV file I/O (e.g. pd.read_csv)

# load the Iris dataset bundled with scikit-learn
from sklearn import datasets
iris = datasets.load_iris()
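
As a quick sanity check (not part of the original notebook), the Bunch returned by load_iris exposes the feature matrix, feature names, and class labels:

print(iris.data.shape)       # (150, 4): 150 samples, 4 features
print(iris.feature_names)    # sepal/petal length and width, in cm
print(iris.target_names)     # ['setosa' 'versicolor' 'virginica']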


DECISION TREE

Let's compare the performance of the two available split criteria, 'gini' and 'entropy'.


from sklearn import tree        #importing decision tree subpackage from sklearn
from sklearn.pipeline import make_pipeline
from sklearn import preprocessing
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

tree_score_table = pd.DataFrame(columns=['Criterion', 'Accuracy'])    #creating table to store accuracy results
tree_crit = ['gini', 'entropy']             #criteria to be tested
tree_score_table['Criterion'] = tree_crit   #fill column 'Criterion' with tree_crit

cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)        #determining how to split train and test data

for j, crit in enumerate(tree_crit):     #loop over criteria
    tree_model = make_pipeline(preprocessing.StandardScaler(), tree.DecisionTreeClassifier(criterion=crit))       #scale features, then fit the tree
    tree_score_table.iloc[j, 1] = cross_val_score(tree_model, iris.data, iris.target, cv=cv).mean()        #mean cross-validated accuracy
    
print(tree_score_table)

Output:

  Criterion  Accuracy
0      gini  0.942222
1   entropy  0.942222

Well, a decision tree has many more parameters to optimize. Let's tune its hyperparameters with an exhaustive grid search using GridSearchCV.

from sklearn.model_selection import GridSearchCV
criteria = ["gini", "entropy"]              #criteria to be tested
min_sample_split_range = [2, 10, 20]        #min samples split to be tested
max_depth_range = [None, 2, 5, 10]          #max depth to be tested
min_samples_leaf_range = [1, 5, 10]         #min samples in the leaf to be tested
max_leaf_nodes_range = [None, 5, 10, 20]    #max leaf nodes to be tested

param_grid = {"criterion": criteria,
              "min_samples_split": min_sample_split_range,
              "max_depth": max_depth_range,
              "min_samples_leaf": min_samples_leaf_range,
              "max_leaf_nodes": min_leaf_nodes_range
                }

grid = GridSearchCV(estimator=tree.DecisionTreeClassifier(), 
                    param_grid=param_grid, 
                    cv = 5, 
                    scoring='accuracy', 
                    refit=True)     #setting grid with estimator

tree_model = make_pipeline(preprocessing.StandardScaler(), grid)    #scale features, then run the grid search
tree_model.fit(iris.data, iris.target)      #fitting the pipeline on the full dataset

print("Accuracy of the tuned model: %.4f" %grid.best_score_)
print(grid.best_params_)

Output:

Accuracy of the tuned model: 0.9733
{'criterion': 'gini', 'max_depth': None, 'max_leaf_nodes': 5, 'min_samples_leaf': 1, 'min_samples_split': 10}
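
One caveat: in the pipeline above, the StandardScaler is fit on the full dataset before GridSearchCV runs its internal cross-validation, so a little information leaks between folds. A leakage-free variant (a sketch, not the original code) puts the whole pipeline inside the grid search and prefixes each parameter with its step name:

pipe = make_pipeline(preprocessing.StandardScaler(), tree.DecisionTreeClassifier())
param_grid = {"decisiontreeclassifier__criterion": criteria,
              "decisiontreeclassifier__max_depth": max_depth_range,
              "decisiontreeclassifier__min_samples_split": min_sample_split_range}
grid = GridSearchCV(estimator=pipe, param_grid=param_grid,
                    cv=5, scoring='accuracy', refit=True)
grid.fit(iris.data, iris.target)    #the scaler is now refit inside every CV split

On a dataset this small the difference is usually negligible, which is why both forms appear in practice.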

Let's repeat the process for SVM.

from sklearn import svm
kernel_types = ["linear", "poly", "rbf", "sigmoid"]     #types of kernels to be tested
C_range = [0.01, 0.1, 1, 10, 100, 1000]                 #range of C to be tested
degree_range = [1, 2, 3, 4, 5, 6]                       #degrees to be tested

param_grid = {"kernel": kernel_types,
              "C": C_range,
              "degree": degree_range,
              }         #setting grid of parameters

grid = GridSearchCV(estimator = svm.SVC(), 
                    param_grid = param_grid, 
                    cv = 5, 
                    scoring = 'accuracy', 
                    refit = True)   #setting grid with estimator

svm_model = make_pipeline(preprocessing.StandardScaler(), grid)     #scale features, then run the grid search
svm_model.fit(iris.data, iris.target)       #fitting the pipeline on the full dataset

print("Accuracy of the tuned model: %.4f" %grid.best_score_)
print(grid.best_params_)

Output:

Accuracy of the tuned model: 0.9733
{'C': 10, 'degree': 1, 'kernel': 'rbf'}
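
Note that degree only affects the 'poly' kernel, so the grid above re-tests the same degree values pointlessly for the other three kernels. GridSearchCV also accepts a list of parameter grids, which lets us restrict degree to the polynomial kernel (a sketch under the same settings):

param_grid = [{"kernel": ["poly"], "C": C_range, "degree": degree_range},
              {"kernel": ["linear", "rbf", "sigmoid"], "C": C_range}]

grid = GridSearchCV(estimator=svm.SVC(), param_grid=param_grid,
                    cv=5, scoring='accuracy', refit=True)    #54 candidate settings instead of 144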


And finally, for KNN.

from sklearn.neighbors import KNeighborsClassifier
weight_functions = ["uniform", "distance"]    #weighting schemes to be tested
p_values = [1, 2]                             #Minkowski power: 1 = Manhattan, 2 = Euclidean
n_range = list(range(1, 51))                  #values of k to be tested

param_grid = {"n_neighbors": n_range,
              "weights": weight_functions,
              "p": p_values
              }         #setting grid of parameters

grid = GridSearchCV(estimator = KNeighborsClassifier(), 
                    param_grid = param_grid, 
                    cv = 5, 
                    scoring = 'accuracy', 
                    refit = True)   #setting grid with estimator

knn_model = make_pipeline(preprocessing.StandardScaler(), grid)     #scale features, then run the grid search
knn_model.fit(iris.data, iris.target)       #fitting the pipeline on the full dataset

print("Accuracy of the tuned model: %.4f" %grid.best_score_)
print(grid.best_params_)

Output:

Accuracy of the tuned model: 0.9667
{'n_neighbors': 6, 'p': 2, 'weights': 'uniform'}
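
The original requirement asked for the KFold class and cross_val_score function specifically. A minimal sketch that answers it directly, sweeping the same k range (note the shuffle: the Iris targets are sorted by class, so unshuffled folds would be badly unbalanced):

from sklearn.model_selection import KFold, cross_val_score

kf = KFold(n_splits=5, shuffle=True, random_state=0)
k_scores = []
for k in n_range:    #k from 1 to 50, as above
    knn = make_pipeline(preprocessing.StandardScaler(), KNeighborsClassifier(n_neighbors=k))
    k_scores.append(cross_val_score(knn, iris.data, iris.target, cv=kf).mean())

best_k = n_range[int(np.argmax(k_scores))]
print("Best k: %d (accuracy: %.4f)" % (best_k, max(k_scores)))

This should land close to the grid search result for the default 'uniform', p=2 settings, though the exact winner can shift with the fold split.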