
Decision Tree Regressor and Support Vector Regression with K-fold Cross-Validation

Here we implement two data mining techniques, Decision Tree Regression and Support Vector Regression (plus a K-NN model for comparison), on the Boston house pricing dataset. Below are the basic steps used.


Data Mining Basics

1) Data pre-processing: categorical feature transformation, numerical feature normalization, missing-value imputation, and cross-validation (a short sketch of these steps follows below).
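
A minimal sketch of what these steps look like in scikit-learn, using a small hypothetical frame rather than the Boston data itself:

#A minimal pre-processing sketch on a hypothetical frame (not the Boston data)
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer

toy = pd.DataFrame({'city': ['A', 'B', 'B', 'A'],        # categorical feature
                    'rooms': [3.0, None, 4.0, 5.0]})     # numerical feature with a gap

#Categorical feature transformation: one-hot encode 'city'
city_encoded = OneHotEncoder().fit_transform(toy[['city']]).toarray()

#Missing-value imputation: fill the numeric gap with the median
rooms_filled = SimpleImputer(strategy='median').fit_transform(toy[['rooms']])

#Numerical feature normalization: scale to [0, 1]
rooms_scaled = MinMaxScaler().fit_transform(rooms_filled)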


Techniques

1) Decision Tree Regression

2) Support Vector Regression

3) K-NN prediction modelling

4) K-means clustering

5) Naïve Bayes


Import Libraries

#import libraries
import pandas as pd
import numpy as np
import itertools
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import mean_absolute_error, make_scorer, confusion_matrix
import sklearn.metrics as metrics

Read CSV

#Read Dataset
df = pd.read_csv("Boston Real Est.csv")
df

Output:

[DataFrame preview: 511 rows × 13 columns of the Boston housing data]

Checking Missing Values

#Checking null values
#Visualize to check for null values
check_null_value = df.isnull()
sns.heatmap(check_null_value,yticklabels=False,cbar=False,cmap='viridis')

Output:

[Heatmap of null values: the RM column shows missing entries]

Show Dataset Columns

#show data frame all columns
df.columns

Output:

Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'LSTAT', 'MEDV'], dtype='object')


In the above heatmap we can see there are two missing values in RM. Now we need to fill them using the median.


Fill Missing Values with the Median

#Fill missing RM values with the column median
df['RM'] = df['RM'].fillna(df['RM'].median())
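
An equivalent alternative (a sketch, not what was run here) is scikit-learn's SimpleImputer, which is convenient when several columns need the same treatment:

#Alternative: median imputation via scikit-learn's SimpleImputer (equivalent result)
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')
df[['RM']] = imputer.fit_transform(df[['RM']])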

Checking Missing Values Again

#Check again for NaN values
#Visualize to check for null values
check_null_value = df.isnull()
sns.heatmap(check_null_value,yticklabels=False,cbar=False,cmap='viridis')

Output:

[Heatmap of null values: no missing entries remain]

In the above heatmap we can see there are no missing values; all have been replaced by the median.


Dataset Shape

#checking shape of dataset
df.shape

Output:

(511, 13)


Dataset Information

#checking dataset information
df.info()

Output:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 511 entries, 0 to 510
Data columns (total 13 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     511 non-null    float64
 1   ZN       511 non-null    float64
 2   INDUS    511 non-null    float64
 3   CHAS     511 non-null    int64  
 4   NOX      511 non-null    float64
 5   RM       511 non-null    float64
 6   AGE      511 non-null    float64
 7   DIS      511 non-null    float64
 8   RAD      511 non-null    int64  
 9   TAX      511 non-null    int64  
 10  PTRATIO  511 non-null    float64
 11  LSTAT    511 non-null    float64
 12  MEDV     511 non-null    float64
dtypes: float64(10), int64(3)
memory usage: 52.0 KB


Summary Statistics

#checking summary of dataset
df.describe()

Output:

[Summary statistics table from df.describe()]

Separate Features and Target

#Dividing the target and feature variables
X = df.drop('MEDV', axis = 1)
y = df['MEDV']

Normalize Features

from sklearn import preprocessing
#Scale each feature to the [0, 1] range
min_max_scaler = preprocessing.MinMaxScaler()
X_scaled = min_max_scaler.fit_transform(X)
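
One caveat: fitting the scaler on the full dataset before splitting lets the test rows influence the scaling parameters. A leakage-safe variant (a sketch, not what this notebook runs) fits the scaler on the training split only:

#Leakage-safe variant: fit the scaler on the training rows only
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=2)
scaler = preprocessing.MinMaxScaler().fit(X_tr)   # learn min/max from training rows
X_tr_scaled = scaler.transform(X_tr)
X_te_scaled = scaler.transform(X_te)              # reuse the same scaling on test rows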


Split Dataset

#split
# Import 'train_test_split' and 'cross_val_score'
from sklearn.model_selection import train_test_split, cross_val_score

# Shuffle and split the data into training and testing subsets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=2)

# Success
print("Training and testing split was successful.")

K-fold Cross Validation

from sklearn.model_selection import KFold
kf = KFold(n_splits=10)   # 10 consecutive folds (no shuffling)
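
For intuition, cross_val_score (used below) is roughly equivalent to looping over the folds by hand; here is a sketch with an arbitrary regressor (LinearRegression, purely for illustration):

#Roughly what cross_val_score does: fit on 9 folds, score on the held-out fold
from sklearn.linear_model import LinearRegression
fold_scores = []
for tr_idx, te_idx in kf.split(X_train):
    model = LinearRegression().fit(X_train[tr_idx], y_train.iloc[tr_idx])
    fold_scores.append(model.score(X_train[te_idx], y_train.iloc[te_idx]))  # R^2 per fold
print("Mean R^2: %0.2f" % np.mean(fold_scores))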


SVR (Support Vector Regression)

from sklearn.svm import SVR
#RBF-kernel SVR with hand-picked C and gamma
svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1)
svr_rbf.fit(X_train, y_train)
#Note: for regressors, cross_val_score's default score is R^2, not classification accuracy
scores = cross_val_score(svr_rbf, X_train, y_train, cv=kf)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))

Output:

Accuracy: 0.77 (+/- 0.12)
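
The score above is cross-validated on the training split only; the 20% test split created earlier is still untouched. A short sketch of scoring the fitted SVR on it (values will differ from the cross-validation figures):

#Evaluate the fitted SVR on the held-out test set
from sklearn.metrics import mean_absolute_error
y_pred = svr_rbf.predict(X_test)
print("Test R^2: %0.2f" % svr_rbf.score(X_test, y_test))
print("Test MAE: %0.2f" % mean_absolute_error(y_test, y_pred))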

Decision Tree Regressor

from sklearn.tree import DecisionTreeRegressor
#Shallow tree (max_depth=3) to limit overfitting
desc_tr = DecisionTreeRegressor(max_depth=3)
desc_tr.fit(X_train, y_train)
scores = cross_val_score(desc_tr, X_train, y_train, cv=kf)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))

Output:

Accuracy: 0.62 (+/- 0.12)
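
The depth of 3 was fixed by hand; one simple way to choose it is to compare cross-validated scores over a small range of depths (a sketch):

#Compare cross-validated R^2 across candidate tree depths
for depth in range(2, 8):
    tree = DecisionTreeRegressor(max_depth=depth)
    depth_scores = cross_val_score(tree, X_train, y_train, cv=kf)
    print("max_depth=%d: %0.2f (+/- %0.2f)" % (depth, depth_scores.mean(), depth_scores.std()))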

Draw the Decision Tree

!pip install graphviz
!pip install pydotplus
#draw the fitted regression tree
from io import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus
feature_cols = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'LSTAT']
dot_data = StringIO()
#class_names is omitted: it applies to classifiers, not regressors
export_graphviz(desc_tr, out_file=dot_data,
                filled=True, rounded=True,
                special_characters=True, feature_names=feature_cols)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('boston.png')
Image(graph.create_png())

Output:

[Rendered decision tree diagram, saved as boston.png]

KNN

from sklearn.neighbors import KNeighborsRegressor
#K-NN regressor with a hand-picked k of 3
knn = KNeighborsRegressor(n_neighbors=3)
knn.fit(X_train, y_train)
scores = cross_val_score(knn, X_train, y_train, cv=kf)
print("KNN Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))

Output:

KNN Accuracy: 0.64 (+/- 0.12)
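
As with the tree depth, n_neighbors=3 is a hand-picked value; a quick sweep over k (a sketch) shows how sensitive the score is to that choice:

#Sweep over k to see how the cross-validated score varies
for k in range(1, 11):
    knn_k = KNeighborsRegressor(n_neighbors=k)
    k_scores = cross_val_score(knn_k, X_train, y_train, cv=kf)
    print("k=%d: %0.2f (+/- %0.2f)" % (k, k_scores.mean(), k_scores.std()))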
