top of page

Exploratory Data Analysis with Sweetviz and Model Prediction Using Transportation Dataset

Import Necessary Packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Load the Training and Testing Data

# Load the pre-split train/test CSVs from Google Drive (Colab paths).
df_training = pd.read_csv("/content/drive/MyDrive/ml_projects/7_transportation_dataset_on_time_late_prediction/train_data.csv")

df_testing = pd.read_csv("/content/drive/MyDrive/ml_projects/7_transportation_dataset_on_time_late_prediction/test_data.csv")
# EDA via the sweetviz profiling library (third-party, Colab-installed).
# importing sweetviz
import sweetviz as sv
# Profile the training set: distributions, missingness, associations.
advert_report = sv.analyze(df_training)
# Write the interactive HTML report next to the data files.
advert_report.show_html('/content/drive/MyDrive/ml_projects/7_transportation_dataset_on_time_late_prediction/output.html')

Checking Shape of Training and Testing Datasets

# Confirm both CSVs share the same 61-column schema (see output below).
print(df_training.shape)
print(df_testing.shape)

output:

(9000, 61)
(1478, 61)

df = pd.concat(objs = [df_training, df_testing])



Checking Shape of Combined Training and Testing Dataset

df.shape

output:

(10478, 61)

# NOTE(review): `zip_codes` appears unused -- the later `zip_features`
# list (identical contents) is what the pipeline actually consumes.
zip_codes = ['ConsigneeZip', 'ShipperZip', 'DestZip', 'OriginZip']
df.head()  # eyeball the first rows of the combined frame

Output:


df.info()

Output:

















# Columns to remove outright: high-missing fields, free-text city
# names, and leakage-prone timing fields. The string label "target" is
# dropped here; its numeric copy "target_numerical" is kept as y.
features_to_be_dropped = ["RAD", "ShipperCountry", "ConsigneeCountry", "ConsigneeCity", 'DetailCity', 'DetailCodeDescription', 'ShipperCity',
'ActualDeliveryTime', 'ActualShipTime', 'AVTime', "target",
 'DestCity', 'OriginCity',  "Lane"]

# Raw datetime-string columns, later expanded into numeric parts by
# handle_date_time().
lst_of_dates = ["ActualShip", "AV",  'Goal', 'Goal2', 'EST_AV', "DetailDate", 'new_EST_AV', 'Final_EST_AV', "CreateDate", 

"ActualDelivery", "TargetShip_Early", "DetailCreateDate", "AV_CD"]

# Zip columns to coerce to numeric later (bad values become NaN).
zip_features = ["ConsigneeZip", "ShipperZip", "DestZip", "OriginZip"]

# Columns to one-hot encode.
categorical_variables = ["ShipmentType", "CarrierMode", "OnTimeShip", "OnTimeDelivery", "Status", "ShipperState", "DestState", "AS_dow", "new_EST_AV_dow", "Final_EST_AV_dow", "av_dow", 
"OriginCtry", "OriginState", "DestCtry", "DetailCode", "DetailState"]

df.columns  # full 61-column listing (see output below)

Output:

Index(['ActualShip', 'KEY_LOAD_TRACKING', 'CreateDate', 'ActualDelivery',
       'Carrier', 'ConsigneeCity', 'ConsigneeCountry', 'ConsigneeZip',
       'DetailCity', 'DetailState', 'DetailCode', 'DetailCodeDescription',
       'DetailCreateDate', 'DetailDate', 'ShipperCity', 'ShipperState',
       'ShipperZip', 'ShipperCountry', 'AV_CD', 'AV', 'PRO', 'DestCity',
       'DestState', 'DestCtry', 'DestZip', 'OriginCity', 'OriginState',
       'OriginCtry', 'OriginZip', 'Lane', 'ShipmentType', 'CarrierMode',
       'ActualTransitTime', 'DeliveryDays', 'ServiceDays', 'Weight',
       'CustomerDistance', 'RAD', 'DestName', 'Goal', 'Goal2', 'InvoiceCost',
       'Mileage', 'OnTimeShip', 'OnTimeDelivery', 'Quantity',
       'TargetShip_Early', 'Status', 'ActualDeliveryTime', 'ActualShipTime',
       'AVTime', 'av_dow', 'AS_dow', 'EST_AV', 'holiday_flag', 'new_EST_AV',
       'new_EST_AV_dow', 'Final_EST_AV', 'Final_EST_AV_dow', 'target',
       'target_numerical'],
      dtype='object')

# Report the cardinality of each categorical feature before one-hot
# encoding. (The original loop computed len(value_counts()) and then
# discarded the result -- a no-op; nunique() is the direct equivalent.)
for x in categorical_variables:
  print(x, df[x].nunique())
# Drop the high-missing / leakage-prone columns selected earlier.
df = df.drop(features_to_be_dropped, axis=1)
# Preview the dummy expansion of one categorical column (notebook display).
pd.get_dummies(df["Status"])

output:
















import gc  # hoisted: the original re-ran `import gc` on every loop iteration

# One-hot encode each categorical column: append the dummy columns,
# drop the source column, and free the temporary frame promptly to
# keep peak memory down (the frame grows to ~285 columns).
for x in categorical_variables:
  one_hot = pd.get_dummies(df[x], prefix = x)
  df = pd.concat([df, one_hot], axis=1)
  df = df.drop(x, axis = 1)
  del one_hot
  gc.collect()
# Zip codes arrive as mixed strings; coerce to numeric, NaN on failure.
df[zip_features] = df[zip_features].apply(pd.to_numeric, errors='coerce')
df.info()

output:

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10478 entries, 0 to 1477
Columns: 285 entries, ActualShip to DetailState_WY
dtypes: bool(1), float64(11), int64(6), object(13), uint8(254)
memory usage: 5.0+ MB
df

Output:


df
def handle_date_time(input_feature, frame=None):
  """Expand a datetime-string column into numeric component columns.

  Parses ``frame[input_feature]`` with the fixed format
  ``"%Y-%m-%d %H:%M:%S."``; values that do not match become NaT
  (``errors="coerce"``), so their component columns come out NaN.
  Writes ``<col>_year``, ``_month``, ``_day``, ``_hour``, ``_minute``
  and ``_dayofweek`` columns, then drops the original column in place.

  Parameters
  ----------
  input_feature : str
      Name of the datetime column to expand.
  frame : pandas.DataFrame, optional
      Frame to operate on. Defaults to the module-level ``df`` for
      backward compatibility with the original global-mutating version.
  """
  target = df if frame is None else frame
  # NOTE(review): the trailing "." in the format is suspicious --
  # strings without a literal trailing period parse to NaT; confirm
  # against the raw data before changing it.
  times = pd.to_datetime(target[input_feature],
                         format = "%Y-%m-%d %H:%M:%S.",
                         errors = "coerce")
  for part in ("year", "month", "day", "hour", "minute", "dayofweek"):
    target[input_feature + "_" + part] = getattr(times.dt, part)
  target.drop([input_feature], axis = 1, inplace=True)

# Report (rather than silently discard) the duplicate-row count.
print("duplicate rows:", df.duplicated().sum())
# Expand every raw datetime column into numeric component columns.
# (Removed the stray trailing comma after the call, which built a
# throwaway 1-tuple on every iteration.)
for x in lst_of_dates:
  print("in here for", x)
  handle_date_time(x)

Output:










df.columns

Output:

Index(['KEY_LOAD_TRACKING', 'Carrier', 'ConsigneeZip', 'ShipperZip', 'PRO',
       'DestZip', 'OriginZip', 'ActualTransitTime', 'DeliveryDays',
       'ServiceDays',
       ...
       'DetailCreateDate_day', 'DetailCreateDate_hour',
       'DetailCreateDate_minute', 'DetailCreateDate_dayofweek', 'AV_CD_year',
       'AV_CD_month', 'AV_CD_day', 'AV_CD_hour', 'AV_CD_minute',
       'AV_CD_dayofweek'],
      dtype='object', length=350)

df.info()

Output:

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10478 entries, 0 to 1477
Columns: 350 entries, KEY_LOAD_TRACKING to AV_CD_dayofweek
dtypes: bool(1), float64(23), int64(72), uint8(254)
memory usage: 10.2 MB
df.isnull().sum()

Output:












len(df)

output:

10478
# Re-separate the original partitions positionally: train_data.csv
# contributed the first 9000 rows (see the earlier shape check).
df_training = df[:9000]
df_testing = df[9000:]
# Report remaining NaNs per partition (the original computed these
# sums and discarded them).
print("train NaNs:", df_training.isnull().sum().sum())
print("test NaNs:", df_testing.isnull().sum().sum())

# reset_index(drop=True) replaces the original reset_index() followed
# by `del frame['index']` -- same result, no temporary column.
df_training = df_training.reset_index(drop=True)
df_testing = df_testing.reset_index(drop=True)

# Features / label split; "target_numerical" is the binary label.
X_training = df_training.drop(["target_numerical"], axis = 1)
y_training = df_training["target_numerical"]
X_testing = df_testing.drop(["target_numerical"], axis = 1)
y_testing = df_testing["target_numerical"]
# Shape sanity checks (previously computed and discarded).
print(len(X_training.iloc[0]), len(y_training))
print(len(X_testing.iloc[0]), len(y_testing))
zip_features

output:

['ConsigneeZip', 'ShipperZip', 'DestZip', 'OriginZip']



Model Building

Logistic Regression

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Grid-search an L2-regularised logistic regression over the
# regularisation strength C and the iteration budget. (Other
# penalties/solvers were tried earlier and narrowed to l2 + lbfgs.)
search_space = [
    {
        'penalty': ['l2'],
        'C': np.logspace(-4, 4, 20),
        'solver': ['lbfgs'],
        'max_iter': [100, 1000, 2500, 5000],
    }
]

logreg_search = GridSearchCV(LogisticRegression(), param_grid=search_space,
                             cv=3, verbose=True, n_jobs=-1)
# GridSearchCV.fit returns the fitted search object itself.
best_clf = logreg_search.fit(X_training, y_training)

Output:

Fitting 3 folds for each of 80 candidates, totalling 240 fits


best_clf.best_estimator_

output:

LogisticRegression(C=0.0001)


print (f'Accuracy - : {best_clf.score(X_testing,y_testing):.3f}')

Output:

Accuracy - : 0.699
predictions = best_clf.predict(X_testing)
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
# Print the standard binary-classification metric suite, one per line,
# in the same order as before: accuracy, recall, precision, F1.
for metric in (accuracy_score, recall_score, precision_score, f1_score):
  print(metric(y_testing, predictions))

Output:

0.6989174560216509

1.0

0.6989174560216509

0.8227797690163282



from sklearn.metrics import confusion_matrix, classification_report
def plot_confusion_matrix(cf_matrix):
  """Render *cf_matrix* as an annotated heatmap and show it.

  Axis tick labels are hard-coded to the binary False/True classes,
  which must appear in alphabetical order to match the matrix layout.
  """
  # `sns` and `plt` come from the module-level imports at the top of
  # the file (the redundant in-function seaborn import was dropped).
  axes = sns.heatmap(cf_matrix, annot=True, cmap='Blues')
  axes.set_title('Seaborn Confusion Matrix with labels\n\n')
  axes.set_xlabel('\nPredicted Values')
  axes.set_ylabel('Actual Values ')
  axes.xaxis.set_ticklabels(['False','True'])
  axes.yaxis.set_ticklabels(['False','True'])
  plt.show()
# Confusion matrix layout: [[TN, FP], [FN, TP]]. Here the first column
# is all zeros -- every row was predicted positive.
cf_matrix = confusion_matrix(y_testing, predictions)

print(cf_matrix)

Output:

[[ 0 445]

[ 0 1033]]


plot_confusion_matrix(cf_matrix)

Output:













RandomForest

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# Randomised search over the forest size and the split/leaf
# sample-count constraints (10 sampled candidates, 5-fold CV).
random_grid = {
    'n_estimators': [5, 20, 50, 100],   # number of trees in the forest
    'min_samples_split': [2, 6, 10],    # min samples required to split a node
    'min_samples_leaf': [1, 3, 4],      # min samples allowed in a leaf
}

rf_random = RandomizedSearchCV(estimator=RandomForestClassifier(),
                               param_distributions=random_grid,
                               cv=5, verbose=100, random_state=35,
                               n_jobs=-1)
rf_random.fit(X_training, y_training)

Output:

Fitting 5 folds for each of 10 candidates, totalling 50 fits


RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1, param_distributions={'min_samples_leaf': [1, 3, 4], 'min_samples_split': [2, 6, 10], 'n_estimators': [5, 20, 50, 100]}, random_state=35, verbose=100)


# Show the searched space and the best candidate it found.
print ('Random grid: ', random_grid, '\n')
# print the best parameters
print ('Best Parameters: ', rf_random.best_params_, ' \n')

Output:

Random grid:  {'n_estimators': [5, 20, 50, 100], 'min_samples_split': [2, 6, 10], 'min_samples_leaf': [1, 3, 4]} 

Best Parameters:  {'n_estimators': 20, 'min_samples_split': 6, 'min_samples_leaf': 3}  


# NOTE(review): these hyper-parameters (n_estimators=100,
# min_samples_leaf=4, plus max_depth=120, bootstrap=False) do NOT
# match the best_params_ printed above (20/6/3), and `randmf` is never
# used afterwards -- the predictions below come from `rf_random`.
# Confirm whether this manual refit is intentional or leftover.
randmf = RandomForestClassifier(n_estimators = 100, min_samples_split = 6, min_samples_leaf= 4, max_features = 'sqrt', max_depth= 120, bootstrap=False) 
randmf.fit( X_training, y_training) 

Output:

RandomForestClassifier(bootstrap=False, max_depth=120, max_features='sqrt', min_samples_leaf=4, min_samples_split=6)



# Predictions come from the tuned search object, not `randmf`.
predictions = rf_random.predict(X_testing)
predictions

Output:

array([1, 1, 1, ..., 0, 1, 0])

print (f'Accuracy - : {rf_random.score(X_testing,y_testing):.3f}')

Output:

Accuracy - : 0.806



# Same metric suite as for logistic regression, printed in the same
# order for comparison: accuracy, recall, precision, F1.
for metric in (accuracy_score, recall_score, precision_score, f1_score):
  print(metric(y_testing, predictions))

Output:

0.8064952638700947

0.9341723136495643

0.8157227387996618

0.8709386281588447



# RF confusion matrix: unlike the logistic model, both classes are
# actually predicted (227 true negatives).
cf_matrix = confusion_matrix(y_testing, predictions)

print(cf_matrix)

Output:

[[227 218]

[ 68 965]]



plot_confusion_matrix(cf_matrix)

Output:














SVM

from sklearn.svm import SVC

from sklearn.model_selection import GridSearchCV
 
# Small RBF-kernel grid over C and gamma.
param_grid = {'C': [0.1, 1],
              'gamma': [0.1, 0.001],
              'kernel': ['rbf']}
 
grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3, cv = 3)
 
# NOTE(review): the features are not scaled before the RBF SVM. Every
# CV fold scores an identical 0.810 (see output) and the test metrics
# below match the all-positive baseline -- consider wrapping SVC in a
# pipeline with StandardScaler.
grid.fit(X_training, y_training)

Output:

Fitting 3 folds for each of 4 candidates, totalling 12 fits [CV 1/3] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.810 total time= 13.3s [CV 2/3] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.810 total time= 12.3s [CV 3/3] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.810 total time= 13.6s [CV 1/3] END ....C=0.1, gamma=0.001, kernel=rbf;, score=0.810 total time= 14.7s [CV 2/3] END ....C=0.1, gamma=0.001, kernel=rbf;, score=0.810 total time= 12.6s [CV 3/3] END ....C=0.1, gamma=0.001, kernel=rbf;, score=0.810 total time= 12.8s [CV 1/3] END ........C=1, gamma=0.1, kernel=rbf;, score=0.810 total time= 12.6s [CV 2/3] END ........C=1, gamma=0.1, kernel=rbf;, score=0.810 total time= 12.7s [CV 3/3] END ........C=1, gamma=0.1, kernel=rbf;, score=0.810 total time= 12.7s [CV 1/3] END ......C=1, gamma=0.001, kernel=rbf;, score=0.810 total time= 13.1s [CV 2/3] END ......C=1, gamma=0.001, kernel=rbf;, score=0.810 total time= 13.0s [CV 3/3] END ......C=1, gamma=0.001, kernel=rbf;, score=0.810 total time= 12.9s


GridSearchCV(cv=3, estimator=SVC(), param_grid={'C': [0.1, 1], 'gamma': [0.1, 0.001], 'kernel': ['rbf']}, verbose=3)



# Best hyper-parameters found by the grid search.
print(grid.best_params_)
 
# The estimator refitted on those parameters (refit=True above).
print(grid.best_estimator_)

Output:

{'C': 0.1, 'gamma': 0.1, 'kernel': 'rbf'} SVC(C=0.1, gamma=0.1)



predictions = grid.predict(X_testing)
# Metric suite for the tuned SVM, same order as the other models:
# accuracy, recall, precision, F1.
for metric in (accuracy_score, recall_score, precision_score, f1_score):
  print(metric(y_testing, predictions))

Output:

0.6989174560216509 1.0 0.6989174560216509 0.8227797690163282



# SVM confusion matrix -- identical to the logistic-regression one:
# every row predicted positive (majority class).
cf_matrix = confusion_matrix(y_testing, predictions)

print(cf_matrix)

Output:

[[   0  445]
 [   0 1033]]
plot_confusion_matrix(cf_matrix)

Output:





bottom of page