Import Necessary Packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
Preprocess Training Data
df_training = pd.read_csv("/content/drive/MyDrive/ml_projects/7_transportation_dataset_on_time_late_prediction/train_data.csv")
df_testing = pd.read_csv("/content/drive/MyDrive/ml_projects/7_transportation_dataset_on_time_late_prediction/test_data.csv")
# Perform automated EDA with Sweetviz, a popular profiling library
import sweetviz as sv
# Analyze the training dataset
advert_report = sv.analyze(df_training)
# Save the report as an HTML file
advert_report.show_html('/content/drive/MyDrive/ml_projects/7_transportation_dataset_on_time_late_prediction/output.html')
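If you are working inside the notebook itself, Sweetviz can also render the report inline; a small optional sketch, not part of the original run:
# Optional: render the Sweetviz report inline instead of (or in addition to) the HTML file
advert_report.show_notebook()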
Checking Shape of Training and Testing Dataset
print(df_training.shape)
print(df_testing.shape)
output:
(9000, 61)
(1478, 61)
df = pd.concat(objs = [df_training, df_testing])
Checking Shape of Combined Training and Testing Dataset
df.shape
output:
(10478, 61)
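Concatenating the two frames before encoding guarantees that both splits end up with identical one-hot columns. As an alternative sketch (an assumption, not used below), the rows could be tagged with their source so the later split does not depend on the fixed 9,000-row count:
# Hypothetical alternative: flag each row before concatenating, then split on the flag after encoding
df_training["is_train"] = 1
df_testing["is_train"] = 0
df_all = pd.concat([df_training, df_testing], ignore_index=True)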
zip_codes = ['ConsigneeZip', 'ShipperZip', 'DestZip', 'OriginZip']
df.head()
Output: (first five rows of the combined DataFrame, omitted)
df.info()
Output: (column dtypes and non-null counts, omitted)
# Drop features with many missing values or low importance
features_to_be_dropped = ["RAD", "ShipperCountry", "ConsigneeCountry", "ConsigneeCity", "DetailCity",
                          "DetailCodeDescription", "ShipperCity", "ActualDeliveryTime", "ActualShipTime",
                          "AVTime", "target", "DestCity", "OriginCity", "Lane"]
# Date/time features to be decomposed into numeric components later
lst_of_dates = ["ActualShip", "AV", "Goal", "Goal2", "EST_AV", "DetailDate", "new_EST_AV", "Final_EST_AV",
                "CreateDate", "ActualDelivery", "TargetShip_Early", "DetailCreateDate", "AV_CD"]
# Zip-code features to be coerced to numeric
zip_features = ["ConsigneeZip", "ShipperZip", "DestZip", "OriginZip"]
# Categorical features to be one-hot encoded
categorical_variables = ["ShipmentType", "CarrierMode", "OnTimeShip", "OnTimeDelivery", "Status", "ShipperState",
                         "DestState", "AS_dow", "new_EST_AV_dow", "Final_EST_AV_dow", "av_dow", "OriginCtry",
                         "OriginState", "DestCtry", "DetailCode", "DetailState"]
df.columns
Output:
Index(['ActualShip', 'KEY_LOAD_TRACKING', 'CreateDate', 'ActualDelivery',
'Carrier', 'ConsigneeCity', 'ConsigneeCountry', 'ConsigneeZip',
'DetailCity', 'DetailState', 'DetailCode', 'DetailCodeDescription',
'DetailCreateDate', 'DetailDate', 'ShipperCity', 'ShipperState',
'ShipperZip', 'ShipperCountry', 'AV_CD', 'AV', 'PRO', 'DestCity',
'DestState', 'DestCtry', 'DestZip', 'OriginCity', 'OriginState',
'OriginCtry', 'OriginZip', 'Lane', 'ShipmentType', 'CarrierMode',
'ActualTransitTime', 'DeliveryDays', 'ServiceDays', 'Weight',
'CustomerDistance', 'RAD', 'DestName', 'Goal', 'Goal2', 'InvoiceCost',
'Mileage', 'OnTimeShip', 'OnTimeDelivery', 'Quantity',
'TargetShip_Early', 'Status', 'ActualDeliveryTime', 'ActualShipTime',
'AVTime', 'av_dow', 'AS_dow', 'EST_AV', 'holiday_flag', 'new_EST_AV',
'new_EST_AV_dow', 'Final_EST_AV', 'Final_EST_AV_dow', 'target',
'target_numerical'],
dtype='object')
# Check the cardinality of each categorical feature
for x in categorical_variables:
    print(x, len(df[x].value_counts()))
df = df.drop(features_to_be_dropped, axis=1)
pd.get_dummies(df["Status"])
output:

import gc
# One-hot encode each categorical variable and drop the original column
for x in categorical_variables:
    one_hot = pd.get_dummies(df[x], prefix=x)
    df = pd.concat([df, one_hot], axis=1)
    df = df.drop(x, axis=1)
    del one_hot
    gc.collect()
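The loop above can also be expressed as a single call; a minimal equivalent sketch, not used here:
# pandas can encode all categorical columns at once and drop the originals automatically
df_encoded = pd.get_dummies(df, columns=categorical_variables, prefix=categorical_variables)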
# Coerce zip codes to numeric; non-numeric entries become NaN
df[zip_features] = df[zip_features].apply(pd.to_numeric, errors='coerce')
df.info()
output:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 10478 entries, 0 to 1477
Columns: 285 entries, ActualShip to DetailState_WY
dtypes: bool(1), float64(11), int64(6), object(13), uint8(254)
memory usage: 5.0+ MB
df
Output: (full DataFrame preview, omitted)
def handle_date_time(input_feature):
    # Parse the raw string column; values that do not match the format become NaT
    times = pd.to_datetime(df[input_feature],
                           format="%Y-%m-%d %H:%M:%S.",
                           errors="coerce")
    # Decompose the timestamp into numeric calendar components
    df[input_feature + "_year"] = times.dt.year
    df[input_feature + "_month"] = times.dt.month
    df[input_feature + "_day"] = times.dt.day
    df[input_feature + "_hour"] = times.dt.hour
    df[input_feature + "_minute"] = times.dt.minute
    df[input_feature + "_dayofweek"] = times.dt.dayofweek
    # Drop the original raw column
    df.drop([input_feature], axis=1, inplace=True)
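A quick sanity check on a single, hypothetical timestamp illustrates the components the function extracts:
# Hypothetical sample value, only to show the attributes used above
sample = pd.to_datetime("2020-03-15 14:30:00")
print(sample.year, sample.month, sample.day, sample.hour, sample.minute, sample.dayofweek)
# -> 2020 3 15 14 30 6  (dayofweek counts Monday as 0, so 6 is Sunday)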
df.duplicated().sum()
# Decompose every date feature in place
for x in lst_of_dates:
    print("in here for", x)
    handle_date_time(x)
Output: ("in here for <feature>" printed once per date feature, omitted)
df.columns
Output:
Index(['KEY_LOAD_TRACKING', 'Carrier', 'ConsigneeZip', 'ShipperZip', 'PRO',
'DestZip', 'OriginZip', 'ActualTransitTime', 'DeliveryDays',
'ServiceDays',
...
'DetailCreateDate_day', 'DetailCreateDate_hour',
'DetailCreateDate_minute', 'DetailCreateDate_dayofweek', 'AV_CD_year',
'AV_CD_month', 'AV_CD_day', 'AV_CD_hour', 'AV_CD_minute',
'AV_CD_dayofweek'],
dtype='object', length=350)
df.info()
Output:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 10478 entries, 0 to 1477
Columns: 350 entries, KEY_LOAD_TRACKING to AV_CD_dayofweek
dtypes: bool(1), float64(23), int64(72), uint8(254)
memory usage: 10.2 MB
df.isnull().sum()
Output: (per-column missing-value counts, omitted)
len(df)
output:
10478
# Split back into the original training (first 9,000 rows) and testing sets
df_training = df[:9000]
df_testing = df[9000:]
df_training.isnull().sum().sum()
df_testing.isnull().sum().sum()
# Reset the row index of each split
df_training = df_training.reset_index(drop=True)
df_testing = df_testing.reset_index(drop=True)
# Separate features and the numeric target for each split
X_training = df_training.drop(["target_numerical"], axis=1)
y_training = df_training["target_numerical"]
print(len(X_training.iloc[0]), len(y_training))
X_testing = df_testing.drop(["target_numerical"], axis=1)
y_testing = df_testing["target_numerical"]
print(len(X_testing.iloc[0]), len(y_testing))
zip_features
output:
['ConsigneeZip', 'ShipperZip', 'DestZip', 'OriginZip']
Model Building
Logistic Regression
from sklearn.linear_model import LogisticRegression
logModel = LogisticRegression()
# Hyperparameter grid; the commented-out options are alternatives that were not searched
param_grid = [
    {
        # 'penalty': ['l1', 'l2', 'elasticnet', 'none'],
        'penalty': ['l2'],
        'C': np.logspace(-4, 4, 20),
        # 'solver': ['lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'],
        'solver': ['lbfgs'],
        'max_iter': [100, 1000, 2500, 5000]
    }
]
from sklearn.model_selection import GridSearchCV
clf = GridSearchCV(logModel, param_grid = param_grid, cv = 3, verbose=True, n_jobs=-1)
best_clf = clf.fit(X_training,y_training)
Output:
Fitting 3 folds for each of 80 candidates, totalling 240 fits
best_clf.best_estimator_
output:
LogisticRegression(C=0.0001)
print (f'Accuracy - : {best_clf.score(X_testing,y_testing):.3f}')
Output:
Accuracy - : 0.699
predictions = best_clf.predict(X_testing)
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
print(accuracy_score(y_testing, predictions))
print(recall_score(y_testing, predictions))
print(precision_score(y_testing, predictions))
print(f1_score(y_testing, predictions))
Output:
0.6989174560216509
1.0
0.6989174560216509
0.8227797690163282
from sklearn.metrics import confusion_matrix, classification_report
def plot_confusion_matrix(cf_matrix):
    # Plot the confusion matrix as an annotated heatmap
    ax = sns.heatmap(cf_matrix, annot=True, cmap='Blues')
    ax.set_title('Seaborn Confusion Matrix with labels\n\n')
    ax.set_xlabel('\nPredicted Values')
    ax.set_ylabel('Actual Values ')
    # Tick labels - list must be in alphabetical order
    ax.xaxis.set_ticklabels(['False', 'True'])
    ax.yaxis.set_ticklabels(['False', 'True'])
    # Display the visualization of the confusion matrix
    plt.show()
#Generate the confusion matrix
cf_matrix = confusion_matrix(y_testing, predictions)
print(cf_matrix)
Output:
[[ 0 445]
[ 0 1033]]
plot_confusion_matrix(cf_matrix)
Output: (confusion matrix heatmap, omitted)
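The matrix shows that the tuned logistic regression predicts the positive class for every test row (recall 1.0, precision equal to the positive base rate). A common follow-up, offered here only as a hedged sketch and not part of the original run, is to standardize the features inside a Pipeline, since lbfgs-based logistic regression is sensitive to columns on very different scales (zip codes, mileage, weight):
# Hypothetical sketch: scale features before logistic regression; results are not claimed
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
scaled_logreg = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(max_iter=1000)),
])
scaled_logreg.fit(X_training, y_training)
print(scaled_logreg.score(X_testing, y_testing))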
RandomForest
n_estimators = [5, 20, 50, 100]   # number of trees in the random forest
min_samples_split = [2, 6, 10]    # minimum number of samples required to split a node
min_samples_leaf = [1, 3, 4]      # minimum number of samples required in a leaf node
random_grid = {'n_estimators': n_estimators,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
from sklearn.model_selection import RandomizedSearchCV
rf_random = RandomizedSearchCV(estimator = rf,param_distributions = random_grid, cv = 5, verbose=100, random_state=35, n_jobs = -1)
rf_random.fit(X_training, y_training)
Output:
Fitting 5 folds for each of 10 candidates, totalling 50 fits
RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1, param_distributions={'min_samples_leaf': [1, 3, 4], 'min_samples_split': [2, 6, 10], 'n_estimators': [5, 20, 50, 100]}, random_state=35, verbose=100)
print ('Random grid: ', random_grid, '\n')
# print the best parameters
print ('Best Parameters: ', rf_random.best_params_, ' \n')
Output:
Random grid: {'n_estimators': [5, 20, 50, 100], 'min_samples_split': [2, 6, 10], 'min_samples_leaf': [1, 3, 4]}
Best Parameters: {'n_estimators': 20, 'min_samples_split': 6, 'min_samples_leaf': 3}
# Train a separately configured forest with extra settings (max_features, max_depth, bootstrap);
# note that the evaluation below uses the searched model rf_random, not randmf
randmf = RandomForestClassifier(n_estimators=100, min_samples_split=6, min_samples_leaf=4,
                                max_features='sqrt', max_depth=120, bootstrap=False)
randmf.fit(X_training, y_training)
Output:
RandomForestClassifier(bootstrap=False, max_depth=120, max_features='sqrt', min_samples_leaf=4, min_samples_split=6)
predictions = rf_random.predict(X_testing)
predictions
Output:
array([1, 1, 1, ..., 0, 1, 0])
print (f'Accuracy - : {rf_random.score(X_testing,y_testing):.3f}')
Output:
Accuracy - : 0.806
print(accuracy_score(y_testing, predictions))
print(recall_score(y_testing, predictions))
print(precision_score(y_testing, predictions))
print(f1_score(y_testing, predictions))
Output:
0.8064952638700947
0.9341723136495643
0.8157227387996618
0.8709386281588447
#Generate the confusion matrix
cf_matrix = confusion_matrix(y_testing, predictions)
print(cf_matrix)
Output:
[[227 218]
[ 68 965]]
plot_confusion_matrix(cf_matrix)
Output: (confusion matrix heatmap, omitted)
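Since the random forest clearly outperforms logistic regression here, it can be worth inspecting which features drive it; a minimal sketch using the best estimator from the search (the column names come from the encoded frame above):
# Top 10 features by impurity-based importance
best_rf = rf_random.best_estimator_
importances = pd.Series(best_rf.feature_importances_, index=X_training.columns)
print(importances.sort_values(ascending=False).head(10))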
SVM
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
# defining parameter range
param_grid = {'C': [0.1, 1],
              'gamma': [0.1, 0.001],
              'kernel': ['rbf']}
grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3, cv = 3)
# fitting the model for grid search
grid.fit(X_training, y_training)
Output:
Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV 1/3] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.810 total time= 13.3s
[CV 2/3] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.810 total time= 12.3s
[CV 3/3] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.810 total time= 13.6s
[CV 1/3] END ....C=0.1, gamma=0.001, kernel=rbf;, score=0.810 total time= 14.7s
[CV 2/3] END ....C=0.1, gamma=0.001, kernel=rbf;, score=0.810 total time= 12.6s
[CV 3/3] END ....C=0.1, gamma=0.001, kernel=rbf;, score=0.810 total time= 12.8s
[CV 1/3] END ........C=1, gamma=0.1, kernel=rbf;, score=0.810 total time= 12.6s
[CV 2/3] END ........C=1, gamma=0.1, kernel=rbf;, score=0.810 total time= 12.7s
[CV 3/3] END ........C=1, gamma=0.1, kernel=rbf;, score=0.810 total time= 12.7s
[CV 1/3] END ......C=1, gamma=0.001, kernel=rbf;, score=0.810 total time= 13.1s
[CV 2/3] END ......C=1, gamma=0.001, kernel=rbf;, score=0.810 total time= 13.0s
[CV 3/3] END ......C=1, gamma=0.001, kernel=rbf;, score=0.810 total time= 12.9s
GridSearchCV(cv=3, estimator=SVC(), param_grid={'C': [0.1, 1], 'gamma': [0.1, 0.001], 'kernel': ['rbf']}, verbose=3)
# print best parameter after tuning
print(grid.best_params_)
# print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)
Output:
{'C': 0.1, 'gamma': 0.1, 'kernel': 'rbf'}
SVC(C=0.1, gamma=0.1)
predictions = grid.predict(X_testing)
print(accuracy_score(y_testing, predictions))
print(recall_score(y_testing, predictions))
print(precision_score(y_testing, predictions))
print(f1_score(y_testing, predictions))
Output:
0.6989174560216509
1.0
0.6989174560216509
0.8227797690163282
#Generate the confusion matrix
cf_matrix = confusion_matrix(y_testing, predictions)
print(cf_matrix)
Output:
[[ 0 445]
[ 0 1033]]
plot_confusion_matrix(cf_matrix)
Output: (confusion matrix heatmap, omitted)
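As with logistic regression, the SVM collapses to predicting only the positive class. RBF kernels are particularly scale-sensitive, so the same standardization idea applies; again a hedged sketch, not part of the original run:
# Hypothetical sketch: standardize features before the RBF-kernel SVM; results are not claimed
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
scaled_svm = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", SVC(C=1, gamma='scale')),
])
scaled_svm.fit(X_training, y_training)
print(scaled_svm.score(X_testing, y_testing))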