import pandas as pd
import numpy as np
import seaborn as sn
import plotly.express as px
import plotly.graph_objects as go
import plotly.offline as py
import matplotlib.pyplot as plt
%matplotlib inline
iris = pd.read_csv('Iris.csv')
iris.head()
iris.isnull().sum()
iris['Species'].unique()
def splitkeyword(string):
    # 'Iris-setosa' -> 'setosa'
    string = string.split('-')
    return string[1]

iris['Species'] = iris['Species'].apply(splitkeyword)
iris.head()
### Creating a mapper for the Species, since the classifier needs numerical class labels
species_list = list(iris['Species'].unique())
mapper = {species_list[i]:i+1 for i in range(len(species_list))}
mapper
iris['Class'] = iris['Species'].map(mapper)
iris.head()
iris['Class'].value_counts()
sn.countplot(data=iris, x='Class', palette='plasma')
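For reference, pandas can produce this kind of integer encoding in one call; a small sketch using pd.factorize (it yields 0-based codes, so they are shifted by 1 here only to line up with the mapper above; nothing is added to the DataFrame):
# Sketch: pd.factorize gives 0-based integer codes plus the matching labels
codes, uniques = pd.factorize(iris['Species'])
print(list(uniques))      # species names in order of first appearance
print(codes[:5] + 1)      # +1 only to match the 1-based mapper used above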
## dropping the id column
iris = iris.drop(['Id'],axis=1)
iris.head()
# For now I will drop the Species column, as we're about to start model building.
# Once I get the predicted results, I will map the class labels back to species names.
species = list(iris['Species'])
iris = iris.drop(['Species'],axis = 1)
iris.head()
# Note that the Class column is in sorted order: the first 50 rows are class 1,
# the next 50 are class 2, and the last 50 are class 3.
# Ordered data can give a misleading picture of how the model performs once we split it,
# so the rows should be shuffled first. I'll use pandas' sample(frac=1):
# frac=1 samples 100% of the rows without replacement, which effectively shuffles them row-wise.
iris = iris.sample(frac=1).reset_index()
iris = iris.drop(['index'],axis = 1)
iris.head()
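One note on the cell above: the shuffle is different on every run. If reproducibility matters, the same thing can be done with a fixed random_state (a sketch, as a drop-in replacement for the two lines above):
# Sketch: reproducible row-wise shuffle, with the old index dropped in one step
iris = iris.sample(frac=1, random_state=42).reset_index(drop=True)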
## Renaming the columns
iris.columns = ['SepalLength','SepalWidth','PetalLength','PetalWidth','Class']
iris.head(3)
## Separating the features (train) from the target column (test)
test = iris['Class']
train = iris.drop(['Class'],axis = 1)
train.head(2)
import scipy.stats as stats
def makeplots(col):
    plt.figure(figsize=(15,7))
    # Histogram
    plt.subplot(1,3,1)
    plt.hist(iris[col],color='salmon')
    # Q-Q plot against a normal distribution
    plt.subplot(1,3,2)
    stats.probplot(iris[col],dist='norm',plot=plt)
    # Boxplot
    plt.subplot(1,3,3)
    sn.boxplot(x=iris[col],color='pink')
    plt.show()
for i in train.columns:
    print(f'Statistical plots for the feature: {i} are shown ↓')
    makeplots(i)
    print("-"*75)
From the statistical plots above we can observe that the distributions of Petal Length and Petal Width are not a good fit for a Gaussian distribution. Since we are working with tree-based classifiers, scaling or transforming the data is not critical: these classifiers split their nodes with threshold-based decision logic, which is why they are called decision trees. Let's first try the raw features; if the results are not good, we can revisit the feature distributions.
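To back the visual impression with a number, a quick normality test can be run per feature; a minimal sketch using scipy's Shapiro-Wilk test (a small p-value suggests the feature is unlikely to be Gaussian):
# Sketch: Shapiro-Wilk normality test for each feature
import scipy.stats as stats

for col in train.columns:
    stat, p = stats.shapiro(train[col])
    print(f'{col}: W={stat:.3f}, p-value={p:.4f}')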
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV,train_test_split
from sklearn import tree
from sklearn import metrics
# Splitting the data: 80% for training, 20% for testing
X_train, X_test, y_train, y_test = train_test_split(train,test,test_size = 0.2)
X_train.shape,X_test.shape
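A side note on the split above: it is random on every run and not stratified. A sketch of an alternative call that keeps the class proportions equal in both halves and is reproducible:
# Sketch: reproducible, class-balanced 80/20 split
X_train, X_test, y_train, y_test = train_test_split(
    train, test, test_size=0.2, random_state=42, stratify=test)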
# Fit on the full dataset first, just to visualise the complete unpruned tree
clf = DecisionTreeClassifier(criterion='gini')
clf.fit(train,test)
classnames=['setosa', 'versicolor', 'virginica']
plt.figure(figsize = (16,9))
tree.plot_tree(clf,filled=True,feature_names=train.columns,class_names=classnames)
plt.show()
At the root of the tree we have 150 samples, 50 of each class, and the criterion of the Decision Tree Classifier is set to gini. The first split is made on Petal Length: based on PetalLength <= 2.45, the classifier creates two child nodes. The left node has value [50, 0, 0], meaning all the Setosa samples have Petal Length <= 2.45. The right node has [0, 50, 50]: with Setosa separated out, we are left with Versicolor and Virginica. The next split, PetalLength <= 4.95, then separates Versicolor from Virginica, although some impurity remains; for example the node [0, 49, 5] holds 49 Versicolor samples and 5 Virginica samples. We could keep splitting until every node is pure (the darker a node's colour, the more dominant one class is in it), but a tree grown all the way to pure leaves tends to overfit, and it only grows larger as more data is provided. Pruning the tree avoids this; for bigger datasets it works remarkably well, so we should always try to prune the tree to keep it from overfitting.
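The split thresholds described above can also be read off programmatically instead of from the picture; a small sketch using sklearn's export_text on the tree fitted in the previous cell:
# Sketch: text view of the fitted tree; the first rule should be PetalLength <= 2.45
from sklearn.tree import export_text

print(export_text(clf, feature_names=list(train.columns)))
print('depth:', clf.get_depth(), '| leaves:', clf.get_n_leaves())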
clf = DecisionTreeClassifier(criterion='gini')
clf.fit(X_train,y_train)
classnames=['setosa', 'versicolor', 'virginica']
plt.figure(figsize = (16,9))
tree.plot_tree(clf,filled=True,feature_names=train.columns,class_names=classnames)
plt.show()
path = clf.cost_complexity_pruning_path(X_train,y_train)
ccp_alphas = path.ccp_alphas
alpha_list = []
# Fit one tree for each candidate ccp_alpha
for i in ccp_alphas:
    clf = DecisionTreeClassifier(ccp_alpha=i)
    clf.fit(X_train,y_train)
    alpha_list.append(clf)
train_score = [clf.score(X_train,y_train) for clf in alpha_list]
test_score = [clf.score(X_test,y_test) for clf in alpha_list]
plt.plot(ccp_alphas,train_score,marker = '+',label = 'TrainingScore',
drawstyle = "steps-post",c= 'red')
plt.plot(ccp_alphas,test_score,marker = 'X',label = 'TestingScore',
drawstyle = "steps-post",c = 'black')
plt.xlabel('ccp_alpha')
plt.ylabel('Accuracy')
plt.legend()
plt.show()
From the graph above we can observe that both the training and testing scores start from a very high value, close to 1.0. As we increase ccp_alpha, the training score decreases and so does the testing score, but there is a point in the range [0.00, 0.05] where the testing score is at its peak while the training score is only slightly lower. For a good machine learning model we always want low bias and low variance, so we will use hyperparameter tuning to find the ccp_alpha value in that range that achieves this balance.
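Before running a full hyperparameter search, that sweet spot can already be estimated from the two score lists computed above; a minimal sketch:
# Sketch: ccp_alpha giving the highest validation accuracy among the candidates tried above
best_idx = int(np.argmax(test_score))
print('best ccp_alpha ~', round(ccp_alphas[best_idx], 4),
      '| test accuracy:', round(test_score[best_idx], 3))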
clf = DecisionTreeClassifier(criterion='gini',ccp_alpha=0.025)
clf.fit(X_train,y_train)
classnames=['setosa', 'versicolor', 'virginica']
plt.figure(figsize = (16,9))
tree.plot_tree(clf,filled=True,feature_names=train.columns,class_names=classnames)
plt.show()
Now let us visualise the decision boundaries produced by the sepal features alone and by the petal features alone, see how an unconstrained tree overfits each of them, and figure out which set of features is the more informative one: the petal measurements or the sepal measurements. A quick quantitative check is sketched below, followed by the boundary plots.
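As the quick quantitative check mentioned above, here is a minimal sketch comparing the cross-validated accuracy of a shallow tree on the sepal pair versus the petal pair (the max_depth=3 setting is just an assumption for the comparison):
# Sketch: compare sepal-only vs petal-only features with 5-fold cross-validation
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

for name, cols in [('sepal', ['SepalLength', 'SepalWidth']),
                   ('petal', ['PetalLength', 'PetalWidth'])]:
    cv_scores = cross_val_score(DecisionTreeClassifier(max_depth=3),
                                iris[cols], iris['Class'], cv=5)
    print(f'{name} features: mean accuracy = {cv_scores.mean():.3f}')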
iris.columns
sepal_data = iris[['SepalLength', 'SepalWidth']]
petal_data = iris[['PetalLength', 'PetalWidth']]
classes = iris['Class']
sp_length = sepal_data['SepalLength'].values
sp_width = sepal_data['SepalWidth'].values
sp_array = np.c_[sp_length,sp_width]
pt_length = petal_data['PetalLength'].values
pt_width = petal_data['PetalWidth'].values
pt_array = np.c_[pt_length,pt_width]
def getmeshgrid(data):
    # Build a fine grid covering the feature range, padded by 1 unit on each side
    x_min,x_max = data[:,0].min()-1,data[:,0].max()+1
    y_min,y_max = data[:,1].min()-1,data[:,1].max()+1
    return np.meshgrid(np.arange(x_min,x_max,0.01),
                       np.arange(y_min,y_max,0.01))
clf = DecisionTreeClassifier(criterion='gini',max_depth=3)
clf.fit(sepal_data,classes)
xx,yy = getmeshgrid(sp_array)
predicted = clf.predict(np.c_[xx.ravel(),yy.ravel()]).reshape(xx.shape)
plt.pcolormesh(xx,yy,predicted,cmap='autumn')
plt.scatter(sp_array[:,0],sp_array[:,1],c = classes,
s = 100,cmap='plasma',edgecolors='black',linewidths=2.5)
clf = DecisionTreeClassifier(criterion='gini',max_depth=3)
clf.fit(petal_data,classes)
xx,yy = getmeshgrid(pt_array)
predicted = clf.predict(np.c_[xx.ravel(),yy.ravel()]).reshape(xx.shape)
plt.pcolormesh(xx,yy,predicted,cmap='autumn')
plt.scatter(pt_array[:,0],pt_array[:,1],c = classes,
s = 100,cmap='plasma',edgecolors='black',linewidths=2.5)
clf = DecisionTreeClassifier(criterion='gini',max_depth=50)
clf.fit(sepal_data,classes)
xx,yy = getmeshgrid(sp_array)
predicted = clf.predict(np.c_[xx.ravel(),yy.ravel()]).reshape(xx.shape)
plt.pcolormesh(xx,yy,predicted,cmap='autumn')
plt.scatter(sp_array[:,0],sp_array[:,1],c = classes,
s = 100,cmap='plasma',edgecolors='black',linewidths=2.5)
clf = DecisionTreeClassifier(criterion='gini',max_depth=50)
clf.fit(petal_data,classes)
xx,yy = getmeshgrid(pt_array)
predicted = clf.predict(np.c_[xx.ravel(),yy.ravel()]).reshape(xx.shape)
plt.pcolormesh(xx,yy,predicted,cmap='autumn')
plt.scatter(pt_array[:,0],pt_array[:,1],c = classes,
s = 100,cmap='plasma',edgecolors='black',linewidths=2.5)
a = [round(x,3) for x in np.linspace(0,0.05,10)]
a
clf = DecisionTreeClassifier(criterion='gini',ccp_alpha=0.025)
clf.fit(petal_data,classes)
x_min,x_max = pt_array[:,0].min()-1,pt_array[:,0].max()+1
y_min,y_max = pt_array[:,1].min()-1,pt_array[:,1].max()+1
xx,yy = np.meshgrid((np.arange(x_min,x_max,0.01)),
(np.arange(y_min,y_max,0.01)))
y_ = np.arange(y_min,y_max,0.01)
predicted_class = clf.predict(np.c_[xx.ravel(),yy.ravel()]).reshape(xx.shape)
trace1 = go.Heatmap(x=xx[0], y=y_,
z=predicted_class,
colorscale='Jet',
showscale=True)
trace2 = go.Scatter(x = pt_array[:,0],y = pt_array[:,1],
mode='markers',
showlegend=False,
marker=dict(size=10,
color=classes,
colorscale='Jet',
reversescale = True,
line=dict(color='black', width=1))
)
layout= go.Layout(
autosize= True,
title= 'Decision Surface (Petal Features)',
hovermode= 'closest',
showlegend= False)
data = [trace1,trace2]
fig3 = go.Figure(data = data,layout=layout)
py.iplot(fig3)
clf = DecisionTreeClassifier(criterion='gini',ccp_alpha=0.025)
clf.fit(sepal_data,classes)
x_min,x_max = sp_array[:,0].min()-1,sp_array[:,0].max()+1
y_min,y_max = sp_array[:,1].min()-1,sp_array[:,1].max()+1
xx,yy = np.meshgrid((np.arange(x_min,x_max,0.01)),
(np.arange(y_min,y_max,0.01)))
y_ = np.arange(y_min,y_max,0.01)
predicted_class = clf.predict(np.c_[xx.ravel(),yy.ravel()]).reshape(xx.shape)
trace1 = go.Heatmap(x=xx[0], y=y_,
z=predicted_class,
colorscale='Jet',
showscale=True)
trace2 = go.Scatter(x = sp_array[:,0],y = sp_array[:,1],
mode='markers',
showlegend=False,
marker=dict(size=10,
color=classes,
colorscale='Jet',
reversescale = True,
line=dict(color='black', width=1))
)
layout= go.Layout(
autosize= True,
title= 'Decision Surface (Sepal Features)',
hovermode= 'closest',
showlegend= False)
data = [trace1,trace2]
fig3 = go.Figure(data = data,layout=layout)
py.iplot(fig3)
Parameters:

- criterion: The function to measure the quality of a split. Supported criteria are "gini" for the Gini impurity and "entropy" for the information gain.
- splitter: The strategy used to choose the split at each node. Supported strategies are "best" to choose the best split and "random" to choose the best random split.
- max_depth: The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.
- min_samples_split: The minimum number of samples required to split an internal node.
- min_samples_leaf: The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least min_samples_leaf training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression.
- min_weight_fraction_leaf: The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.
- max_features: The number of features to consider when looking for the best split.
- max_leaf_nodes: Grow a tree with max_leaf_nodes in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.
- min_impurity_decrease: A node will be split if this split induces a decrease of the impurity greater than or equal to this value.
- min_impurity_split: Threshold for early stopping in tree growth. A node will split if its impurity is above the threshold, otherwise it is a leaf.

params = {
    'criterion':["gini","entropy"],
    'splitter': ["best", "random"],
    'max_depth':[int(x) for x in np.linspace(1,20,5)],
    'min_samples_split':[2,4,6,8],
    'min_samples_leaf':[2,4,6,8],
    'ccp_alpha':[round(x,3) for x in np.linspace(0,0.05,10)]
}
scores = []
clf = RandomizedSearchCV(estimator=DecisionTreeClassifier(),param_distributions=params,cv = 5,n_iter=10,n_jobs=-1,
return_train_score=False)
clf.fit(X_train,y_train)
clf.best_estimator_
clf.best_score_
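The next cell hard-codes the settings reported by the search; they can also be pulled straight from the fitted search object (a minimal sketch):
# Sketch: reuse the tuned hyperparameters without copying them by hand
print(clf.best_params_)
best_tree = DecisionTreeClassifier(**clf.best_params_)
best_tree.fit(X_train, y_train)
print('train accuracy:', best_tree.score(X_train, y_train))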
clf = DecisionTreeClassifier(ccp_alpha=0.028, criterion='entropy', max_depth=5,
min_samples_leaf=6)
clf.fit(X_train,y_train)
clf.score(X_train,y_train)
clf = DecisionTreeClassifier(ccp_alpha=0.028, criterion='entropy', max_depth=5,
min_samples_leaf=6)
clf.fit(X_train,y_train)
classnames=['setosa', 'versicolor', 'virginica']
plt.figure(figsize = (16,9))
tree.plot_tree(clf,filled=True,feature_names=train.columns,class_names=classnames)
plt.show()
clf.score(X_test,y_test)
prediction = clf.predict(X_test)
cn = metrics.confusion_matrix(y_test,prediction)
sn.heatmap(cn,annot=True,cmap='plasma')
print(metrics.classification_report(y_test,prediction))
iris.head()
prediction = []
train_test = np.array(train)
# Predict row by row (a single clf.predict(train) call would give the same result)
for i in range(len(train_test)):
    prediction.append(clf.predict([train_test[i]])[0])
prediction = np.array(prediction)
iris['Predicted'] = prediction
iris.head()
mapper = {1:'setosa',2:'versicolor',3:'virginica'}
iris['Actual_class'] = iris['Class'].map(mapper)
iris['Predicted_class'] = iris['Predicted'].map(mapper)
iris.head()
iris['Predicted_class'].value_counts()
sn.countplot(data = iris,x = 'Actual_class',palette= 'plasma')
sn.countplot(data = iris,x = 'Predicted_class',palette= 'plasma')
final_prediction = clf.predict(train)
cn = metrics.confusion_matrix(test,final_prediction)
sn.heatmap(cn,annot=True,cmap='plasma')
print(metrics.classification_report(test,final_prediction))
# Refit the tuned tree on the full dataset and visualise the final tree
clf = DecisionTreeClassifier(ccp_alpha=0.028, criterion='entropy', max_depth=5,
min_samples_leaf=6)
clf.fit(train,test)
classnames=['setosa', 'versicolor', 'virginica']
plt.figure(figsize = (16,9))
tree.plot_tree(clf,filled=True,feature_names=train.columns,class_names=classnames)
plt.show()