import pandas as pd
import numpy as np
import seaborn as sn
import plotly.express as px
import plotly.graph_objects as go
from hulearn.experimental.interactive import InteractiveCharts
import matplotlib.pyplot as plt
%matplotlib inline
iris = pd.read_csv('Iris.csv')
iris.head(2)
iris.columns = ['Id','SepalLength','SepalWidth','PetalLength','PetalWidth','Class']
def splitname(string):
    # 'Iris-setosa' -> 'setosa'
    return string.split('-')[1]
iris['Class'] = iris['Class'].apply(splitname)
species = list(iris['Class'].unique())
# mapper = {species[i]: i for i in range(len(species))}
# iris['Class'] = iris['Class'].map(mapper)
iris.head()
def plotchart():
    # hulearn's InteractiveCharts draws interactive scatter charts per feature pair
    charts = InteractiveCharts(dataf=iris, labels='Class')
    charts.add_chart('SepalLength', 'SepalWidth', legend=False)
    charts.add_chart('PetalLength', 'PetalWidth', legend=False)
plotchart()
fig = px.scatter_3d(iris, x='SepalLength', y='SepalWidth', z='PetalWidth', color='Class')
fig.update_layout(
    title='The Iris Dataset',
    title_font=dict(size=28, family='Courier New'),
    template='plotly',
    paper_bgcolor='lightgray',
    width=750, height=550,
)
fig.update_layout(
    scene=dict(
        xaxis=dict(backgroundcolor='rgb(200,100,140)',
                   gridcolor='black',
                   showbackground=True,
                   zerolinecolor='black'),
        yaxis=dict(backgroundcolor='rgb(100,200,120)',
                   gridcolor='black',
                   showbackground=True,
                   zerolinecolor='black'),
        zaxis=dict(backgroundcolor='rgb(200,90,50)',
                   gridcolor='black',
                   showbackground=True,
                   zerolinecolor='black'),
    ),
)
fig.show()
Now, since we're going to apply the KMeans algorithm to this data, and KMeans is sensitive both to outliers and to the distribution of the features, let's look at statistical plots of each feature and then reshape any skewed distributions.
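As a quick numeric screen before the plots, skewness and an IQR outlier count flag the same issues the charts will show; a minimal sketch (the feature list simply repeats the columns renamed above):
features = ['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth']
print(iris[features].skew())   # skewness near 0 means a roughly symmetric feature
# IQR rule: flag points beyond 1.5 * IQR from the quartiles as potential outliers
q1, q3 = iris[features].quantile(0.25), iris[features].quantile(0.75)
iqr = q3 - q1
print(((iris[features] < q1 - 1.5 * iqr) | (iris[features] > q3 + 1.5 * iqr)).sum())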
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
iris['Class'] = le.fit_transform(iris['Class'])
iris['Class'].value_counts()
iris.head()
train = iris.drop(['Id', 'Class'], axis=1)  # keep only the four measurement features
import scipy.stats as stats
def makeplots(col):
    # Histogram, normal probability plot, and boxplot side by side
    plt.figure(figsize=(15, 7))
    plt.subplot(1, 3, 1)
    plt.hist(iris[col], color='salmon')
    plt.subplot(1, 3, 2)
    stats.probplot(iris[col], dist='norm', plot=plt)
    plt.subplot(1, 3, 3)
    sn.boxplot(x=iris[col], color='pink')
    plt.show()
for i in train.columns:
    print(f'Statistical Plots for the Feature : {i} are shown ↓')
    makeplots(i)
    print("-" * 75)
# Try a Box-Cox transform on PetalLength (+1 keeps every value strictly positive)
temp = train.copy()
temp['PetalLength'], params = stats.boxcox(temp['PetalLength'] + 1)
stats.probplot(temp['PetalLength'], dist='norm', plot=plt)
# Compare against a simple power transform
data = train.copy()
data['PetalLength'] = data.PetalLength ** (1 / 1.2)
stats.probplot(data['PetalLength'], dist='norm', plot=plt)
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.model_selection import RandomizedSearchCV
from sklearn import metrics
sc = StandardScaler()
train = iris.iloc[:, 1:5].values  # the four measurement columns (column 0 is the Id, which we exclude)
train_scaled = sc.fit_transform(train)
inertia_list = []
for i in range(1, 11):
    clf = KMeans(n_clusters=i)
    clf.fit(train)
    inertia_list.append(clf.inertia_)
plt.plot(range(1,11),inertia_list,c = 'red')
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()
scaled_inertia = []
for i in range(1, 11):
    clf = KMeans(n_clusters=i)
    clf.fit(train_scaled)
    scaled_inertia.append(clf.inertia_)
scaled_inertia
plt.plot(range(1,11),scaled_inertia,c = 'red',marker = 'X')
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()
From the elbow plots above we can see that the optimal number of clusters is 3, though 4 is also plausible; let's check what hyperparameter tuning suggests.
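The elbow is somewhat subjective, so as a cross-check here is a minimal sketch using silhouette scores on the scaled features (higher is better; the score is undefined for a single cluster, hence the range starting at 2):
for k in range(2, 7):
    labels = KMeans(n_clusters=k, n_init=10, random_state=0).fit_predict(train_scaled)
    print(f'k={k}: silhouette = {metrics.silhouette_score(train_scaled, labels):.3f}')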
test = iris['Class'].values
params = {
    'n_clusters': list(range(1, 11)),
    'init': ['k-means++', 'random'],
    'max_iter': [int(x) for x in np.linspace(100, 500, 100)],
    'tol': [1e-4, 1e-3, 1e-2, 0.025, 0.05, 0.25, 0.5],
    # 'precompute_distances' and algorithm='auto'/'full' were removed in recent
    # scikit-learn releases; 'lloyd' and 'elkan' are the currently valid options
    'algorithm': ['lloyd', 'elkan'],
}
# Caveat: 'accuracy' on raw cluster IDs is only a rough check, since KMeans
# labels are an arbitrary permutation of the true classes
clf = RandomizedSearchCV(estimator=KMeans(), param_distributions=params, cv=5, n_iter=10, n_jobs=-1,
                         return_train_score=False, scoring='accuracy')
clf.fit(train,test)
clf.best_estimator_
After the hyperparameter tuning step we can see that 3 clusters are selected again (bearing in mind the caveat above about accuracy on raw cluster IDs). Now let's make the predictions.
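Since accuracy depends on how the arbitrary cluster IDs happen to line up with the class labels, a permutation-invariant score such as the adjusted Rand index is a safer sanity check; a minimal sketch, reusing the train and test arrays from above:
# ARI is 1 when the clustering matches the true classes up to relabeling,
# and close to 0 for a random assignment
labels = KMeans(n_clusters=3, n_init=10, random_state=0).fit_predict(train)
print('Adjusted Rand index:', metrics.adjusted_rand_score(test, labels))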
kmeans = KMeans(n_clusters = 3, init = 'k-means++',max_iter = 300, n_init = 10, random_state = 0)
predictions = kmeans.fit_predict(train)
predictions
kmeans.cluster_centers_   # one row of feature coordinates per cluster centroid
kmeans.cluster_centers_[0, 0]   # first coordinate of the first centroid
kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1]   # x and y coordinates of all centroids
fig = plt.figure(figsize=(10, 8))
# Note: which cluster ID corresponds to which species is arbitrary; the labels
# below assume the mapping observed for this run
plt.scatter(train[predictions == 0, 0], train[predictions == 0, 1], s=100, c='red', label='Iris-setosa')
plt.scatter(train[predictions == 1, 0], train[predictions == 1, 1], s=100, c='blue', label='Iris-versicolour')
plt.scatter(train[predictions == 2, 0], train[predictions == 2, 1], s=100, c='green', label='Iris-virginica')
# Plotting the centroids of the clusters
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
            s=300, c='yellow', label='Centroids', marker='*')
plt.xlabel('SepalLength')
plt.ylabel('SepalWidth')
plt.legend()
fig = plt.figure(figsize=(15, 15))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(train[predictions == 0, 0], train[predictions == 0, 1], train[predictions == 0, 2], s=50, color='blue', label='cluster1')
ax.scatter(train[predictions == 1, 0], train[predictions == 1, 1], train[predictions == 1, 2], s=50, color='red', label='cluster2')
ax.scatter(train[predictions == 2, 0], train[predictions == 2, 1], train[predictions == 2, 2], s=50, color='green', label='cluster3')
# Plot each centroid's first three coordinates (distinct colors so no centroid
# blends in with a cluster of the same color)
ax.scatter(kmeans.cluster_centers_[0, 0], kmeans.cluster_centers_[0, 1], kmeans.cluster_centers_[0, 2],
           s=300, color='yellow', label='centroid1', marker='X')
ax.scatter(kmeans.cluster_centers_[1, 0], kmeans.cluster_centers_[1, 1], kmeans.cluster_centers_[1, 2],
           s=300, color='violet', label='centroid2', marker='X')
ax.scatter(kmeans.cluster_centers_[2, 0], kmeans.cluster_centers_[2, 1], kmeans.cluster_centers_[2, 2],
           s=300, color='black', label='centroid3', marker='X')
ax.set_xlabel('SepalLength')
ax.set_ylabel('SepalWidth')
ax.set_zlabel('PetalLength')
ax.legend()
plt.show()