Predict the percentage score of a student based on the number of study hours.
import pandas as pd
import numpy as np
import seaborn as sn
import plotly.express as px
import statsmodels
import plotly.graph_objects as go
import matplotlib.pyplot as plt
%matplotlib inline
# Load the student hours/scores dataset directly from its public GitHub URL.
DATA_URL = 'https://raw.githubusercontent.com/AdiPersonalWorks/Random/master/student_scores%20-%20student_scores.csv'
data = pd.read_csv(DATA_URL)

# Peek at the first rows, then summarise column dtypes and non-null counts.
data.head()
data.info()
The dataset has no null values.
The aim is to obtain a best-fitting line like the one shown in red in the plot below.
# Scatter of study hours against score with an OLS trendline drawn in red.
# Both marker colour and marker size encode the score.
fig = px.scatter(
    data_frame=data,
    x='Hours',
    y='Scores',
    color='Scores',
    size='Scores',
    trendline='ols',
    trendline_color_override='red',
    template='plotly_dark',
)
fig.update_layout(title=dict(text='Hours of Study v/s Percentage Score'))
fig.show()
from sklearn.model_selection import train_test_split

# NOTE: despite their names, `train` is the feature matrix (every column except
# the last) and `test` is the target vector (the column at position 1).
# Presumably the CSV has exactly two columns, Hours and Scores - confirm.
train = data.iloc[:, :-1].to_numpy()
test = data.iloc[:, 1].to_numpy()

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    train,
    test,
    test_size=0.2,
    random_state=0,
)
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the FULL dataset. Here `train` is the feature
# matrix and `test` is the target vector (not a train/test split).
Regressor1 = LinearRegression()
Regressor1.fit(train, test)

# Fitted value for every observation: slope * feature + intercept.
# Not referenced by any active code visible in this section.
line = Regressor1.coef_ * train + Regressor1.intercept_

# Evaluate the fitted line on an evenly spaced grid spanning the observed
# feature range so it can be drawn as a continuous trace.
x_range = np.linspace(train.min(), train.max(), 100)
y_range = Regressor1.predict(x_range.reshape(-1, 1))

fig = go.Figure([
    go.Scatter(x=train.squeeze(), y=test, name='train', mode='markers'),
    go.Scatter(x=x_range, y=y_range, name='prediction'),
])
# `update_layout` takes layout properties as keyword arguments; the title has
# to be passed as `title_text=...` rather than as a bare positional string.
fig.update_layout(title_text='Prediction using Linear Regression')
fig.show()
# fig = px.scatter(
# data_frame=data, x='Hours', y='Scores', color='Scores',
# trendline=line,
# opacity=0.65,trendline_color_override='red',size = 'Scores',template='plotly_dark'
# )
# fig.update_layout(title_text = 'Hours of Study v/s Percentage Score')
# fig.show()
# # plt.title("Regression line(Train set)",fontsize=15)
# # plt.xlabel("Hours")
# # plt.ylabel("Scores")
# # plt.scatter(X,Y)
# # plt.plot(X,line,color='red');
# # plt.show()
# Predicted vs. actual score for every observation, using the regressor fitted
# on the full dataset. Predictions must be made on the real feature values
# (`train`) so that each prediction is paired with its own ground-truth score;
# predicting on an evenly spaced grid would pair unrelated values.
y_pred = Regressor1.predict(train)
fig = px.scatter(x=test, y=y_pred, labels={'x': 'ground truth', 'y': 'prediction'})
# Dashed diagonal y = x: points lying on it are perfect predictions.
fig.add_shape(
    type="line", line=dict(dash='dash'),
    x0=test.min(), y0=test.min(),
    x1=test.max(), y1=test.max(),
)
fig.show()
# Fit a second regressor on the training split only.
model = LinearRegression()
model.fit(X_train, y_train)

# Evenly spaced grid over the full observed feature range, used to draw the
# fitted line as a continuous trace.
x_range = np.linspace(train.min(), train.max(), 100)
y_range = model.predict(x_range.reshape(-1, 1))

# Build the figure trace by trace: training points, held-out points, fitted line.
fig = go.Figure()
fig.add_trace(go.Scatter(x=X_train.squeeze(), y=y_train, name='train', mode='markers'))
fig.add_trace(go.Scatter(x=X_test.squeeze(), y=y_test, name='test', mode='markers'))
fig.add_trace(go.Scatter(x=x_range, y=y_range, name='prediction'))
fig.show()
# Evaluate the regressor that was fitted on the training split only (`model`).
# `Regressor1` was fitted on every row, including the held-out ones, so
# scoring it on X_test would leak test data and overstate performance.
Y_pred = model.predict(X_test)
print(Y_pred)

# Side-by-side table of actual and predicted scores for the held-out rows.
df = pd.DataFrame({'Actual': y_test, 'Predicted': Y_pred})
df

# Overlay the density estimates of actual and predicted scores.
# `distplot` is deprecated in seaborn; `kdeplot` is the replacement for
# `distplot(..., hist=False)`.
sn.kdeplot(data=df, x='Actual', label='Actual', color='red')
sn.kdeplot(data=df, x='Predicted', label='Predicted', color='black')
plt.legend()
plt.show()

# Recent seaborn releases require x/y to be passed by keyword.
sn.jointplot(data=df, x='Actual', y='Predicted', kind='reg')

# R^2 on the training split and on the held-out split.
print("Training Score: ", model.score(X_train, y_train))
print("Testing Score: ", model.score(X_test, y_test))
px.bar(data_frame=df, template='plotly_dark')
import numpy as np
from sklearn import metrics

# Error metrics comparing the held-out targets with their predictions.
# RMSE is derived from the MSE so the MSE is computed only once.
mae = metrics.mean_absolute_error(y_test, Y_pred)
mse = metrics.mean_squared_error(y_test, Y_pred)
rmse = np.sqrt(mse)
evs = metrics.explained_variance_score(y_test, Y_pred)

for label, value in (
    (" Mean Absolute Error: ", mae),
    (" Mean Squared Error: ", mse),
    (" Root Mean Squared Error: ", rmse),
    (" Explained Variance Score: ", evs),
):
    print(label, value)
# Predict the score for a given number of study hours.
hours = 9.25
# `predict` expects a 2-D input of shape (n_samples, n_features), hence the
# nested list. Pass `hours` rather than repeating the literal so that changing
# the variable actually changes the prediction.
pred_score = Regressor1.predict([[hours]])
print(" The Predicted Score of the Student is :", round(pred_score[0]))