Task 1 - Prediction using Supervised ML

Predict the percentage score of a student based on the number of study hours.

# statsmodels is needed for Plotly Express' OLS trendline
!pip install statsmodels

Importing Required Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sn
import plotly.express as px
import statsmodels
import plotly.graph_objects as go
import matplotlib.pyplot as plt
%matplotlib inline

Data Reading

In [2]:
data = pd.read_csv('https://raw.githubusercontent.com/AdiPersonalWorks/Random/master/student_scores%20-%20student_scores.csv')

Data Display

In [3]:
data.head()
Out[3]:
Hours Scores
0 2.5 21
1 5.1 47
2 3.2 27
3 8.5 75
4 3.5 30
In [4]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25 entries, 0 to 24
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Hours   25 non-null     float64
 1   Scores  25 non-null     int64  
dtypes: float64(1), int64(1)
memory usage: 528.0 bytes

The dataset has no null values.
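
As a quick confirmation of the claim above, a minimal check (not part of the original run) counts the missing values in each column:

data.isnull().sum()
# expected output: Hours 0, Scores 0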

Plotting the Data

The aim is to fit a best-fitting straight line of the form Scores ≈ m·Hours + c, like the red trendline shown below.

In [5]:
fig = px.scatter(
    data_frame=data, x='Hours', y='Scores', color='Scores',
    trendline='ols',trendline_color_override='red',size = 'Scores',template='plotly_dark'
)
fig.update_layout(title_text = 'Hours of Study v/s Percentage Score')
fig.show()
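
Optionally, the OLS parameters behind the red trendline can be inspected with Plotly's helper; a minimal sketch assuming a recent Plotly Express version (output not part of the original run):

# Inspect the statsmodels fit that backs the trendline
trend = px.get_trendline_results(fig)
print(trend.px_fit_results.iloc[0].summary())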
In [6]:
# Feature matrix (Hours) and target vector (Scores); despite the names,
# these arrays hold the full dataset, not a train/test split
train = data.iloc[:, :-1].values
test = data.iloc[:, 1].values

Splitting the dataset into training and test sets

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train,test, test_size = 0.2, random_state = 0)
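
A quick sanity check on the split sizes (not in the original run); with 25 rows and test_size=0.2, the split is 20 training and 5 test samples:

print(X_train.shape, X_test.shape)   # expected: (20, 1) (5, 1)
print(y_train.shape, y_test.shape)   # expected: (20,) (5,)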

Building the Regression Model

In [8]:
from sklearn.linear_model import LinearRegression
Regressor1 = LinearRegression()
# Note: Regressor1 is fit on the full dataset (all 25 rows), not only the training split
Regressor1.fit(train, test)
Out[8]:
LinearRegression()
In [9]:
line = Regressor1.coef_ * train + Regressor1.intercept_   # fitted line evaluated at each observed Hours value
x_range = np.linspace(train.min(), train.max(), 100)
y_range = Regressor1.predict(x_range.reshape(-1, 1))
fig = go.Figure([
    go.Scatter(x=train.squeeze(), y=test, name='data', mode='markers'),
    go.Scatter(x=x_range, y=y_range, name='prediction')
])
fig.update_layout(title_text='Prediction using Linear Regression')
fig.show()

In [10]:
# Ground truth vs. prediction: predict a score for every observed Hours value

y_pred = Regressor1.predict(train)

fig = px.scatter(x=test, y=y_pred, labels={'x': 'ground truth', 'y': 'prediction'})
fig.add_shape(type="line",line=dict(dash = 'dash'),
             x0 = test.min(),y0=test.min(),
             x1 = test.max(),y1=test.max())
fig.show()

Prediction Chart

In [11]:
model = LinearRegression()
model.fit(X_train, y_train)

x_range = np.linspace(train.min(), train.max(), 100)
y_range = model.predict(x_range.reshape(-1, 1))


fig = go.Figure([
    go.Scatter(x=X_train.squeeze(), y=y_train, name='train', mode='markers'),
    go.Scatter(x=X_test.squeeze(), y=y_test, name='test', mode='markers'),
    go.Scatter(x=x_range, y=y_range, name='prediction')
])
fig.show()

Predicting the outputs (Scores) for the test set

In [12]:
Y_pred = Regressor1.predict(X_test)
print(Y_pred)
[17.14737849 33.76624426 74.8246185  26.92318188 60.16091341]
In [13]:
df = pd.DataFrame({'Actual':y_test, 'Predicted':Y_pred})
df
Out[13]:
Actual Predicted
0 20 17.147378
1 27 33.766244
2 69 74.824618
3 30 26.923182
4 62 60.160913
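
To quantify the gap row by row, the absolute errors can be computed from the same frame; a small sketch not present in the original run (kept separate so df itself is unchanged):

abs_err = (df['Actual'] - df['Predicted']).abs()
print(abs_err)
print("Mean absolute error:", abs_err.mean())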
In [14]:
# seaborn's distplot is deprecated; kdeplot draws the same density curves
sn.kdeplot(df['Actual'], label='Actual', color='red')
sn.kdeplot(df['Predicted'], label='Predicted', color='black')
plt.legend()
plt.show()
In [15]:
sn.jointplot(x=df['Actual'], y=df['Predicted'], kind='reg')
Out[15]:
<seaborn.axisgrid.JointGrid at 0x2184208c888>

How well the linear regression model fits the training and test sets

In [16]:
print("Training Score: ", Regressor1.score(X_train, y_train))
print("Testing Score: ", Regressor1.score(X_test,y_test))
Training Score:  0.9512837351709387
Testing Score:  0.9491748734859172
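
Note that Regressor1 was fit on all 25 rows, so its test score is not a strictly held-out estimate. For comparison, this sketch (not in the original run) scores model, which was fit only on X_train:

# model was trained on X_train only, so the test score here is a true held-out estimate
print("Training Score (model): ", model.score(X_train, y_train))
print("Testing Score (model): ", model.score(X_test, y_test))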

Bar plot to observe the difference between actual and predicted scores on the test set

In [17]:
px.bar(data_frame=df,template='plotly_dark')

Evaluating the Model

In [18]:
from sklearn import metrics

print(" Mean Absolute Error: ", metrics.mean_absolute_error(y_test, Y_pred))
print(" Mean Squared Error: ",metrics.mean_squared_error(y_test,Y_pred))
print(" Root Mean Squared Error: ",np.sqrt(metrics.mean_squared_error(y_test,Y_pred)))
print(" Explained Variance Score: ",metrics.explained_variance_score(y_test,Y_pred))
 Mean Absolute Error:  4.071877793635605
 Mean Squared Error:  20.138948129940175
 Root Mean Squared Error:  4.487643939746131
 Explained Variance Score:  0.9515224335188082
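
As a cross-check on what these metrics mean, the mean absolute error can be reproduced by hand with numpy; a minimal sketch (not part of the original run):

# MAE is the average of the absolute prediction errors
manual_mae = np.mean(np.abs(y_test - Y_pred))
print("Manual MAE:", manual_mae)   # should match metrics.mean_absolute_error above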

Predicting the Score

In [19]:
hours = 9.25
pred_score = Regressor1.predict([[hours]])
print(" The Predicted Score of the Student is :",round(pred_score[0]))
 The Predicted Score of the Student is : 93.0
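
The same figure can be reproduced directly from the fitted parameters, which makes the linear model explicit; a small sketch using the Regressor1 attributes above:

# Score = slope * Hours + intercept
manual_pred = Regressor1.coef_[0] * hours + Regressor1.intercept_
print("Manual prediction for", hours, "hours of study:", round(manual_pred))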