#import the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# read the data
df = pd.read_csv('SampleSuperstore.csv')
df.head(5)
#Check the shape
df.shape
# check for missing values
df.isnull().sum()
To answer this question, let's plot the Profit/Loss across all the states of the USA.
First, map the State column to its two-letter state code:
# map full state names to two-letter codes (note: 'District of Columbia' maps to 'DC')
state_code = {'Alabama': 'AL','Alaska': 'AK','Arizona': 'AZ','Arkansas': 'AR','California': 'CA','Colorado': 'CO','Connecticut': 'CT','Delaware': 'DE','Florida': 'FL','Georgia': 'GA','Hawaii': 'HI','Idaho': 'ID','Illinois': 'IL','Indiana': 'IN','Iowa': 'IA','Kansas': 'KS','Kentucky': 'KY','Louisiana': 'LA','Maine': 'ME','Maryland': 'MD','Massachusetts': 'MA','Michigan': 'MI','Minnesota': 'MN','Mississippi': 'MS','Missouri': 'MO','Montana': 'MT','Nebraska': 'NE','Nevada': 'NV','New Hampshire': 'NH','New Jersey': 'NJ','New Mexico': 'NM','New York': 'NY','North Carolina': 'NC','North Dakota': 'ND','Ohio': 'OH','Oklahoma': 'OK','Oregon': 'OR','Pennsylvania': 'PA','Rhode Island': 'RI','South Carolina': 'SC','South Dakota': 'SD','Tennessee': 'TN','Texas': 'TX','Utah': 'UT','Vermont': 'VT','Virginia': 'VA','District of Columbia': 'DC','Washington': 'WA','West Virginia': 'WV','Wisconsin': 'WI','Wyoming': 'WY'}
df['state_code'] = df.State.apply(lambda x: state_code[x])
#import few more libraries
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
Plotting this as a geo-spatial (choropleth) graph gives a better visualization:
state_data = df[['Sales', 'Profit', 'state_code']].groupby(['state_code']).sum()
fig = go.Figure(data=go.Choropleth(
locations=state_data.index, # Spatial coordinates
z = state_data.Profit, # Data to be color-coded
locationmode = 'USA-states', # set of locations match entries in `locations`
colorscale = [[0, 'rgb(255,0,0)'], [0.25, 'rgb(255,255,255)'], [0.45, 'rgb(124,208,247)'], [0.6, 'rgb(97,255,140)'], [1, 'rgb(8,181,0)']],
# reversescale = True,
colorbar_title = 'Profits in USD',
))
fig.update_layout(
title_text = 'Total State-Wise Profit/Loss',
geo_scope='usa', # limit map scope to the USA
height=800,
)
fig.show()
The observations from the above graph are:
States like California, New York and Washington make the highest profit, around 30K, signified by the green colour.
The blue and sky-blue colours signify states with an average profit of around 25K, mainly in the north-east and south-east regions.
Thus, the weakest areas are Texas, Pennsylvania and Ohio. A quick numeric check of the same aggregation is sketched below.
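This check simply reuses the state_data frame built above; the exact figures depend on the dataset version, so treat the states named in the comments as illustrative.
# largest and smallest total profits per state, to confirm the map reading
print(state_data['Profit'].nlargest(5))   # most profitable states
print(state_data['Profit'].nsmallest(5))  # states with the heaviest losses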
#Checking for duplicates
df.duplicated().sum()
#removing duplicates
df = df.drop_duplicates()
df.duplicated().sum()
fig, ax= plt.subplots(1, 2, figsize=(12,8))
#sns.boxplot(x=df["Sales"], ax=ax[0])
sns.boxplot(x=df["Discount"], ax=ax[0])
sns.boxplot(x=df["Quantity"], ax=ax[1])
Note: boxplots are a good way to check for outliers, but keep in mind that not every numeric column actually has outliers. In this dataset, for example, Sales and Profit do not have true outliers; they simply span a wide range of values. Quantity and Discount, on the other hand, do show outliers, as seen in the plot above.
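To see this numerically, a minimal check on the same dataframe shows that Sales and Profit merely span a wide range, while Discount and Quantity contain values far beyond their quartiles:
# summary statistics of the numeric columns used in the boxplots above
df[['Sales', 'Profit', 'Discount', 'Quantity']].describe()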
Removing outliers using the percentile (IQR) technique:
# compute the quartiles on the numeric columns only
num_cols = df.select_dtypes(include=np.number).columns
Q1 = df[num_cols].quantile(0.25)
Q3 = df[num_cols].quantile(0.75)
IQR = Q3 - Q1
# keep only rows where every numeric value lies within 1.5*IQR of the quartiles
df_new = df[~((df[num_cols] < (Q1 - 1.5 * IQR)) | (df[num_cols] > (Q3 + 1.5 * IQR))).any(axis=1)]
#new shape
df_new.shape
#After removing the Outliers
fig, ax= plt.subplots(1, 2, figsize=(12,8))
sns.boxplot(x=df_new["Discount"], ax=ax[0])
sns.boxplot(x=df_new["Quantity"], ax=ax[1])
# drop the unnecessary columns
df_new = df_new.drop(['Postal Code'], axis=1)
# re-apply the state-code mapping defined earlier to the filtered frame
df_new['state_code'] = df_new.State.map(state_code)
# for State-Wise Sales
state_data = df_new[['Sales', 'Profit', 'state_code']].groupby(['state_code']).sum()
fig = go.Figure(data=go.Choropleth(
locations=state_data.index,
z = state_data.Sales,
locationmode = 'USA-states',
colorscale = 'blues',
colorbar_title = 'Sales in USD',
))
fig.update_layout(
title_text = 'Total State-Wise Sales',
geo_scope='usa',
height=800,
)
fig.show()
The observations from the above graph are:
The company has the highest sales in the state of California (dark blue), selling around 130K worth of goods.
New York (bright blue, top right) is the state with the second-highest sales, with more than 60K worth of goods sold.
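The underlying numbers can be read directly off the aggregated frame (this reuses state_data as rebuilt from df_new above, so the exact values depend on how many outliers were removed):
# top states by total sales, to confirm the choropleth reading
state_data['Sales'].nlargest(5)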
# for State-Wise Profit/Loss
fig = go.Figure(data=go.Choropleth(
locations=state_data.index, # Spatial coordinates
z = state_data.Profit, # Data to be color-coded
locationmode = 'USA-states', # set of locations match entries in `locations`
colorscale= 'reds',
# reversescale = True,
colorbar_title = 'Profits in USD',
))
fig.update_layout(
title_text = 'Total State-Wise Profit/Loss',
geo_scope='usa', # limit map scope to the USA
height=800,
)
fig.show()
The observation from the above plot is that the state-wise profit picture looks noticeably different once the outliers have been removed. Thus, one has to be very careful while pre-processing the data. Since this is my first project on exploratory data analysis, I will come back later to what can be modified to rectify this problem.
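One rough way to quantify how much the outlier removal changed the picture is to compare the state-wise profit computed from the original frame with the filtered one; this is only a sketch using the df and df_new frames built earlier.
# total profit per state before and after outlier removal
before = df.groupby('state_code')['Profit'].sum()
after = df_new.groupby('state_code')['Profit'].sum()
comparison = pd.DataFrame({'before': before, 'after': after})
comparison['difference'] = comparison['before'] - comparison['after']
comparison.sort_values('difference', ascending=False).head(10)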
To gain more insight into Sales and Profit/Loss by Category and Region, we plot them as bar charts.
#plotting the sales and profit by Category
fig, ax = plt.subplots(1,2, figsize=(12,6))
fig.suptitle("Sales and Profit By Category", fontsize=20)
df_by_category = df.groupby("Category")[["Sales", "Profit"]].sum()
sns.barplot(x=df_by_category.index, y=df_by_category["Sales" ], ax=ax[0])
sns.barplot(x=df_by_category.index, y=df_by_category["Profit"], ax=ax[1])
fig.tight_layout()
fig.subplots_adjust(top=0.85)
From the above plot it is observed that, despite a reasonable sales figure, Furniture earns the minimum profit.
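A rough profit-margin figure per category, using the df_by_category frame from the cell above, makes the same point more directly:
# approximate profit margin (profit divided by sales) for each category
(df_by_category['Profit'] / df_by_category['Sales']).sort_values()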
#plotting the Sales and Profit by Region
df_groupby_region = df.groupby("Region", sort=True)[["Sales", "Profit"]].sum()
fig, ax = plt.subplots(1 ,2, figsize=(10,5))
ax[0].set_title("Sales by Region")
sns.barplot(x=df_groupby_region.index, y=df_groupby_region["Sales"], ax=ax[0])
ax[1].set_title("Profit by Region")
sns.barplot(x=df_groupby_region.index, y=df_groupby_region["Profit"], ax=ax[1])
plt.tight_layout()
The Western region shows the maximum Sales and Profit, while the South and Central regions lag behind, with Central showing the lowest Profit of all.
Counting the distribution of products by Category across the different Regions:
plt.figure(figsize=(10, 6))
sns.countplot(x="Region", hue="Category", data=df_new)
Across all regions, Office Supplies has the highest product count.
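The counts behind the bar chart can also be tabulated directly with a simple cross-tabulation of the same two columns:
# number of order lines per Region and Category
pd.crosstab(df_new['Region'], df_new['Category'])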
# quick syntax demo with example data (unrelated to the Superstore dataset):
# passing `hole` creates a donut-like pie chart
import plotly.graph_objects as go
labels = ['Oxygen', 'Hydrogen', 'Carbon_Dioxide', 'Nitrogen']
values = [4500, 2500, 1053, 500]
fig = go.Figure(data=[go.Pie(labels=labels, values=values, hole=.3)])
fig.show()
# split the data into profitable and loss-making rows
profit = df[df.Profit > 0]
loss = df[df.Profit < 0]
# percentage share of total profit contributed by each sub-category
profit_by_subcat = profit.groupby('Sub-Category')['Profit'].sum()
fig = go.Figure(data=[go.Pie(values=profit_by_subcat, labels=profit_by_subcat.index, hole=.3)])
fig.show()
From the above donut chart it is observed that Binders yield the highest profit, followed by Phones and Copiers.
# percentage share of the total loss by each sub-category (absolute values for the pie)
loss_by_subcat = loss.groupby('Sub-Category')['Profit'].sum().abs()
fig = go.Figure(data=[go.Pie(values=loss_by_subcat, labels=loss_by_subcat.index, hole=.3)])
fig.show()
From the loss donut it is observed that Binders, Tables and Machines show remarkable losses, while Copiers, Paper and Labels show no loss at all. This might be because they are basic office accessories.
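The "no loss" claim can be verified by listing the sub-categories that never appear in the loss split defined above (a small sanity check, not part of the original analysis):
# sub-categories with no loss-making rows at all
set(df['Sub-Category'].unique()) - set(loss['Sub-Category'].unique())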
# keep only the positive part of Profit (losses are zeroed out)
df['Profit1'] = df.Profit.apply(lambda x: x if x > 0 else 0)
fig = px.bar(df,x = 'Ship Mode', y = 'Profit1', title = 'Profit in Each Shipping Category'.upper(),
color = 'Ship Mode', hover_data=['Profit', 'Sub-Category'])
fig.update_layout(
autosize = True,
width = 800,
yaxis = dict(title='Total Profit'),
xaxis = dict(title='Shipping Class'),
)
fig.show()
# keep only the negative part of Profit (gains are zeroed out)
df['loss'] = df.Profit.apply(lambda x: x if x < 0 else 0)
fig = px.bar(df,x = 'Ship Mode', y = "loss", title = 'Losses in Each Shipping Category'.upper(),
color = 'Ship Mode', hover_data=['loss', 'Sub-Category'])
fig.update_layout(
autosize = True,
width = 800,
yaxis = dict(title='Total Loss'),
xaxis = dict(title='Shipping Class'),
)
fig.show()
fig = px.bar(df,x = 'Ship Mode', y = "Sales", title = 'Sales in Each Shipping Category'.upper(),
color = 'Ship Mode', hover_data=['Sales', 'Sub-Category'])
fig.update_layout(
autosize = True,
width = 800,
yaxis = dict(title='Total Sales'),
xaxis = dict(title='Shipping Class'),
)
fig.show()
In every one of these charts, the Standard Class shipping mode shows the highest totals and the widest variation.
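A quick aggregation by shipping mode backs this up; the orders column here is simply the number of rows per class, added for illustration:
# total sales, total profit and row count per shipping class
ship_summary = df.groupby('Ship Mode').agg(
    total_sales=('Sales', 'sum'),
    total_profit=('Profit', 'sum'),
    orders=('Ship Mode', 'size'),
)
ship_summary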
Conclusion: this is a fairly simple exploratory data analysis; many more insights could be obtained from a more detailed analysis.