Skip to content

Instantly share code, notes, and snippets.

@ShambhaviPataskar
Created September 8, 2021 14:50
Show Gist options
  • Save ShambhaviPataskar/24d2eac4453a62808ec83a5c2768afe4 to your computer and use it in GitHub Desktop.
Save ShambhaviPataskar/24d2eac4453a62808ec83a5c2768afe4 to your computer and use it in GitHub Desktop.
A sales analysis using jupyter notebook
#!/usr/bin/env python
# coding: utf-8
# # Sales Analysis
# ### import necessary libraries
# In[1]:
import pandas as pd
import os
# #### Task #1: merging 12 months of sales data into a single file
# In[2]:
df = pd.read_csv("./Sales_Data/Sales_April_2019.csv")
files = [file for file in os.listdir('./Sales_Data')]
all_months_data = pd.DataFrame()
for file in files:
df = pd.read_csv("./Sales_Data/"+file)
all_months_data = pd.concat([all_months_data, df])
all_months_data.head()
print(all_months_data)
all_months_data.to_csv("all_data.csv", index = False)
# # Read in the updated dataframe
# all_data.head(100)
# In[3]:
all_data = pd.read_csv("all_data.csv")
all_data.head()
# ### Clean up the data
# #### Drop rows of NaN
# In[4]:
nan_df = all_data[all_data.isna().any(axis=1)]
nan_df.head()
all_data = all_data.dropna(how = 'all')
all_data.head()
# #### Find 'Or' and delete it
# In[5]:
all_data = all_data[all_data['Order Date'].str[0:2] != 'Or']
# ##### Convert columns to the correct type
# In[6]:
all_data['Quantity Ordered'] = pd.to_numeric(all_data['Quantity Ordered'])# Make int
all_data['Price Each'] = pd.to_numeric(all_data['Price Each']) # Make float
all_data.head()
# # Agument data with additional columns
# #### Task #2: Add month column
# In[7]:
all_data['Month'] = all_data["Order Date"].str[0:2]
all_data['Month'] = all_data['Month'].astype('int32')
all_data.head()
# #### Task #3: Add a Sales column
# In[8]:
all_data['Sales'] = all_data['Quantity Ordered'] * all_data['Price Each']
all_data.head()
# #### Add a city column
# In[9]:
# Let's use .apply()
def get_city(address):
return address.split(',')[1]
def get_state(address):
return address.split(',')[2].split(' ')[1]
all_data['City'] = all_data['Purchase Address'].apply(lambda x: f"{get_city(x)} ({get_state(x)})")
all_data.head()
# ##### Q1) What was the best month for sales? How much was earned that month?
# In[38]:
results = all_data.groupby('Month').sum()
# In[39]:
import matplotlib.pyplot as plt
months = range(1, 13)
plt.bar(months, results['Sales'], color = 'crimson')
plt.xticks(months)
plt.ylabel('Sales in USD ($)')
plt.xlabel('Month number')
plt.show()
#December was the best month for sales
# ##### Q2) What city had the highest number of sales?
# In[41]:
results = all_data.groupby('City').sum()
results
# In[42]:
import matplotlib.pyplot as plt
cities = [city for city, df in all_data.groupby('City')]
plt.bar(cities, results['Sales'], color = 'dodgerblue')
plt.xticks(cities, rotation = "vertical", size='8')
plt.ylabel('Sales in USD ($)')
plt.xlabel('City Name')
plt.show()
#Answer: San Francisco(CA)
# ##### Q3) What time should we display advertisements to maximize likelihood of customers buying product?
# In[14]:
all_data['Order Date'] = pd.to_datetime(all_data['Order Date'])
all_data['Hour'] = all_data['Order Date'].dt.hour
all_data['Minute'] = all_data['Order Date'].dt.minute
# In[15]:
all_data.head()
# In[25]:
hours = [hour for hour, df in all_data.groupby('Hour')]
plt.plot(hours, all_data.groupby(['Hour']).count(), color = 'darkorange')
plt.xticks(hours)
plt.grid()
plt.xlabel("Time (in hours)")
plt.ylabel('Number of Orders')
all_data.groupby(['Hour']).count()
plt.show()
#Answer: Around 12pm (12) and/or 7pm (19)
# ##### Q4) What products are most often sold together?
# In[43]:
df = all_data[all_data['Order ID'].duplicated(keep = False)]
df['Grouped'] = df.groupby('Order ID')['Product'].transform(lambda x: ',' .join(x))
df = df[['Order ID', 'Grouped']].drop_duplicates()
df.head()
# In[18]:
from itertools import combinations
from collections import Counter
count = Counter()
for row in df['Grouped']:
row_list = row.split(',')
count.update(Counter(combinations(row_list, 3)))
for key, value in count.most_common(10):
print(key, value)
# ##### Q5) What product sold the most? Why do you think it sold the most?
# In[19]:
all_data.head()
# In[30]:
product_group = all_data.groupby('Product')
quantity_ordered = product_group.sum()['Quantity Ordered']
products = [product for product, df in product_group]
plt.bar(products, quantity_ordered, color = 'darkmagenta')
plt.xticks(products, rotation = "vertical", size='8')
plt.xlabel('Products Sold')
plt.ylabel('Ordered Quantity')
plt.show()
# In[50]:
prices = all_data.groupby('Product').mean()['Price Each']
fig, ax1 = plt.subplots()
ax2 = ax1.twinx()
ax1.bar(products, quantity_ordered, color='green')
ax2.plot(products, prices, 'b-', color = 'blue')
ax1.set_xlabel('Product Sold')
ax1.set_ylabel('Quantity Ordered', color='g')
ax2.set_ylabel('Price in USD ($)', color='b')
ax1.set_xticklabels(products, rotation = "vertical", size='8')
plt.show()
# ###### Answer: i) According to our hypothesis, if the quantity ordered is high, the price should be low (which means AAA battery
# ###### pack is sold the most because it is cheap.) As observed from the graph above, LG dryer and LG washing machine are
# ###### sold in the least amount because of the high price.
# ###### ii) AAA battery pack have a wide scale application and can be used in multiple different products, which makes it more sellable.
# In[ ]:
#FIN
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment