-
-
Save aguerrave/5f450010d97ab434e05c4aae2aefd87f to your computer and use it in GitHub Desktop.
A sales analysis using jupyter notebook
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# coding: utf-8 | |
# # Sales Analysis | |
# ### import necessary libraries | |
# In[1]: | |
import pandas as pd | |
import os | |
# #### Task #1: merging 12 months of sales data into a single file | |
# In[2]: | |
df = pd.read_csv("./Sales_Data/Sales_April_2019.csv") | |
files = [file for file in os.listdir('./Sales_Data')] | |
all_months_data = pd.DataFrame() | |
for file in files: | |
df = pd.read_csv("./Sales_Data/"+file) | |
all_months_data = pd.concat([all_months_data, df]) | |
all_months_data.head() | |
print(all_months_data) | |
all_months_data.to_csv("all_data.csv", index = False) | |
# # Read in the updated dataframe | |
# all_data.head(100) | |
# In[3]: | |
all_data = pd.read_csv("all_data.csv") | |
all_data.head() | |
# ### Clean up the data | |
# #### Drop rows of NaN | |
# In[4]: | |
nan_df = all_data[all_data.isna().any(axis=1)] | |
nan_df.head() | |
all_data = all_data.dropna(how = 'all') | |
all_data.head() | |
# #### Find 'Or' and delete it | |
# In[5]: | |
all_data = all_data[all_data['Order Date'].str[0:2] != 'Or'] | |
# ##### Convert columns to the correct type | |
# In[6]: | |
all_data['Quantity Ordered'] = pd.to_numeric(all_data['Quantity Ordered'])# Make int | |
all_data['Price Each'] = pd.to_numeric(all_data['Price Each']) # Make float | |
all_data.head() | |
# # Agument data with additional columns | |
# #### Task #2: Add month column | |
# In[7]: | |
all_data['Month'] = all_data["Order Date"].str[0:2] | |
all_data['Month'] = all_data['Month'].astype('int32') | |
all_data.head() | |
# #### Task #3: Add a Sales column | |
# In[8]: | |
all_data['Sales'] = all_data['Quantity Ordered'] * all_data['Price Each'] | |
all_data.head() | |
# #### Add a city column | |
# In[9]: | |
# Let's use .apply() | |
def get_city(address): | |
return address.split(',')[1] | |
def get_state(address): | |
return address.split(',')[2].split(' ')[1] | |
all_data['City'] = all_data['Purchase Address'].apply(lambda x: f"{get_city(x)} ({get_state(x)})") | |
all_data.head() | |
# ##### Q1) What was the best month for sales? How much was earned that month? | |
# In[38]: | |
results = all_data.groupby('Month').sum() | |
# In[39]: | |
import matplotlib.pyplot as plt | |
months = range(1, 13) | |
plt.bar(months, results['Sales'], color = 'crimson') | |
plt.xticks(months) | |
plt.ylabel('Sales in USD ($)') | |
plt.xlabel('Month number') | |
plt.show() | |
#December was the best month for sales | |
# ##### Q2) What city had the highest number of sales? | |
# In[41]: | |
results = all_data.groupby('City').sum() | |
results | |
# In[42]: | |
import matplotlib.pyplot as plt | |
cities = [city for city, df in all_data.groupby('City')] | |
plt.bar(cities, results['Sales'], color = 'dodgerblue') | |
plt.xticks(cities, rotation = "vertical", size='8') | |
plt.ylabel('Sales in USD ($)') | |
plt.xlabel('City Name') | |
plt.show() | |
#Answer: San Francisco(CA) | |
# ##### Q3) What time should we display advertisements to maximize likelihood of customers buying product? | |
# In[14]: | |
all_data['Order Date'] = pd.to_datetime(all_data['Order Date']) | |
all_data['Hour'] = all_data['Order Date'].dt.hour | |
all_data['Minute'] = all_data['Order Date'].dt.minute | |
# In[15]: | |
all_data.head() | |
# In[25]: | |
hours = [hour for hour, df in all_data.groupby('Hour')] | |
plt.plot(hours, all_data.groupby(['Hour']).count(), color = 'darkorange') | |
plt.xticks(hours) | |
plt.grid() | |
plt.xlabel("Time (in hours)") | |
plt.ylabel('Number of Orders') | |
all_data.groupby(['Hour']).count() | |
plt.show() | |
#Answer: Around 12pm (12) and/or 7pm (19) | |
# ##### Q4) What products are most often sold together? | |
# In[43]: | |
df = all_data[all_data['Order ID'].duplicated(keep = False)] | |
df['Grouped'] = df.groupby('Order ID')['Product'].transform(lambda x: ',' .join(x)) | |
df = df[['Order ID', 'Grouped']].drop_duplicates() | |
df.head() | |
# In[18]: | |
from itertools import combinations | |
from collections import Counter | |
count = Counter() | |
for row in df['Grouped']: | |
row_list = row.split(',') | |
count.update(Counter(combinations(row_list, 3))) | |
for key, value in count.most_common(10): | |
print(key, value) | |
# ##### Q5) What product sold the most? Why do you think it sold the most? | |
# In[19]: | |
all_data.head() | |
# In[30]: | |
product_group = all_data.groupby('Product') | |
quantity_ordered = product_group.sum()['Quantity Ordered'] | |
products = [product for product, df in product_group] | |
plt.bar(products, quantity_ordered, color = 'darkmagenta') | |
plt.xticks(products, rotation = "vertical", size='8') | |
plt.xlabel('Products Sold') | |
plt.ylabel('Ordered Quantity') | |
plt.show() | |
# In[50]: | |
prices = all_data.groupby('Product').mean()['Price Each'] | |
fig, ax1 = plt.subplots() | |
ax2 = ax1.twinx() | |
ax1.bar(products, quantity_ordered, color='green') | |
ax2.plot(products, prices, 'b-', color = 'blue') | |
ax1.set_xlabel('Product Sold') | |
ax1.set_ylabel('Quantity Ordered', color='g') | |
ax2.set_ylabel('Price in USD ($)', color='b') | |
ax1.set_xticklabels(products, rotation = "vertical", size='8') | |
plt.show() | |
# ###### Answer: i) According to our hypothesis, if the quantity ordered is high, the price should be low (which means AAA battery | |
# ###### pack is sold the most because it is cheap.) As observed from the graph above, LG dryer and LG washing machine are | |
# ###### sold in the least amount because of the high price. | |
# ###### ii) AAA battery pack have a wide scale application and can be used in multiple different products, which makes it more sellable. | |
# In[ ]: | |
#FIN | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment