ShambhaviPataskar · September 8, 2021 14:50
diff --git a/Analysis.py b/Analysis.py
 #!/usr/bin/env python
 # coding: utf-8

 # # Sales Analysis

 # ### import necessary libraries

 # In[1]:


 import pandas as pd
 import os


 # #### Task #1: merging 12 months of sales data into a single file

 # In[2]:


 # Combine every monthly CSV in ./Sales_Data into one frame and save it as
 # all_data.csv.
 # Only *.csv entries are read, so other directory entries cannot break the
 # merge; names are sorted so the row order does not depend on the order in
 # which os.listdir happens to return them.
 sales_dir = "./Sales_Data"
 files = sorted(file for file in os.listdir(sales_dir) if file.endswith(".csv"))

 # Read each file once and concatenate in a single call; growing the frame
 # inside a loop copies all previously read rows on every iteration.
 monthly_frames = [pd.read_csv(os.path.join(sales_dir, file)) for file in files]

 # pd.concat raises on an empty list, so fall back to an empty frame.
 all_months_data = pd.concat(monthly_frames) if monthly_frames else pd.DataFrame()

 all_months_data.head()
 print(all_months_data)

 all_months_data.to_csv("all_data.csv", index = False)


 # # Read in the updated dataframe

 # all_data.head(100)

 # In[3]:


 # Read the merged sales data back from all_data.csv.
 all_data = pd.read_csv("all_data.csv")
 all_data.head()


 # ### Clean up the data

 # #### Drop rows of NaN

 # In[4]:


 # Rows with at least one missing value, kept in nan_df for inspection.
 has_missing = all_data.isna().any(axis=1)
 nan_df = all_data[has_missing]
 nan_df.head()

 # Discard only the rows in which every column is missing.
 all_data = all_data.dropna(how='all')
 all_data.head()


 # #### Find 'Or' and delete it

 # In[5]:


 # Drop rows whose 'Order Date' text starts with 'Or' (presumably repeated
 # header rows inside the data - confirm against the CSV files).
 # .copy() makes the filtered frame independent, so the column assignments
 # below do not raise SettingWithCopyWarning.
 all_data = all_data[all_data['Order Date'].str[0:2] != 'Or'].copy()


 # ##### Convert columns to the correct type

 # In[6]:


 all_data['Quantity Ordered'] = pd.to_numeric(all_data['Quantity Ordered'])  # Make int
 all_data['Price Each'] = pd.to_numeric(all_data['Price Each'])  # Make float

 all_data.head()


 # # Augment data with additional columns

 # #### Task #2: Add month column

 # In[7]:


 # The first two characters of 'Order Date' are taken as the month number.
 month_text = all_data["Order Date"].str[0:2]
 all_data['Month'] = month_text.astype('int32')
 all_data.head()


 # #### Task #3: Add a Sales column

 # In[8]:


 # Revenue of each row: quantity ordered times the unit price.
 all_data['Sales'] = all_data['Quantity Ordered'].mul(all_data['Price Each'])
 all_data.head()


 # #### Add a city column

 # In[9]:


 # Let's use .apply()
 def get_city(address):
     """Return the city from a comma-separated address string.

     The city is the text between the first and second comma. Surrounding
     whitespace is stripped so the result does not start with the space
     that follows the comma.

     Raises IndexError if the address contains no comma.
     """
     return address.split(',')[1].strip()

 def get_state(address):
     """Return the state code from a comma-separated address string.

     The third comma-separated segment is split on single spaces and the
     item at index 1 is returned.
     """
     state_and_zip = address.split(',')[2]
     # Splitting on ' ' yields an empty first item when the segment starts
     # with a space, which is why index 1 (not 0) holds the state.
     pieces = state_and_zip.split(' ')
     return pieces[1]

 # Label every row as "<city> (<state>)", built from its purchase address.
 purchase_addresses = all_data['Purchase Address']
 all_data['City'] = purchase_addresses.apply(
     lambda address: f"{get_city(address)} ({get_state(address)})"
 )

 all_data.head()


 # ##### Q1) What was the best month for sales? How much was earned that month?

 # In[38]:


 # Total every numeric column per month. numeric_only=True keeps the text
 # columns (product names, addresses, ...) out of the aggregation.
 results = all_data.groupby('Month').sum(numeric_only=True)


 # In[39]:


 import matplotlib.pyplot as plt

 # Use the months actually present in the grouped result as x positions, so
 # the bars always line up with results['Sales'] even if a month is missing.
 months = results.index

 plt.bar(months, results['Sales'], color = 'crimson')

 plt.xticks(months)
 plt.ylabel('Sales in USD ($)')
 plt.xlabel('Month number')

 plt.show()

 #December was the best month for sales


 # ##### Q2) What city had the highest number of sales?

 # In[41]:


 # Total every numeric column per city; text columns are left out.
 results = all_data.groupby('City').sum(numeric_only=True)
 results


 # In[42]:


 import matplotlib.pyplot as plt

 # Take the labels from the grouped result itself: they are guaranteed to be
 # in the same order as results['Sales'], and no second groupby is needed.
 cities = list(results.index)

 plt.bar(cities, results['Sales'], color = 'dodgerblue')

 plt.xticks(cities, rotation = "vertical", size=8)
 plt.ylabel('Sales in USD ($)')
 plt.xlabel('City Name')

 plt.show()

 #Answer: San Francisco(CA)


 # ##### Q3) What time should we display advertisements to maximize likelihood of customers buying product?

 # In[14]:


 # Parse the order timestamps once and derive the time-of-day columns from
 # the parsed values.
 order_timestamps = pd.to_datetime(all_data['Order Date'])
 all_data['Order Date'] = order_timestamps
 all_data['Hour'] = order_timestamps.dt.hour
 all_data['Minute'] = order_timestamps.dt.minute


 # In[15]:


 all_data.head()


 # In[25]:


 # Number of order rows in each hour of the day. size() gives one value per
 # hour; passing the whole count() frame to plt.plot would draw a separate
 # line for every column.
 orders_per_hour = all_data.groupby('Hour').size()
 hours = list(orders_per_hour.index)

 plt.plot(hours, orders_per_hour, color = 'darkorange')
 plt.xticks(hours)
 plt.grid()
 plt.xlabel("Time (in hours)")
 plt.ylabel('Number of Orders')

 plt.show()

 #Answer: Around 12pm (12) and/or 7pm (19)


 # ##### Q4) What products are most often sold together?

 # In[43]:


 # Keep only rows whose Order ID occurs more than once, i.e. orders made up
 # of several product rows. .copy() makes df independent of all_data, so
 # adding the 'Grouped' column does not raise SettingWithCopyWarning.
 df = all_data[all_data['Order ID'].duplicated(keep = False)].copy()
 # Every row of an order receives the comma-joined list of that order's products.
 df['Grouped'] = df.groupby('Order ID')['Product'].transform(lambda products: ','.join(products))
 # After joining, one row per order is enough.
 df = df[['Order ID', 'Grouped']].drop_duplicates()

 df.head()


 # In[18]:


 from itertools import combinations
 from collections import Counter

 # Number of products in each counted combination.
 # NOTE(review): 3 counts product triples; use 2 to count pairs - confirm
 # which one the question is meant to answer.
 group_size = 3

 count = Counter()

 for row in df['Grouped']:
     # Sort the products so the same set always yields the same key,
     # regardless of the order in which the rows appeared in the order.
     row_list = sorted(row.split(','))
     # Counter.update accepts any iterable of keys; no temporary Counter needed.
     count.update(combinations(row_list, group_size))

 for key, value in count.most_common(10):
     print(key, value)


 # ##### Q5) What product sold the most? Why do you think it sold the most?

 # In[19]:


 all_data.head()


 # In[30]:


 product_group = all_data.groupby('Product')

 # Sum only the quantity column. Summing the whole frame would also try to
 # add up the datetime 'Order Date' column, which recent pandas versions
 # reject with a TypeError.
 quantity_ordered = product_group['Quantity Ordered'].sum()

 # Labels come from the result index so they always match the bar heights.
 products = list(quantity_ordered.index)
 plt.bar(products, quantity_ordered, color = 'darkmagenta')

 plt.xticks(products, rotation = "vertical", size=8)
 plt.xlabel('Products Sold')
 plt.ylabel('Ordered Quantity')

 plt.show()


 # In[50]:


 # Mean unit price per product. Selecting the column before aggregating
 # avoids averaging the non-numeric columns, which recent pandas versions
 # reject with a TypeError.
 prices = all_data.groupby('Product')['Price Each'].mean()


 fig, ax1 = plt.subplots()

 # Second y-axis sharing the same x-axis: quantities as bars on the left
 # axis, prices as a line on the right axis.
 ax2 = ax1.twinx()
 ax1.bar(products, quantity_ordered, color='green')
 # One colour specification only; a 'b-' format string together with
 # color= defines the colour twice and makes matplotlib warn.
 ax2.plot(products, prices, color='blue', linestyle='-')

 ax1.set_xlabel('Product Sold')
 ax1.set_ylabel('Quantity Ordered', color='g')
 ax2.set_ylabel('Price in USD ($)', color='b')
 # Rotate and shrink the existing tick labels instead of replacing them
 # with set_xticklabels, which expects fixed tick positions to be set first.
 ax1.tick_params(axis='x', labelrotation=90, labelsize=8)


 plt.show()


 # ###### Answer: i) Our hypothesis is that cheaper products are ordered in larger quantities (so the AAA battery pack
 # ###### sells the most because it is cheap). The graph above is consistent with this: the LG dryer and the LG washing
 # ###### machine, which have high prices, are sold in the smallest quantities.
 # ######              ii) AAA battery packs have a wide range of applications and can be used in many different products, which makes them sell more.

 # In[ ]:


 #FIN
	#!/usr/bin/env python
	# coding: utf-8

	# # Sales Analysis

	# ### import necessary libraries

	# In[1]:


	import pandas as pd
	import os


	# #### Task #1: merging 12 months of sales data into a single file

	# In[2]:


	# Combine every monthly CSV in ./Sales_Data into one frame and save it as
	# all_data.csv.
	# Only *.csv entries are read, so other directory entries cannot break the
	# merge; names are sorted so the row order does not depend on the order in
	# which os.listdir happens to return them.
	sales_dir = "./Sales_Data"
	files = sorted(file for file in os.listdir(sales_dir) if file.endswith(".csv"))

	# Read each file once and concatenate in a single call; growing the frame
	# inside a loop copies all previously read rows on every iteration.
	monthly_frames = [pd.read_csv(os.path.join(sales_dir, file)) for file in files]

	# pd.concat raises on an empty list, so fall back to an empty frame.
	all_months_data = pd.concat(monthly_frames) if monthly_frames else pd.DataFrame()

	all_months_data.head()
	print(all_months_data)

	all_months_data.to_csv("all_data.csv", index = False)


	# # Read in the updated dataframe

	# all_data.head(100)

	# In[3]:


	# Read the merged sales data back from all_data.csv.
	all_data = pd.read_csv("all_data.csv")
	all_data.head()


	# ### Clean up the data

	# #### Drop rows of NaN

	# In[4]:


	# Rows with at least one missing value, kept in nan_df for inspection.
	has_missing = all_data.isna().any(axis=1)
	nan_df = all_data[has_missing]
	nan_df.head()

	# Discard only the rows in which every column is missing.
	all_data = all_data.dropna(how='all')
	all_data.head()


	# #### Find 'Or' and delete it

	# In[5]:


	# Drop rows whose 'Order Date' text starts with 'Or' (presumably repeated
	# header rows inside the data - confirm against the CSV files).
	# .copy() makes the filtered frame independent, so the column assignments
	# below do not raise SettingWithCopyWarning.
	all_data = all_data[all_data['Order Date'].str[0:2] != 'Or'].copy()


	# ##### Convert columns to the correct type

	# In[6]:


	all_data['Quantity Ordered'] = pd.to_numeric(all_data['Quantity Ordered'])  # Make int
	all_data['Price Each'] = pd.to_numeric(all_data['Price Each'])  # Make float

	all_data.head()


	# # Augment data with additional columns

	# #### Task #2: Add month column

	# In[7]:


	# The first two characters of 'Order Date' are taken as the month number.
	month_text = all_data["Order Date"].str[0:2]
	all_data['Month'] = month_text.astype('int32')
	all_data.head()


	# #### Task #3: Add a Sales column

	# In[8]:


	# Revenue of each row: quantity ordered times the unit price.
	all_data['Sales'] = all_data['Quantity Ordered'].mul(all_data['Price Each'])
	all_data.head()


	# #### Add a city column

	# In[9]:


	# Let's use .apply()
	def get_city(address):
	    """Return the city from a comma-separated address string.

	    The city is the text between the first and second comma. Surrounding
	    whitespace is stripped so the result does not start with the space
	    that follows the comma.

	    Raises IndexError if the address contains no comma.
	    """
	    return address.split(',')[1].strip()

	def get_state(address):
	    """Return the state code from a comma-separated address string.

	    The third comma-separated segment is split on single spaces and the
	    item at index 1 is returned.
	    """
	    # The function body must be indented under the def line to be valid Python.
	    state_and_zip = address.split(',')[2]
	    # Splitting on ' ' yields an empty first item when the segment starts
	    # with a space, which is why index 1 (not 0) holds the state.
	    return state_and_zip.split(' ')[1]

	# Label every row as "<city> (<state>)", built from its purchase address.
	purchase_addresses = all_data['Purchase Address']
	all_data['City'] = purchase_addresses.apply(
	    lambda address: f"{get_city(address)} ({get_state(address)})"
	)

	all_data.head()


	# ##### Q1) What was the best month for sales? How much was earned that month?

	# In[38]:


	# Total every numeric column per month. numeric_only=True keeps the text
	# columns (product names, addresses, ...) out of the aggregation.
	results = all_data.groupby('Month').sum(numeric_only=True)


	# In[39]:


	import matplotlib.pyplot as plt

	# Use the months actually present in the grouped result as x positions, so
	# the bars always line up with results['Sales'] even if a month is missing.
	months = results.index

	plt.bar(months, results['Sales'], color = 'crimson')

	plt.xticks(months)
	plt.ylabel('Sales in USD ($)')
	plt.xlabel('Month number')

	plt.show()

	#December was the best month for sales


	# ##### Q2) What city had the highest number of sales?

	# In[41]:


	# Total every numeric column per city; text columns are left out.
	results = all_data.groupby('City').sum(numeric_only=True)
	results


	# In[42]:


	import matplotlib.pyplot as plt

	# Take the labels from the grouped result itself: they are guaranteed to be
	# in the same order as results['Sales'], and no second groupby is needed.
	cities = list(results.index)

	plt.bar(cities, results['Sales'], color = 'dodgerblue')

	plt.xticks(cities, rotation = "vertical", size=8)
	plt.ylabel('Sales in USD ($)')
	plt.xlabel('City Name')

	plt.show()

	#Answer: San Francisco(CA)


	# ##### Q3) What time should we display advertisements to maximize likelihood of customers buying product?

	# In[14]:


	# Parse the order timestamps once and derive the time-of-day columns from
	# the parsed values.
	order_timestamps = pd.to_datetime(all_data['Order Date'])
	all_data['Order Date'] = order_timestamps
	all_data['Hour'] = order_timestamps.dt.hour
	all_data['Minute'] = order_timestamps.dt.minute


	# In[15]:


	all_data.head()


	# In[25]:


	# Number of order rows in each hour of the day. size() gives one value per
	# hour; passing the whole count() frame to plt.plot would draw a separate
	# line for every column.
	orders_per_hour = all_data.groupby('Hour').size()
	hours = list(orders_per_hour.index)

	plt.plot(hours, orders_per_hour, color = 'darkorange')
	plt.xticks(hours)
	plt.grid()
	plt.xlabel("Time (in hours)")
	plt.ylabel('Number of Orders')

	plt.show()

	#Answer: Around 12pm (12) and/or 7pm (19)


	# ##### Q4) What products are most often sold together?

	# In[43]:


	# Keep only rows whose Order ID occurs more than once, i.e. orders made up
	# of several product rows. .copy() makes df independent of all_data, so
	# adding the 'Grouped' column does not raise SettingWithCopyWarning.
	df = all_data[all_data['Order ID'].duplicated(keep = False)].copy()
	# Every row of an order receives the comma-joined list of that order's products.
	df['Grouped'] = df.groupby('Order ID')['Product'].transform(lambda products: ','.join(products))
	# After joining, one row per order is enough.
	df = df[['Order ID', 'Grouped']].drop_duplicates()

	df.head()


	# In[18]:


	from itertools import combinations
	from collections import Counter

	# Number of products in each counted combination.
	# NOTE(review): 3 counts product triples; use 2 to count pairs - confirm
	# which one the question is meant to answer.
	group_size = 3

	count = Counter()

	# Loop bodies must be indented under their for line to be valid Python.
	for row in df['Grouped']:
	    # Sort the products so the same set always yields the same key,
	    # regardless of the order in which the rows appeared in the order.
	    row_list = sorted(row.split(','))
	    # Counter.update accepts any iterable of keys; no temporary Counter needed.
	    count.update(combinations(row_list, group_size))

	for key, value in count.most_common(10):
	    print(key, value)


	# ##### Q5) What product sold the most? Why do you think it sold the most?

	# In[19]:


	all_data.head()


	# In[30]:


	product_group = all_data.groupby('Product')

	# Sum only the quantity column. Summing the whole frame would also try to
	# add up the datetime 'Order Date' column, which recent pandas versions
	# reject with a TypeError.
	quantity_ordered = product_group['Quantity Ordered'].sum()

	# Labels come from the result index so they always match the bar heights.
	products = list(quantity_ordered.index)
	plt.bar(products, quantity_ordered, color = 'darkmagenta')

	plt.xticks(products, rotation = "vertical", size=8)
	plt.xlabel('Products Sold')
	plt.ylabel('Ordered Quantity')

	plt.show()


	# In[50]:


	# Mean unit price per product. Selecting the column before aggregating
	# avoids averaging the non-numeric columns, which recent pandas versions
	# reject with a TypeError.
	prices = all_data.groupby('Product')['Price Each'].mean()


	fig, ax1 = plt.subplots()

	# Second y-axis sharing the same x-axis: quantities as bars on the left
	# axis, prices as a line on the right axis.
	ax2 = ax1.twinx()
	ax1.bar(products, quantity_ordered, color='green')
	# One colour specification only; a 'b-' format string together with
	# color= defines the colour twice and makes matplotlib warn.
	ax2.plot(products, prices, color='blue', linestyle='-')

	ax1.set_xlabel('Product Sold')
	ax1.set_ylabel('Quantity Ordered', color='g')
	ax2.set_ylabel('Price in USD ($)', color='b')
	# Rotate and shrink the existing tick labels instead of replacing them
	# with set_xticklabels, which expects fixed tick positions to be set first.
	ax1.tick_params(axis='x', labelrotation=90, labelsize=8)


	plt.show()


	# ###### Answer: i) Our hypothesis is that cheaper products are ordered in larger quantities (so the AAA battery pack
	# ###### sells the most because it is cheap). The graph above is consistent with this: the LG dryer and the LG washing
	# ###### machine, which have high prices, are sold in the smallest quantities.
	# ###### ii) AAA battery packs have a wide range of applications and can be used in many different products, which makes them sell more.

	# In[ ]:


	#FIN