Skip to content

Instantly share code, notes, and snippets.

@khannasarthak
Last active December 18, 2018 19:10
Show Gist options
  • Save khannasarthak/ab8ca4a21dec69da892cb5ff49477b20 to your computer and use it in GitHub Desktop.
Save khannasarthak/ab8ca4a21dec69da892cb5ff49477b20 to your computer and use it in GitHub Desktop.
scripts for market dial
import pandas as pd
pd.options.display.float_format = '{:.2f}'.format # Setting float precision
# Reading CSV's and parsing dates
sales = pd.read_csv('take_home_sales.csv', sep='|',index_col=False,parse_dates=['timestamp'])
storeStatus = pd.read_csv('take_home_store_status.csv', sep='|',index_col=False)
stores = pd.read_csv('take_home_stores.csv', sep='|',index_col=False)
def cleanCSV():
# Product ID cleaning: Removing IDs with not alpha numeric characters
product_idCleaning = sales[sales.product_id.str.isalnum()] #filtering without alnum
#Remove the closed stores from all stores
toDeleteStores = stores[stores.tags == 'ZF'] # List of store to delete
cleaneddf= product_idCleaning[~product_idCleaning['store_id'].isin(toDeleteStores['store_id'])] #Deleted stores
return cleaneddf
def sumCSV(cleaneddf):
# Removing '-' from some rows in revenue column, assuming that the '-' at the end was an error.
# Summing the revenue after removing '-'
cleaneddf = cleaneddf.copy() # To avoid the SettingWithCopyWarning issues
cleaneddf['revenue']=cleaneddf['revenue'].astype(str).str.rstrip('-')
# Converting revenue column to float to sum.
cleaneddf.revenue = cleaneddf.revenue.astype(float)
# Calculating total
total = cleaneddf['revenue'].sum()
print ('*** TOTAL SUM: ',total)
# Store level total
storeLevelSum = cleaneddf.groupby(['store_id'])['revenue'].sum().reset_index(name='store_sum') # grouping by store_id
print ('*** STORE LEVEL SUMS (First 5 stores, to see all, refer generated csv.')
print (storeLevelSum.head())
print ('TOTAL STORE LEVEL SUM: ',storeLevelSum.store_sum.sum())
# Writing store level sum to a CSV
storeLevelSum.to_csv('storeLevelSum.csv',index = False)
# Date Level Total
dateLevelSum = (cleaneddf.reset_index().set_index("timestamp").groupby([pd.Grouper(freq='W-THU',closed='left',label='left')])["revenue"].sum().reset_index())
# Reference 1: https://stackoverflow.com/questions/46562401/group-python-pandas-dataframe-per-weeks-starting-on-monday
# http://pandas.pydata.org/pandas-docs/stable/timeseries.html#anchored-offsets
# label: includes the week on the start, left includes the starting week, right includes the ending week
# closed: whether to include given day or not. left includes, right doesnt
dateLevelSum['timestamp']= dateLevelSum['timestamp'].dt.strftime('%m/%d/%Y') # Converint to date format as asked
print ('*** DATE LEVEL SUMS')
print (dateLevelSum)
print ('TOTAL DATE LEVEL SUM: ',dateLevelSum.revenue.sum())
# Writing date level sum to a CSV
dateLevelSum.to_csv('dateLevelSum.csv',index = False)
sumCSV(cleanCSV())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment