Created
March 24, 2018 01:56
-
-
Save breeko/4b949098177862de11b1d2348f7fbdb9 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Show floats with two decimals and thousands separators.
# The fully-qualified option name is used on purpose: the bare "precision"
# pattern is ambiguous on pandas versions that also define
# "styler.format.precision" and raises OptionError there.
pd.set_option("display.precision", 2)
pd.options.display.float_format = '{:20,.2f}'.format

df = pd.read_csv("Restaurant_Grades.csv")

# Shorten the cuisine column name, then replace spaces with underscores
df.columns = df.columns.str.replace("CUISINE DESCRIPTION", "CUISINE")
df.columns = df.columns.str.replace(" ", "_")

# Clean up some of the names.
# regex=True is required on every step: without it Series.replace only swaps
# cells whose *entire* value equals the pattern, so apostrophes inside a name
# would be left untouched. The result is assigned back instead of using
# inplace=True on a column selection, which does not reliably update `df`
# (and never does under pandas Copy-on-Write).
dba = df["DBA"]
dba = dba.replace("'", "", regex=True)  # Remove apostrophe
dba = dba.replace(r" ?\(.+\)", "", regex=True)  # Remove values in parenthesis
dba = dba.replace(r" ?#.*", "", regex=True)  # Remove # followed by some string
df["DBA"] = dba

# Convert dates to datetime
df["GRADE_DATE"] = pd.to_datetime(df["GRADE_DATE"], format="%m/%d/%Y")
df["RECORD_DATE"] = pd.to_datetime(df["RECORD_DATE"], format="%m/%d/%Y")

# Create a unique key based on restaurant (name + building + street + zip)
key_columns = ["DBA", "BUILDING", "STREET", "ZIPCODE"]
df["KEY"] = df[key_columns].astype(str).apply(" ".join, axis=1)

print("num ratings: {} num unique restaurants: {}".format(len(df), df["KEY"].nunique()))
# Output recorded by the original author (before apostrophes were actually
# stripped, so the unique count may differ slightly now):
# num ratings: 186185 num unique restaurants: 24607

# Scores are broken down by critical, not critical and not applicable,
# which results in duplicate rows for the same rating
rating_columns = [
    "KEY", "DBA", "BORO", "CUISINE", "SCORE", "GRADE", "GRADE_DATE", "RECORD_DATE",
]
df = df[rating_columns].drop_duplicates()

# Group each restaurant's ratings together, most recent grade first
df = df.sort_values(["KEY", "GRADE_DATE"], ascending=[True, False])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment