Skip to content

Instantly share code, notes, and snippets.

@sjtalkar
Last active February 2, 2021 21:22
Show Gist options
  • Save sjtalkar/8183d0bae645d8dbff344e43303671a3 to your computer and use it in GitHub Desktop.
Save sjtalkar/8183d0bae645d8dbff344e43303671a3 to your computer and use it in GitHub Desktop.
Amenities analysis
def getAmenitiesTokens(cell_val):
    """Split a cleaned, comma-separated amenities string into tokens.

    A token is a run of word characters, whitespace and apostrophes that is
    followed either by the ``", "`` separator or by the end of the text.
    Accepting end-of-text as a terminator keeps the final amenity, which has
    no trailing separator and would otherwise be dropped.

    Args:
        cell_val: Amenities string, e.g. ``"wifi, kitchen, heating"``.

    Returns:
        List of amenity strings in order of appearance, e.g.
        ``["wifi", "kitchen", "heating"]``; an empty list for ``""``.
    """
    import re  # local import: the file's import block is not visible here

    # findall returns only the captured group, so the ", " separator is
    # consumed but never becomes part of a token. The flags are the defaults
    # nltk's regexp_tokenize applies, which this call replaces one-for-one.
    return re.findall(
        r"([\w\s']+)(?:, |$)",
        cell_val,
        re.UNICODE | re.MULTILINE | re.DOTALL,
    )
# Clear out unnecessary square brackets and double quotes, and replace
# unicode escapes that appear as literal text (a backslash followed by
# u2019 / u2013 / u00a0) with plain equivalents.
# Raw strings keep the regex escapes from being read as (invalid) Python
# string escapes.
amenities_df['amenities'] = amenities_df['amenities'].replace(
    {
        r'\[': '',
        r'\]': '',
        '"': '',
        r'\\u2019': "'",
        r'\\u2013': '-',
        r'\\u00a0': '',
    },
    regex=True,
)
# Follow the pattern! Lower case everything to make items similar
amenities_df['amenities'] = amenities_df['amenities'].str.lower()
# Apply the tokenizer to every row
amenities_df["amenities_tokens"] = amenities_df["amenities"].apply(getAmenitiesTokens)
#Create a frequency dictionary
def addToTokenFreqDist(cell_value, tokenDist):
    """Add one row's tokens to a running frequency distribution.

    Args:
        cell_value: Iterable of tokens for a single row.
        tokenDist: Counter-like object exposing ``update`` (the caller in
            this file passes an nltk ``FreqDist``). It is mutated in place.

    Returns:
        None; the counts are accumulated on ``tokenDist`` itself.
    """
    # update() mutates in place and returns None, so its result must not be
    # assigned back to the parameter.
    tokenDist.update(cell_value)
# Accumulate the token counts of every row into a single distribution.
tokenDist = FreqDist()
for row_tokens in amenities_df['amenities_tokens']:
    addToTokenFreqDist(row_tokens, tokenDist)

# Turn the distribution into a two-column frame (token, frequency), ordered
# from most to least frequent, that can feed a frequency plot or word cloud.
tokenDist_df = pd.DataFrame.from_dict(tokenDist, orient='index').reset_index()
tokenDist_df.columns = ['amenity_token', 'freq']
tokenDist_df = tokenDist_df.sort_values(by="freq", ascending=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment