Last active
February 2, 2021 21:22
-
-
Save sjtalkar/8183d0bae645d8dbff344e43303671a3 to your computer and use it in GitHub Desktop.
Amenities analysis
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def getAmenitiesTokens(cell_val):
    """Split a cleaned amenities string into individual amenity tokens.

    A token is a run of word characters, whitespace and apostrophes that is
    followed either by the ``", "`` separator or by the end of the string.
    Accepting the end of the string keeps the final amenity of a cell, which
    has no trailing separator and would otherwise be dropped.

    Args:
        cell_val: One amenities cell as a string, e.g. ``"wifi, tv, kitchen"``.

    Returns:
        A list of amenity strings in order of appearance; ``[]`` for ``""``.
    """
    # Function-scope import keeps this block self-contained. re.findall
    # returns the text of the single capturing group, i.e. the amenity
    # without its separator.
    import re

    # Raw string so the regex escapes are not parsed as Python string escapes.
    # \w already covers digits, so no separate \d is needed.
    return re.findall(r"([\w\s']+)(?:, |$)", cell_val)
# Strip the list brackets and double quotes, and turn escaped unicode
# sequences that appear as literal text in the column (e.g. \u2019) into
# plain characters. Patterns are raw strings so the regex escapes are not
# parsed as (invalid) Python string escapes.
amenities_df['amenities'] = amenities_df['amenities'].replace(
    {
        r'\[': '',
        r'\]': '',
        '"': '',
        r'\\u2019': r"'",  # literal "\u2019" text -> apostrophe
        r"\\u2013": "-",   # literal "\u2013" text -> hyphen
        r"\\u00a0": "",    # literal "\u00a0" text -> removed
    },
    regex=True,
)
# Follow the pattern! Lower case everything to make items similar
amenities_df['amenities'] = amenities_df['amenities'].str.lower()
# Apply the tokenizer
amenities_df["amenities_tokens"] = amenities_df["amenities"].apply(getAmenitiesTokens)
# Create a frequency dictionary
def addToTokenFreqDist(cell_value, tokenDist):
    """Add the tokens of one cell to a running frequency distribution.

    Args:
        cell_value: Iterable of tokens to count (one row's token list).
        tokenDist: Counter-like object exposing ``update``; it is mutated
            in place.

    Returns:
        None. The counts are accumulated in ``tokenDist`` itself.
    """
    # update() works in place and returns None, so its result is not kept.
    tokenDist.update(cell_value)
tokenDist = FreqDist()
# Count the tokens of every row. A plain loop is used because the helper
# works by mutating tokenDist and returns nothing worth collecting.
for row_tokens in amenities_df['amenities_tokens']:
    addToTokenFreqDist(row_tokens, tokenDist)
# Convert the distribution into a dataframe that can be used to plot a frequency and a word cloud.
# Building from (token, count) pairs with explicit column names also works
# when no tokens were counted, yielding an empty two-column frame.
tokenDist_df = pd.DataFrame(
    list(tokenDist.items()), columns=['amenity_token', 'freq']
)
tokenDist_df = tokenDist_df.sort_values(by="freq", ascending=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment