Created
May 13, 2020 12:18
-
-
Save gauravbansal98/4403abb892e7f7a7d17d0e11ac66527d to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import string | |
def clean_descriptions(descriptions):
    """Normalize every description string in ``descriptions`` in place.

    Each description is split on whitespace and every token is lower-cased
    and stripped of ASCII punctuation (``string.punctuation``).  Tokens that
    end up one character or shorter (e.g. a dangling "s" or "a"), or that
    contain anything other than alphabetic characters (e.g. digits), are
    dropped.  The surviving tokens are joined back with single spaces.

    Args:
        descriptions: Mapping whose values are mutable lists of description
            strings.  The lists are modified in place; the keys are not
            inspected.

    Returns:
        None.  The cleaned text replaces the original entries of each list.
    """
    # Build the punctuation-deleting translation table once, not per token.
    table = str.maketrans('', '', string.punctuation)
    for desc_list in descriptions.values():
        for i, desc in enumerate(desc_list):
            # Lower-case first, then strip punctuation, in a single lazy pass.
            tokens = (word.lower().translate(table) for word in desc.split())
            # Replacing an element by index does not change the list length,
            # so it is safe to do while enumerating the same list.
            desc_list[i] = ' '.join(
                token for token in tokens
                if len(token) > 1 and token.isalpha()
            )
| # Clean every description string in place: lower-case, strip punctuation, drop one-character and non-alphabetic tokens. NOTE(review): `descriptions` is not defined in this snippet - presumably built earlier in the script; confirm. | |
| clean_descriptions(descriptions) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment