Created
April 3, 2016 20:40
-
-
Save jrjames83/30e0754d80b0e0e4a60b4a119adf8f80 to your computer and use it in GitHub Desktop.
Getting term frequencies from a CSV file using Python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
from stop_words import get_stop_words
from collections import Counter


def _iter_terms(rows):
    """Yield every whitespace-separated term found in *rows*, lower-cased.

    *rows* is an iterable of CSV rows, each row being a sequence of cell
    strings. Each cell is split on whitespace, so a cell holding several
    words contributes several terms.
    """
    for row in rows:
        for cell in row:
            for term in cell.split():
                yield term.lower()


# Get the file into a list of rows.
# csv.reader needs a binary-mode file on Python 2 but a text-mode file opened
# with newline='' on Python 3. `str is bytes` is only true on Python 2.
if str is bytes:
    f = open('taw_quer.csv', 'rb')
else:
    f = open('taw_quer.csv', newline='')
with f:
    reader = csv.reader(f)
    data = list(reader)

# Flatten rows -> cells -> terms into one master list, lower-casing each term
# so that "Foo" and "foo" are counted as the same term.
lower = list(_iter_terms(data))

# Eliminate stop words. The filter runs over the lower-cased terms: the
# previous version filtered the un-lowered list, so the lower-casing step had
# no effect on the result.
# NOTE(review): assumes get_stop_words('english') returns lower-case words --
# confirm against the stop_words package.
stop_words = get_stop_words('english')
stop_word_set = set(stop_words)  # built once for O(1) membership tests
no_stop = [term for term in lower if term not in stop_word_set]

# Get term frequencies using a Counter object from the collections lib and
# print the 50 most common (term, count) pairs, one per line.
c = Counter(no_stop)
for x in c.most_common(50):
    # Single-argument print(...) prints the same thing on Python 2 and 3.
    print(x)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment