Created
January 27, 2020 18:29
-
-
Save PandaWhoCodes/b326bf8decf6d0436e0439069b7ef3ff to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
from nltk import ngrams | |
import sys | |
import collections | |
import string | |
# Translation table that deletes every character in string.punctuation
# (ASCII punctuation). Pass it to str.translate() to strip punctuation
# from text before it is split into words.
translator = str.maketrans('', '', string.punctuation)
def to_string(list):
    """
    Converts a list of strings into one string, one item per line.

    :param list: iterable of strings (the name shadows the builtin but is
        kept so that existing keyword callers keep working)
    :return: String in which every item is followed by a newline;
        the empty string when there are no items
    """
    # str.join builds the result in a single pass instead of re-copying the
    # accumulated string on every "+=", and the loop variable no longer
    # shadows the builtin str.
    return "".join(item + "\n" for item in list)
def get_stop_list(path="SmartStoplist.txt"):
    """
    Reads the stop-word file and returns its lines.

    :param path: path of the stop-word file, one entry per line; defaults to
        "SmartStoplist.txt" resolved against the current working directory
    :return: list of the file's lines. The text is split on newline
        characters, so a file that ends with a newline yields a trailing
        empty-string entry.
    """
    # Explicit encoding so the result does not depend on the platform's
    # default locale encoding.
    with open(path, encoding="utf-8") as f:
        return f.read().split("\n")
def removeStopwords(wordlist, stopwords=None):
    """
    Removes stop words from a list of words.

    :param wordlist: list of words
    :param stopwords: optional iterable of stop words; when omitted (None)
        the entries returned by get_stop_list() are used
    :return: list of the words from wordlist that are not stop words, in
        their original order
    """
    if stopwords is None:
        stopwords = get_stop_list()
    # A set makes each membership test O(1) instead of scanning a list.
    stopwords = set(stopwords)
    return [w for w in wordlist if w not in stopwords]
def get_text(filename):
    """
    Extracts the text of the first column of a CSV file.

    :param filename: path of the CSV file, read as UTF-8
    :return: String holding the lower-cased first cell of every non-empty
        row, each followed by a newline; the empty string when the file has
        no rows
    """
    # newline='' is what the csv module expects of files handed to it, so
    # that it can interpret line endings inside quoted fields itself.
    with open(filename, 'r', encoding="utf8", newline='') as f:
        # csv.reader yields an empty list for a blank line; skipping those
        # rows avoids an IndexError on row[0].
        return "".join(row[0].lower() + "\n" for row in csv.reader(f) if row)
def ngram(text_list, n):
    """
    Builds the n-grams of a sequence of tokens.

    :param text_list: sequence of tokens (e.g. a list of words)
    :param n: number of consecutive tokens in each n-gram
    :return: the result of nltk's ngrams() for these arguments, unchanged
        (per the NLTK documentation, an iterator of n-item tuples)
    """
    grams = ngrams(text_list, n)
    return grams
def count_frequency(grams):
    """
    Counts how often each n-gram occurs.

    :param grams: iterable of n-grams, each itself an iterable of strings
    :return: collections.Counter mapping every n-gram, with its tokens
        joined by single spaces, to its number of occurrences
    """
    # Counter consumes the generator directly; no intermediate list needed.
    return collections.Counter(" ".join(gram) for gram in grams)
def handle_grams(filename, max_n=3, top_k=20):
    """
    Calls all the other functions and writes the most frequent n-grams of a
    CSV file, together with their frequency, to a new CSV file.

    The input text is stripped of punctuation, split on whitespace and
    filtered through removeStopwords(). For every n from 1 to max_n the
    top_k most common n-grams are written as "<n-gram>,<count>" rows.

    :param filename: path of the input CSV file
    :param max_n: largest n-gram size to report (default 3)
    :param top_k: number of most common n-grams written per size (default 20)
    :return: None; the output goes to a file named "ngrams_" + the input
        file's base name, created next to the input file
    """
    # Local import: os is not among the modules this file imports at the top.
    import os

    text = removeStopwords(get_text(filename).translate(translator).split())

    # Prefix only the base name so that an input such as "data/in.csv" maps
    # to "data/ngrams_in.csv" rather than the invalid "ngrams_data/in.csv".
    directory, basename = os.path.split(filename)
    out_path = os.path.join(directory, "ngrams_" + basename)

    # The with-block guarantees the output file is flushed and closed.
    with open(out_path, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        for n in range(1, max_n + 1):
            counts = count_frequency(ngram(text, n))
            # most_common() yields (n-gram, count) pairs, one row each.
            writer.writerows(counts.most_common(top_k))
# handle_grams(filename="#CES.csv") | |
if __name__ == '__main__':
    # The script takes the input CSV path as its only argument. Exit with a
    # usage message instead of an IndexError traceback when it is missing.
    if len(sys.argv) < 2:
        sys.exit("usage: python " + sys.argv[0] + " <input.csv>")
    filename = sys.argv[1]
    handle_grams(filename)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment