Skip to content

Instantly share code, notes, and snippets.

@PandaWhoCodes
Created January 27, 2020 18:29
Show Gist options
  • Save PandaWhoCodes/b326bf8decf6d0436e0439069b7ef3ff to your computer and use it in GitHub Desktop.
Save PandaWhoCodes/b326bf8decf6d0436e0439069b7ef3ff to your computer and use it in GitHub Desktop.
import collections
import csv
import os
import string
import sys

from nltk import ngrams
# Translation table for str.translate() that deletes every ASCII punctuation character.
translator = str.maketrans(dict.fromkeys(string.punctuation))
def to_string(list):
    """
    Convert a list of strings into a single string, one item per line.

    :param list: iterable of strings (the name shadows the builtin inside
        this function; it is kept so existing keyword callers still work)
    :return: String made of every item followed by a newline character,
        in input order; the empty string when there are no items
    """
    # str.join builds the result in one pass instead of repeated "+=",
    # and the loop variable no longer shadows the builtin ``str``.
    return "".join(item + "\n" for item in list)
def get_stop_list():
    """
    Load the stop words from "SmartStoplist.txt" in the current directory.

    :return: list with the file content split on newline characters
        (a file ending in a newline yields a trailing empty string)
    """
    with open("SmartStoplist.txt") as stop_file:
        contents = stop_file.read()
    return contents.split("\n")
def removeStopwords(wordlist, stopwords=None):
    """
    Remove stop words from a list of words.

    :param wordlist: list of words
    :param stopwords: optional collection of stop words; when omitted the
        words returned by get_stop_list() are used
    :return: list of the words from wordlist that are not stop words,
        in their original order
    """
    if stopwords is None:
        stopwords = get_stop_list()
    # Build the set once so every membership test below is a hash lookup.
    if not isinstance(stopwords, (set, frozenset)):
        stopwords = set(stopwords)
    return [word for word in wordlist if word not in stopwords]
def get_text(filename):
    """
    Extract the text of the first column from a CSV file.

    :param filename: path of the CSV file, read as UTF-8
    :return: String with the lower-cased first cell of every non-empty
        row, each followed by a newline character
    """
    # newline='' leaves newline handling to the csv module, as its
    # documentation asks for files passed to csv.reader.
    with open(filename, 'r', encoding="utf8", newline='') as f:
        # csv.reader yields an empty list for a blank line; skipping those
        # rows avoids an IndexError on row[0].
        return "".join(row[0].lower() + "\n" for row in csv.reader(f) if row)
def ngram(text_list, n):
    """
    Build the n-grams of a sequence of words.

    :param text_list: sequence of words
    :param n: number of words in each n-gram
    :return: the iterable produced by nltk's ngrams() for these arguments
    """
    n_grams = ngrams(text_list, n)
    return n_grams
def count_frequency(grams):
    """
    Count how many times each n-gram occurs.

    :param grams: iterable of n-grams, each an iterable of strings
    :return: collections.Counter keyed by the n-gram's words joined with
        a single space
    """
    return collections.Counter(" ".join(words) for words in grams)
def handle_grams(filename):
    """
    Write the most frequent n-grams of a CSV file's text to a new CSV file.

    The text returned by get_text() has its punctuation removed, is split
    on whitespace and filtered with removeStopwords(). For n = 1, 2 and 3
    the 20 most common n-grams are written as (n-gram, frequency) rows to
    a file named "ngrams_" + the input file name, created in the same
    directory as the input file.

    :param filename: path of the input CSV file
    """
    words = removeStopwords(get_text(filename).translate(translator).split())
    # Prefix only the file name, so an input path that contains a
    # directory still produces a valid output path.
    directory, base_name = os.path.split(filename)
    output_path = os.path.join(directory, "ngrams_" + base_name)
    # The context manager guarantees the output file is flushed and closed.
    with open(output_path, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        for size in range(1, 4):
            frequencies = count_frequency(ngram(words, size))
            writer.writerows(frequencies.most_common(20))
# handle_grams(filename="#CES.csv")
if __name__ == '__main__':
    # Exit with a usage message instead of an IndexError traceback when
    # the input file argument is missing.
    if len(sys.argv) < 2:
        sys.exit("usage: python " + sys.argv[0] + " <input.csv>")
    filename = sys.argv[1]
    handle_grams(filename)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment