Created
December 22, 2016 19:36
-
-
Save rafaismyname/1579f247fe113a7d24e6dd034aa7b580 to your computer and use it in GitHub Desktop.
Find most common sentences
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import sys | |
from csv import reader as csv_reader | |
from string import punctuation | |
from nltk import sent_tokenize, word_tokenize | |
from nltk.util import ngrams | |
from collections import Counter | |
# Configuration: edit these values to point the script at your data.
lang = "portuguese"  # language name handed to the NLTK tokenizers
encode = "utf8"  # default string encoding installed below (Python 2 only)
csv_file_path = "tickets.csv"  # CSV file to read
csv_delimiter = ","  # field delimiter of the CSV file
csv_phrase_index = 0  # column index holding the text to analyse
csv_skip_header = True  # discard the first CSV row when True
ngram_base_length = 3  # number of words that make up one counted phrase
reduce_ngram = False  # also count phrases one word shorter than the base
increase_ngram = False  # also count phrases one word longer than the base
acceptable_frequency = 5  # minimum occurrences for a phrase to be reported

# Shared, mutable accumulator: phrase text -> number of occurrences.
phrase_counter = Counter()

# Set the default encoding. `reload` as a builtin and
# `sys.setdefaultencoding` exist only on Python 2, so the hack is guarded
# to avoid a NameError/AttributeError on newer interpreters.
if sys.version_info[0] < 3:
    reload(sys)
    sys.setdefaultencoding(encode)
def untokenize(the_ngram):
    """Rebuild a readable string from a sequence of tokens.

    Each token is preceded by a single space, except tokens that occur
    inside ``string.punctuation`` (a substring test, so normally single
    punctuation characters), which are attached directly to the text
    before them. Leading and trailing whitespace is stripped from the
    result.
    """
    pieces = []
    for token in the_ngram:
        if token in punctuation:
            # punctuation hugs the preceding word
            pieces.append(token)
        else:
            pieces.append(" " + token)
    return "".join(pieces).strip()
def extract_phrases(text, length):
    """Tally every punctuation-free n-gram of `text` in `phrase_counter`.

    `text` is split into sentences and then into word tokens using the
    module-level `lang` setting. Every run of `length` consecutive tokens
    within a sentence is joined back into a string with `untokenize` and
    its count in the global `phrase_counter` is incremented. N-grams that
    contain any token found in ``string.punctuation`` are ignored.
    """
    for sentence in sent_tokenize(text, lang):
        tokens = word_tokenize(sentence, lang)
        for candidate in ngrams(tokens, length):
            # drop n-grams that include a punctuation token
            if any(token in punctuation for token in candidate):
                continue
            phrase_counter[untokenize(candidate)] += 1
def add_sentence(sentence):
    """Count the phrases of one piece of text, case-insensitively.

    The text is lowercased once and passed to `extract_phrases` for the
    base n-gram length and, when `reduce_ngram` / `increase_ngram` are
    enabled, for the lengths one below / one above it. Previously only
    the base-length pass was lowercased, so the optional passes could
    count differently-cased copies of the same phrase separately.
    """
    lowered = sentence.lower()
    extract_phrases(lowered, ngram_base_length)
    if reduce_ngram:
        extract_phrases(lowered, ngram_base_length - 1)
    if increase_ngram:
        extract_phrases(lowered, ngram_base_length + 1)
# Read the configured CSV column and feed every row's text to the counter.
with open(csv_file_path, "r") as csv_buffer:
    reader = csv_reader(csv_buffer, delimiter=csv_delimiter)
    if csv_skip_header:
        # next() with a default works on Python 2.6+ and 3, and does not
        # raise StopIteration when the file is empty.
        next(reader, None)
    for row in reader:
        try:
            phrase_text = row[csv_phrase_index]
        except IndexError:
            # Blank lines are returned as [] and short rows lack the
            # phrase column; skip them instead of aborting the run.
            continue
        add_sentence(phrase_text.lower())

# Report phrases that occur at least `acceptable_frequency` times, most
# frequent first, as "<count padded to 5 columns> <phrase>".
for phrase, count in phrase_counter.most_common():
    if count < acceptable_frequency:
        # most_common() is sorted by descending count, so nothing after
        # this entry can reach the threshold.
        break
    # Single parenthesised argument: parses on both Python 2 and 3.
    print('{0: <5} {1}'.format(count, phrase))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment