Skip to content

Instantly share code, notes, and snippets.

@caseykulm
Last active July 16, 2019 16:40
Show Gist options
  • Save caseykulm/b4b50701ef0f117791690a3c50f0c9ab to your computer and use it in GitHub Desktop.
Save caseykulm/b4b50701ef0f117791690a3c50f0c9ab to your computer and use it in GitHub Desktop.
Find the most common n-grams in an input string
# Adapted from this SO post https://stackoverflow.com/a/14670769/1229735
from collections import Counter
# Running tally of how many times each n-gram has been seen in `text`.
counter = Counter()
text = '' # Put your text here. It helps to prune stuff from the input that you consider noise.
# Shortest n-gram length (in characters) that the counting loop considers.
min_substring_length = 7
# Exclusive upper bound on the n-gram length: it is used as the stop value of
# range(), so with 15 the longest n-gram counted has 14 characters.
# Set it to len(text) + 1 to remove the cap.
max_substring_length = 15 # len(text) + 1 # for max length
def firstCharacterIsUpper(input):
    """Return True if the first character of *input* is an uppercase letter.

    The one-character slice (rather than ``input[0]``) makes the empty string
    return False instead of raising IndexError.

    The parameter name shadows the ``input`` builtin; it is kept so existing
    keyword callers continue to work.
    """
    return input[:1].isupper()
def upperCaseLimit(input, min, max):
    """Return True if *input* has between *min* and *max* uppercase characters.

    Both bounds are inclusive. The parameter names shadow builtins; they are
    kept so existing keyword callers continue to work, which is why the body
    avoids calling the ``min``/``max`` builtins.
    """
    uppercase_count = sum(1 for char in input if char.isupper())
    return min <= uppercase_count <= max
def containsNoSpaces(input):
    """Return True if *input* contains no space character (``' '``).

    Only the literal space is checked; tabs and newlines do not count.
    The parameter name shadows the ``input`` builtin; it is kept so existing
    keyword callers continue to work.
    """
    return ' ' not in input
# Slide a window of every length in [min_substring_length, max_substring_length)
# across `text` and tally each substring the window covers.
for length in range(min_substring_length, max_substring_length):
    # The last valid start index is len(text) - length, so there are
    # len(text) - length + 1 windows. The +1 keeps the final n-gram (the one
    # ending exactly at the end of the text) from being dropped. When the text
    # is shorter than `length` the count is <= 0 and range() is empty.
    window_count = len(text) - length + 1
    for start in range(window_count):
        ngram = text[start:start + length]
        # Optional filter: keep only n-grams that start with an uppercase
        # letter, contain no spaces and have 1-2 uppercase characters. To
        # enable it, uncomment the `if` and indent the increment under it.
        # if (firstCharacterIsUpper(ngram) and containsNoSpaces(ngram) and upperCaseLimit(ngram, 1, 2)):
        counter[ngram] += 1
# `counter` is already a Counter, so it can be ranked without copying it first.
print(counter.most_common(100))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment