Created
June 30, 2019 15:35
-
-
Save joshua-taylor/3d1e7b3bff167ca28bcd7f70987e16b2 to your computer and use it in GitHub Desktop.
Ngrams.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def ngrams(string, n=3):
    """Return the overlapping character n-grams of a normalised name string.

    The input is cleaned before the n-grams are taken:

    1. ``fix_text`` is applied to repair text-encoding problems.
       ``fix_text`` must already be in scope; it is not defined or
       imported here (presumably ``from ftfy import fix_text`` - confirm
       against the surrounding module).
    2. Non-ASCII characters are dropped.
    3. Brackets, braces, parentheses, ``.``, ``|`` and ``'`` are removed,
       ``&`` becomes ``and``, and ``,`` / ``-`` become spaces.
    4. Words are title-cased, runs of spaces are collapsed, and the result
       is padded with one leading and one trailing space so that word
       boundaries at the ends of the string show up in the n-grams.

    Args:
        string: The text to split into n-grams.
        n: Length of each n-gram in characters (default 3).

    Returns:
        A list of every length-``n`` substring of the cleaned, padded
        string, in order. The list is empty when the cleaned string is
        shorter than ``n`` or when ``n`` is less than 1.
    """
    # Imported locally so the function does not depend on a module-level
    # ``import re`` being present.
    import re

    string = fix_text(string)  # fix text encoding issues
    string = string.encode("ascii", errors="ignore").decode()  # drop non-ASCII chars
    string = string.lower()

    # Strip punctuation that carries no meaning for matching.
    chars_to_remove = [")", "(", ".", "|", "[", "]", "{", "}", "'"]
    rx = "[" + re.escape("".join(chars_to_remove)) + "]"
    string = re.sub(rx, "", string)

    string = string.replace("&", "and")
    string = string.replace(",", " ")
    string = string.replace("-", " ")
    string = string.title()  # normalise case - capital at start of each word
    string = re.sub(" +", " ", string).strip()  # collapse runs of spaces
    string = " " + string + " "  # pad so leading/trailing word edges form n-grams

    # NOTE(review): "," "-" and "." were already removed above, so in
    # practice only "/" is stripped here. Because this runs after the
    # space-collapsing step, "a / b" keeps two adjacent spaces. The
    # case-sensitive "\sBD" alternative cannot match title-cased text.
    # Behaviour is kept as-is; confirm the intent before changing it.
    string = re.sub(r"[,-./]|\sBD", r"", string)

    # Zip the string against itself shifted by 0..n-1 characters; each
    # tuple of aligned characters is one n-gram. The result name differs
    # from the function name so the function is not shadowed locally.
    grams = zip(*[string[i:] for i in range(n)])
    return ["".join(gram) for gram in grams]
What's the 're' in this script?
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
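Hi - `re` is Python's built-in regular-expression module (add `import re` at the top of the script). The `fix_text` call is the part that relates to cleaning up encoding issues; it comes from ftfy: https://ftfy.readthedocs.io/en/latest/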