Skip to content

Instantly share code, notes, and snippets.

import IPython
# Script fragment: tokenise one question text and print a per-token weight
# read from `tfidf`. `tfidf` and `df` are defined outside this view
# (presumably a fitted scikit-learn TfidfVectorizer and a pandas DataFrame
# -- TODO confirm).
# NOTE(review): the indentation of the loop/try bodies appears to have been
# lost in this copy; the statements below are shown exactly as found.

# Get the tokenizer callable from the vectorizer.
tkn = tfidf.build_tokenizer()
# Take the questionText value at position 236178 and lower-case it.
sent = df.questionText.values[236178].lower()
# Replace the raw string with the tokenizer's output for it.
sent = tkn(sent)
# Starts empty; not used within the visible lines (presumably filled in by
# code that is missing from this view -- verify against the full script).
html=''
# Visit each token produced by the tokenizer.
for wrd in sent:
try:
# Look up the token's index in vocabulary_, read idf_ at that index, and
# scale the value by 10.
weight = (tfidf.idf_[tfidf.vocabulary_[wrd]])*10
# Print the weight divided by 10 again, i.e. without the scaling.
print(weight/10)
# NOTE(review): bare `except:` catches every exception, not only a failed
# vocabulary lookup; its handler body is not visible in this view.
except:
def ngrams(string, n=3):
string = fix_text(string) # fix text encoding issues
string = string.encode("ascii", errors="ignore").decode() #remove non ascii chars
string = string.lower() #make lower case
chars_to_remove = [")","(",".","|","[","]","{","}","'"]
rx = '[' + re.escape(''.join(chars_to_remove)) + ']'
string = re.sub(rx, '', string) #remove the list of chars defined above
string = string.replace('&', 'and')
string = string.replace(',', ' ')
string = string.replace('-', ' ')