Skip to content

Instantly share code, notes, and snippets.

@Khalefa
Created June 13, 2018 15:32
Show Gist options
  • Select an option

  • Save Khalefa/189f21470fd5167ea5b7f6adb5a90adb to your computer and use it in GitHub Desktop.

Select an option

Save Khalefa/189f21470fd5167ea5b7f6adb5a90adb to your computer and use it in GitHub Desktop.
import re, string, unicodedata
import nltk
#import contractions
#import inflect
from bs4 import BeautifulSoup
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
def remove_non_ascii(word):
    """Return *word* with all non-ASCII characters removed.

    Operates on a single token (not a list): decompose accented characters
    with NFKD, drop anything that cannot be encoded as ASCII, and decode
    back to str.
    """
    return unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
def remove_punctuation(word):
    """Return *word* with punctuation stripped.

    Operates on a single token (not a list): removes every character that
    is neither a word character (\\w) nor whitespace (\\s).
    """
    return re.sub(r'[^\w\s]', '', word)
def replace_numbers(words):
    """Replace all integer tokens in *words* with their English textual form.

    E.g. ["page", "3"] -> ["page", "three"].  Non-digit tokens pass through
    unchanged.  The ``inflect`` engine is created lazily, only when a digit
    token is actually seen — this also fixes the NameError caused by the
    commented-out module-level ``import inflect``.
    """
    p = None
    new_words = []
    for word in words:
        if word.isdigit():
            if p is None:
                # Third-party dependency; imported here so the function (and
                # the rest of the module) works when no digits ever appear.
                import inflect
                p = inflect.engine()
            new_words.append(p.number_to_words(word))
        else:
            new_words.append(word)
    return new_words
def remove_stopwords(words):
    """Return "" if the given token is an English stop word, else the token.

    Operates on a single token, matching how ``normalize`` would call it.
    Fixes the original NameError: the body referenced an undefined name
    ``word`` while the parameter is ``words``.
    """
    # NOTE(review): stopwords.words('english') is a list re-read per call;
    # hoist into a module-level set if this runs in a hot loop.
    if words not in stopwords.words('english'):
        return words
    return ""
def stem_words(words):
    """Return the Lancaster stem of every token in *words*, in order."""
    stemmer = LancasterStemmer()
    return [stemmer.stem(token) for token in words]
def lemmatize_verbs(words):
    """Return every token in *words* lemmatized as a verb (pos='v'), in order."""
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token, pos='v') for token in words]
def normalize(word):
    """Normalize one token: drop non-ASCII characters, then punctuation."""
    # NOTE(review): it may be cheaper to first check whether the token
    # contains any non-ASCII characters before normalizing — confirm need.
    #word = remove_stopwords(word)
    return remove_punctuation(remove_non_ascii(word))
# Pre-process the first-column CSV: lowercase each term, normalize its words,
# drop English stop words, and for multi-word terms additionally emit an
# underscore-joined compound token.
#
# Bug fixes vs. the original:
#   * str has .lower(), not .tolower()  (AttributeError)
#   * stray ':' at the end of the split line  (SyntaxError)
#   * the comprehension kept ONLY stop words ('in'); intent — per the
#     commented-out remove_stopwords call in normalize() — is to drop them
#   * stopwords.words('english') was re-read (a list scan) for every word;
#     hoisted once into a set for O(1) membership tests
#   * consecutive terms were written with no separator, fusing the output
_STOPWORDS = set(stopwords.words('english'))

with open("/media/hossein/DATA1/Tables/prep_column.txt", 'w') as output:
    with open("/media/hossein/DATA1/Tables/firstcolumn.csv", 'r') as inp:
        for line in inp:
            # Each line holds comma-separated terms; each term may contain
            # several space-separated words.
            for term in line.split(','):
                words = term.lower().split(" ")
                nwords = [normalize(word) for word in words
                          if word not in _STOPWORDS]
                if len(words) > 1:
                    # Also keep the whole multi-word term as one token.
                    nwords.append('_'.join(nwords))
                output.write(" ".join(nwords) + " ")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment