import re, string, unicodedata
import nltk
import inflect  # used by replace_numbers below
#import contractions
from bs4 import BeautifulSoup
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
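
# One-time NLTK data setup (a sketch, assuming a standard NLTK install):
# the stop-word list and the WordNet data used below must be downloaded
# before the first run; nltk.download skips data that is already present.
nltk.download('stopwords')
nltk.download('wordnet')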

def remove_non_ascii(word):
    """Remove non-ASCII characters from a word."""
    return unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')

def remove_punctuation(word):
    """Remove punctuation from a word."""
    return re.sub(r'[^\w\s]', '', word)

def replace_numbers(words):
    """Replace all integer occurrences in a list of tokenized words with their textual representation."""
    p = inflect.engine()
    new_words = []
    for word in words:
        if word.isdigit():
            new_word = p.number_to_words(word)
            new_words.append(new_word)
        else:
            new_words.append(word)
    return new_words

def remove_stopwords(word):
    """Return the word unless it is an English stop word."""
    if word not in stopwords.words('english'):
        return word
    return ""

def stem_words(words):
    """Stem words in list of tokenized words"""
    stemmer = LancasterStemmer()
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems

def lemmatize_verbs(words):
    """Lemmatize verbs in list of tokenized words"""
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in words:
        lemma = lemmatizer.lemmatize(word, pos='v')
        lemmas.append(lemma)
    return lemmas

def normalize(word):
    # it may be better to first check whether the word contains non-ASCII chars
    word = remove_non_ascii(word)
    word = remove_punctuation(word)
    #word = remove_stopwords(word)
    return word
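
# For illustration (hypothetical input): normalize("café,") returns "cafe",
# since remove_non_ascii drops the accent and remove_punctuation drops the comma.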

with open("/media/hossein/DATA1/Tables/prep_column.txt", 'w') as output:
    with open("/media/hossein/DATA1/Tables/firstcolumn.csv", 'r') as inp:
        for line in inp:
            # split the line into comma-separated terms
            terms = line.split(',')
            words = []
            for term in terms:
                # split each term into lowercase words by whitespace
                words += term.lower().split(" ")
            # normalize each word, dropping English stop words
            nwords = [normalize(word) for word in words if word not in stopwords.words('english')]
            if len(words) > 1:
                # also keep the whole value as a single underscore-joined token
                nwords.append('_'.join(nwords))
            # one output line per input line
            output.write(" ".join(nwords) + "\n")