smsubrahmannian · October 4, 2018 09:54
diff --git a/Preprocessing with Spacy b/Preprocessing with Spacy
 import spacy

 # Load the spaCy language model registered as 'en' once, at module level;
 # clean_up() reuses this single pipeline for every document.
 nlp = spacy.load("en")

 # Read the corpus: a pandas DataFrame stored on disk as a Feather file.
 data = pd.read_feather("data/preprocessed_data")

 # Part-of-speech tags whose tokens are dropped from every document.
 # Built once as a frozenset so the membership test inside clean_up() is O(1).
 _REMOVED_POS = frozenset(
     {'ADV', 'PRON', 'CCONJ', 'PUNCT', 'PART', 'DET', 'ADP', 'SPACE'}
 )


 def clean_up(text):
     """Return the lemmas of the tokens in *text* that pass the filter.

     The text is parsed with the module-level ``nlp`` pipeline. A token is
     kept only when it is not a stop word, is alphabetic, is longer than
     two characters, and its ``pos_`` tag is not in ``_REMOVED_POS``.

     Args:
         text: The raw text of one document.

     Returns:
         A list with the ``lemma_`` of every kept token, in document order.
     """
     return [
         token.lemma_
         for token in nlp(text)
         if not token.is_stop
         and token.is_alpha
         and len(token) > 2
         and token.pos_ not in _REMOVED_POS
     ]
    
 # Tokenise every document: one list of lemmas per entry of data.text.
 datalist = data.text.apply(clean_up)

 # Create a vocabulary for the LDA model and convert the corpus into a
 # document-term matrix (one bag-of-words per document).
 # Both steps read the same tokenised corpus, `datalist`, so the token ids
 # in the matrix come from the vocabulary built on the line above.
 dictionary = corpora.Dictionary(datalist)
 doc_term_matrix = [dictionary.doc2bow(doc) for doc in datalist]
	import spacy

	# Load the spaCy language model registered as 'en' once, at module level;
	# clean_up() reuses this single pipeline for every document.
	nlp = spacy.load("en")

	# Read the corpus: a pandas DataFrame stored on disk as a Feather file.
	data = pd.read_feather("data/preprocessed_data")

	# Part-of-speech tags whose tokens are dropped from every document.
	# Built once as a frozenset so the membership test inside clean_up() is O(1).
	_REMOVED_POS = frozenset(
	    {'ADV', 'PRON', 'CCONJ', 'PUNCT', 'PART', 'DET', 'ADP', 'SPACE'}
	)


	def clean_up(text):
	    """Return the lemmas of the tokens in *text* that pass the filter.

	    The text is parsed with the module-level ``nlp`` pipeline. A token is
	    kept only when it is not a stop word, is alphabetic, is longer than
	    two characters, and its ``pos_`` tag is not in ``_REMOVED_POS``.

	    Args:
	        text: The raw text of one document.

	    Returns:
	        A list with the ``lemma_`` of every kept token, in document order.
	    """
	    return [
	        token.lemma_
	        for token in nlp(text)
	        if not token.is_stop
	        and token.is_alpha
	        and len(token) > 2
	        and token.pos_ not in _REMOVED_POS
	    ]

	# Tokenise every document: one list of lemmas per entry of data.text.
	datalist = data.text.apply(clean_up)

	# Create a vocabulary for the LDA model and convert the corpus into a
	# document-term matrix (one bag-of-words per document).
	# Both steps read the same tokenised corpus, `datalist`, so the token ids
	# in the matrix come from the vocabulary built on the line above.
	dictionary = corpora.Dictionary(datalist)
	doc_term_matrix = [dictionary.doc2bow(doc) for doc in datalist]