Last active
October 4, 2018 09:54
-
-
Save smsubrahmannian/2835bd32c688b7b57a5300f94af07b1b to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd
import spacy
# Load the spaCy English language model once at import time; clean_up() calls
# it for every document.
# NOTE(review): the 'en' shortcut link only resolves on spaCy 2.x; spaCy 3+
# requires a full package name such as 'en_core_web_sm'.
nlp = spacy.load('en')

# Read the preprocessed corpus, stored on disk as a feather file, into a
# pandas DataFrame.
data = pd.read_feather('data/preprocessed_data')
# Part-of-speech tags whose tokens are dropped by clean_up(). Kept as a
# module-level frozenset so it is built once and membership tests are O(1).
_REMOVED_POS = frozenset(
    {'ADV', 'PRON', 'CCONJ', 'PUNCT', 'PART', 'DET', 'ADP', 'SPACE'}
)


def clean_up(text: str) -> list:
    """Clean up *text* and return the list of lemmas kept for one document.

    The text is run through the module-level ``nlp`` pipeline. A token is
    kept only when all of the following hold:

    * it is not a stop word,
    * it is purely alphabetic,
    * it is longer than two characters,
    * its coarse part-of-speech tag is not in ``_REMOVED_POS``.

    Args:
        text: Raw document text.

    Returns:
        The lemma of every kept token, in document order.
    """
    return [
        token.lemma_
        for token in nlp(text)
        if not token.is_stop
        and token.is_alpha
        and len(token) > 2
        and token.pos_ not in _REMOVED_POS
    ]
# Run clean_up() over every document in the ``text`` column, yielding one
# list of lemmas per document.
datalist = data.text.apply(clean_up)

# Create a vocabulary for the LDA model and convert the corpus into the
# document-term matrix (one bag-of-words per document) for LDA. Both steps
# must use the same cleaned corpus, ``datalist``.
# NOTE(review): ``corpora`` is not imported anywhere in this file; presumably
# ``from gensim import corpora`` is intended -- confirm and add it.
dictionary = corpora.Dictionary(datalist)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in datalist]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment