Last active
April 17, 2018 12:02
-
-
Save smsubrahmannian/b100a92a1e1003d46d13ef03358888d8 to your computer and use it in GitHub Desktop.
LDA in 3 Simple Steps
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from gensim import models, corpora
import pandas as pd  # was missing: `pd.read_feather` below raised NameError
import spacy

# Load the spaCy pipeline once at module level; clean_up() reuses it for
# every document.
# NOTE(review): 'en' is a shortcut-link name. spaCy v3 removed shortcut links
# and expects a full package name such as 'en_core_web_sm' -- confirm which
# spaCy version this script targets before changing it.
nlp = spacy.load('en')

# Preprocessed corpus stored in Feather format; the code below reads its
# `text` column.
data = pd.read_feather('data/preprocessed_data')
""" Step 1: Clean up your text and generate a list of words for each document.
I recommend you go through an introductory tutorial on spaCy first.
The filters inside the clean_up function are tailored to one specific use case,
so adapt them to your own data. I have provided two examples in the GitHub repo. """
# Coarse part-of-speech tags (compared against ``token.pos_``) whose tokens
# are always dropped. Built once at import time instead of on every call;
# a frozenset gives constant-time membership tests.
_REMOVED_POS = frozenset(
    ['ADV', 'PRON', 'CCONJ', 'PUNCT', 'PART', 'DET', 'ADP', 'SPACE']
)


def clean_up(text, pipeline=None):
    """Return the lemmas of the tokens in *text* that pass the filters.

    A token is kept only when all of the following hold:

    * ``token.is_stop`` is false,
    * ``token.is_alpha`` is true,
    * ``len(token)`` is greater than 2,
    * ``token.pos_`` is not one of ``_REMOVED_POS``.

    Args:
        text: Raw document text handed to the pipeline.
        pipeline: Optional callable that maps ``text`` to an iterable of
            token objects exposing ``is_stop``, ``is_alpha``, ``pos_``,
            ``lemma_`` and ``len()``. When omitted, the module-level
            ``nlp`` object is used, which matches the original behaviour.

    Returns:
        A list with the ``lemma_`` of every kept token, in document order.
    """
    if pipeline is None:
        # Resolved at call time so the module-level model is only needed
        # when no explicit pipeline is supplied.
        pipeline = nlp
    # The whole condition lives inside the comprehension brackets, so it can
    # span several lines without a backslash continuation.
    return [
        token.lemma_
        for token in pipeline(text)
        if not token.is_stop
        and token.is_alpha
        and len(token) > 2
        and token.pos_ not in _REMOVED_POS
    ]
# Tokenise every document: each entry of `datalist` is the list of lemmas
# that clean_up() returns for one row of the `text` column.
datalist = data.text.apply(clean_up)

# Step 2: build the vocabulary for the LDA model, then convert the corpus
# into a document-term matrix (one bag-of-words per document).
dictionary = corpora.Dictionary(datalist)
doc_term_matrix = list(map(dictionary.doc2bow, datalist))
# Step 3: Define the multicore LDA model and enjoy!
num_topic = 12
Lda = models.LdaMulticore

# Train on the bag-of-words corpus built above. `random_state` seeds the
# model's random number generator.
lda = Lda(
    doc_term_matrix,
    # The original passed `num_topics`, a name that is never defined
    # (NameError); the variable assigned above is `num_topic`.
    num_topics=num_topic,
    id2word=dictionary,
    passes=20,
    chunksize=2000,
    random_state=3,
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment