Created
September 11, 2022 21:11
-
-
Save Abhayparashar31/53fd6df353870852dd5ece86c21db763 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def topic_modeling(data, *, num_topics=5, passes=10):
    """Train an LDA topic model on ``data`` and print the top words per topic.

    Args:
        data: Iterable of text documents (strings). Each one is passed to
            ``word_tokenize``; presumably these are already-cleaned reviews
            -- confirm against the caller.
        num_topics: Number of latent topics to fit. Defaults to 5.
        passes: Number of full passes over the corpus during training.
            Defaults to 10.

    Returns:
        The trained ``gensim.models.LdaModel``, so callers can inspect or
        reuse it instead of only reading the printed summary.

    Note:
        ``word_tokenize``, ``make_biagram``, ``corpora`` and ``gensim`` must
        already be in scope; they are not defined in this block.
    """
    ### Tokens: one token list per document
    tokens = [word_tokenize(text) for text in data]

    ### Make bigrams (helper defined elsewhere; assumed to return the token
    ### lists with bigrams merged in -- TODO confirm)
    tokens = make_biagram(data=data, tokens=tokens)

    ### Corpora Dictionary: maps every unique token to an integer id
    dictionary = corpora.Dictionary(tokens)

    ### Creating Document Term Matrix (bag-of-words vector per document)
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in tokens]

    ### Training The LDA Model
    lda_model = gensim.models.LdaModel(
        doc_term_matrix,        ## Document term matrix (bag-of-words corpus)
        num_topics=num_topics,  ## Number of topics to extract
        id2word=dictionary,     ## Mapping from word ids to words
        passes=passes,          ## Number of passes through the corpus during training (similar to epochs in neural networks)
        chunksize=10,           ## Number of documents to be used in each training chunk
        update_every=1,         ## Number of documents to be iterated through for each update (0 = batch learning)
        alpha='auto',           ## Learn an asymmetric document-topic prior from the corpus
        per_word_topics=True,   ## Also compute the most likely topics for each word
        random_state=42,        ## Fixed seed for reproducible results
    )

    ### Exploring Common Words For Each Topic With Their Relative Weights
    # Ask for every topic explicitly so a larger num_topics is not capped at
    # print_topics' default of 20.
    for idx, topic in lda_model.print_topics(num_topics=num_topics):
        print("Topic: {} \nWords: {}".format(idx, topic))
        print("\n")

    return lda_model
# Run topic modeling on the cleaned reviews (``cleaned_reviews`` is defined elsewhere).
topic_modeling(cleaned_reviews)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment