# Run LDA on Old Testament

import pandas as pd
import gensim
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel
from gensim.utils import simple_preprocess

def format_topics_sentences(ldamodel, corpus):
    r"""Associate each document with its dominant topic.

    Parameters
    ----------
    ldamodel: gensim.models.ldamodel.LdaModel
        The trained LDA model
    corpus: gensim corpus
        The bag-of-words corpus built from the documents

    Return
    ------
    topics_df: pd.DataFrame
        One row per document with its dominant topic, the topic's
        percentage contribution and the topic keywords
    """
    rows = []
    # Get the main topic in each document
    for i, row in enumerate(ldamodel[corpus]):  # rebuild the documents from the corpus
        # with per_word_topics=True each row is (topic_dist, word_topics, word_phis)
        row = sorted(row[0], key=lambda x: x[1], reverse=True)
        # Get the dominant topic, percentage contribution and keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)  # list of (word, weight) pairs
                topic_keywords = ", ".join(word for word, prop in wp)  # keywords only
                # prop_topic is the proportion of the document explained by that topic,
                # rounded to 2 decimals
                rows.append([int(topic_num), round(prop_topic, 2), topic_keywords])
            else:
                break
    topics_df = pd.DataFrame(rows, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])
    return topics_df
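
# A minimal, self-contained sketch (the toy documents are an assumption, not
# part of the original gist) of what format_topics_sentences returns:
# one row per document with its dominant topic, its weight and its keywords.
toy_texts = [["lord", "king", "israel"], ["water", "earth", "light"]]
toy_dict = Dictionary(toy_texts)
toy_corpus = [toy_dict.doc2bow(t) for t in toy_texts]
toy_lda = gensim.models.ldamodel.LdaModel(corpus=toy_corpus, id2word=toy_dict,
                                          num_topics=2, random_state=42,
                                          per_word_topics=True)  # matches run_lda below
print(format_topics_sentences(toy_lda, toy_corpus))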
def run_lda(cleaned_comments, num_topics, chunksize):
    r"""Main function: compute the LDA model.

    Parameters
    ----------
    cleaned_comments: list
        The cleaned documents (the "further_cleaning" column of the dataframe)
    num_topics: int
        Number of topics
    chunksize: int
        Number of documents used in each training chunk

    Return
    ------
    lda_model: gensim.models.ldamodel.LdaModel
        The trained LDA model
    df_topic_sents_keywords: pd.DataFrame
        Per-document dominant topics from format_topics_sentences
    corpus: gensim corpus
        The bag-of-words corpus
    """
    # tokenize
    data_words = []
    for sentence in cleaned_comments:
        data_words.append(simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuation
    # Create the dictionary: this creates an integer index for each word,
    # e.g. id2word[0] = "allowed"
    id2word = Dictionary(data_words)
    # Create the corpus: term-document frequency (bag of words)
    texts = data_words
    corpus = [id2word.doc2bow(text) for text in texts]
    # corpus gives the frequency of each word in a document (a document == a
    # single comment); see the toy doc2bow sketch after this function
    # Build the LDA model with num_topics topics
    print("Computing LDA with {} topics, {} chunksize...".format(num_topics, chunksize))
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=id2word,
                                                num_topics=num_topics,
                                                random_state=42,
                                                eval_every=100,
                                                chunksize=chunksize,
                                                passes=5,
                                                iterations=400,
                                                per_word_topics=True)
print("Writing classification onto csv file...")
df_topic_sents_keywords = format_topics_sentences(lda_model, corpus)
print("Topic Keywords")
print(df_topic_sents_keywords["Topic_Keywords"].unique())
print(f"Perplexity {lda_model.log_perplexity(corpus)}")
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_words, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print(f"Coherence {coherence_lda}")
return lda_model, df_topic_sents_keywords, corpus
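
# A toy sketch (not in the original gist) of the Dictionary/doc2bow step used
# inside run_lda: each document becomes a list of (token_id, count) pairs.
toy_docs = [["in", "the", "beginning"], ["the", "word", "the", "light"]]
toy_id2word = Dictionary(toy_docs)
print([toy_id2word.doc2bow(doc) for doc in toy_docs])
# repeated tokens show up as counts, e.g. ("the", 2) in the second document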
# Grid search over number of topics and chunk size;
# `data` is assumed to hold the cleaned Old Testament text prepared earlier
num_topics = [2, 3, 4, 5]
chunksizes = [20, 50, 100]
for num_topic in num_topics:
    for chunksize in chunksizes:
        print(f"!!!!!! Num Topic {num_topic} and chunksize {chunksize}")
        lda_model, df_lda, corpus = run_lda(data, num_topic, chunksize)
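
# A minimal follow-up sketch (an assumption, inspecting only the last
# configuration trained above): how the documents distribute over the
# dominant topics, and the full topic descriptions.
print(df_lda["Dominant_Topic"].value_counts())
print(lda_model.print_topics(num_topics=-1, num_words=10))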