LDA on Old Testament books
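The snippet below fits gensim LDA models over a grid of topic counts and chunk sizes, tags each document with its dominant topic, and reports log perplexity and c_v coherence for each run.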
import pandas as pd
import gensim
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel
from gensim.utils import simple_preprocess

def format_topics_sentences(ldamodel, corpus):
    r"""Associate each document with its dominant topic

    Parameters
    ----------
    ldamodel: gensim lda_model
        The fitted LDA model
    corpus: gensim corpus
        Bag-of-words corpus built from the documents

    Return
    ------
    topics_df: pd.DataFrame
        One row per document: dominant topic number, its percentage
        contribution and the topic keywords
    """
    rows = []
    # Get the main topic in each document
    for i, row in enumerate(ldamodel[corpus]):  # per-document topic distributions
        # with per_word_topics=True, row[0] holds the (topic, probability) pairs
        doc_topics = sorted(row[0], key=lambda x: x[1], reverse=True)
        # keep only the dominant topic (highest probability)
        topic_num, prop_topic = doc_topics[0]
        wp = ldamodel.show_topic(topic_num)  # (word, weight) pairs for this topic
        topic_keywords = ", ".join(word for word, prop in wp)  # keywords only
        # prop_topic is the probability of the dominant topic, rounded to 2 decimals
        rows.append([int(topic_num), round(prop_topic, 2), topic_keywords])
    # build the dataframe in one go (DataFrame.append was removed in pandas 2.0)
    topics_df = pd.DataFrame(rows, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])
    return topics_df
def run_lda(cleaned_comments, num_topics, chunksize):
    r"""This is the main function which computes the LDA

    Parameters
    ----------
    cleaned_comments: iterable of str
        the cleaned documents ("further_cleaning" column in the dataframe)
    num_topics: int
        number of topics
    chunksize: int
        number of documents in each training chunk

    Return
    ------
    lda_model: gensim LDA model
        the fitted model
    df_topic_sents_keywords: pd.DataFrame
        dominant topic per document
    corpus: list
        the bag-of-words corpus
    """
    # tokenize
    data_words = []
    for sentence in cleaned_comments:
        # simple_preprocess lowercases and tokenizes; deacc=True also strips accents
        data_words.append(simple_preprocess(str(sentence), deacc=True))
    # Create Dictionary: an integer index for each word, e.g. id2word[0] = "allowed"
    id2word = Dictionary(data_words)
    # Term-document frequency: bag-of-words counts per document
    # (a document == a single review)
    corpus = [id2word.doc2bow(text) for text in data_words]
    # Build the LDA model with the requested number of topics
    print("Computing LDA with {} topics, {} chunksize...".format(num_topics, chunksize))
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=id2word,
                                                num_topics=num_topics,
                                                random_state=42,
                                                eval_every=100,
                                                chunksize=chunksize,
                                                passes=5,
                                                iterations=400,
                                                per_word_topics=True)
| print("Writing classification onto csv file...") | |
| df_topic_sents_keywords = format_topics_sentences(lda_model, corpus) | |
| print("Topic Keywords") | |
| print(df_topic_sents_keywords["Topic_Keywords"].unique()) | |
| print(f"Perplexity {lda_model.log_perplexity(corpus)}") | |
| coherence_model_lda = CoherenceModel(model=lda_model, texts=data_words, dictionary=id2word, coherence='c_v') | |
| coherence_lda = coherence_model_lda.get_coherence() | |
| print(f"Coherence {coherence_lda}") | |
| return lda_model, df_topic_sents_keywords, corpus | |
# %%
# grid search over number of topics and chunk size
num_topics = [2, 3, 5, 6]
chunksizes = [20, 50]
for num_topic in num_topics:
    for chunksize in chunksizes:
        print(f"Num Topic {num_topic} and chunksize {chunksize}")
        # `data` holds the cleaned book texts (see the sketch below)
        lda_model, df_lda, corpus = run_lda(data,
                                            num_topic,
                                            chunksize)
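The grid loop assumes a `data` variable holding the cleaned Old Testament texts, which the gist never defines. A minimal sketch of how it might be built, assuming (hypothetically) one cleaned plain-text file per book in a local books/ folder:

import glob

# hypothetical layout: one cleaned .txt file per Old Testament book;
# each file becomes one document in the corpus
data = []
for path in sorted(glob.glob("books/*.txt")):
    with open(path, encoding="utf-8") as fh:
        data.append(fh.read())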