LDA on Old Testament books
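The snippet below fits gensim LDA models over a grid of topic counts and chunk sizes, tags each document with its dominant topic, and reports log perplexity and c_v coherence for each run.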
import pandas as pd
import gensim
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel
from gensim.utils import simple_preprocess

def format_topics_sentences(ldamodel, corpus):
    r"""Associate each document with its dominant topic

    Parameters
    ----------
    ldamodel: gensim lda_model
        The fitted LDA model
    corpus: gensim corpus
        Bag-of-words corpus built from the documents

    Return
    ------
    topics_df: pd.DataFrame
        One row per document: dominant topic number, its percentage
        contribution and the topic keywords
    """
    rows = []
    # Get the main topic in each document
    for i, row in enumerate(ldamodel[corpus]):  # per-document topic distributions
        # with per_word_topics=True, row[0] holds the (topic, probability) pairs
        doc_topics = sorted(row[0], key=lambda x: x[1], reverse=True)
        # keep only the dominant topic (highest probability)
        topic_num, prop_topic = doc_topics[0]
        wp = ldamodel.show_topic(topic_num)  # (word, weight) pairs for this topic
        topic_keywords = ", ".join(word for word, prop in wp)  # keywords only
        # prop_topic is the probability of the dominant topic, rounded to 2 decimals
        rows.append([int(topic_num), round(prop_topic, 2), topic_keywords])
    # build the dataframe in one go (DataFrame.append was removed in pandas 2.0)
    topics_df = pd.DataFrame(rows, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])
    return topics_df
def run_lda(cleaned_comments, num_topics, chunksize):
    r"""This is the main function which computes the LDA

    Parameters
    ----------
    cleaned_comments: iterable of str
        the cleaned documents ("further_cleaning" column in the dataframe)
    num_topics: int
        number of topics
    chunksize: int
        number of documents in each training chunk

    Return
    ------
    lda_model: gensim LDA model
        the fitted model
    df_topic_sents_keywords: pd.DataFrame
        dominant topic per document
    corpus: list
        the bag-of-words corpus
    """
    # tokenize
    data_words = []
    for sentence in cleaned_comments:
        # simple_preprocess lowercases and tokenizes; deacc=True also strips accents
        data_words.append(simple_preprocess(str(sentence), deacc=True))
    # Create Dictionary: an integer index for each word, e.g. id2word[0] = "allowed"
    id2word = Dictionary(data_words)
    # Term-document frequency: bag-of-words counts per document
    # (a document == a single review)
    corpus = [id2word.doc2bow(text) for text in data_words]
    # Build the LDA model with the requested number of topics
    print("Computing LDA with {} topics, {} chunksize...".format(num_topics, chunksize))
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=id2word,
                                                num_topics=num_topics,
                                                random_state=42,
                                                eval_every=100,
                                                chunksize=chunksize,
                                                passes=5,
                                                iterations=400,
                                                per_word_topics=True)
| print("Writing classification onto csv file...") | |
| df_topic_sents_keywords = format_topics_sentences(lda_model, corpus) | |
| print("Topic Keywords") | |
| print(df_topic_sents_keywords["Topic_Keywords"].unique()) | |
| print(f"Perplexity {lda_model.log_perplexity(corpus)}") | |
| coherence_model_lda = CoherenceModel(model=lda_model, texts=data_words, dictionary=id2word, coherence='c_v') | |
| coherence_lda = coherence_model_lda.get_coherence() | |
| print(f"Coherence {coherence_lda}") | |
| return lda_model, df_topic_sents_keywords, corpus | |
# %%
# grid search over number of topics and chunk size
num_topics = [2, 3, 5, 6]
chunksizes = [20, 50]
for num_topic in num_topics:
    for chunksize in chunksizes:
        print(f"Num Topic {num_topic} and chunksize {chunksize}")
        # `data` holds the cleaned book texts (see the sketch below)
        lda_model, df_lda, corpus = run_lda(data,
                                            num_topic,
                                            chunksize)
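The grid loop assumes a `data` variable holding the cleaned Old Testament texts, which the gist never defines. A minimal sketch of how it might be built, assuming (hypothetically) one cleaned plain-text file per book in a local books/ folder:

import glob

# hypothetical layout: one cleaned .txt file per Old Testament book;
# each file becomes one document in the corpus
data = []
for path in sorted(glob.glob("books/*.txt")):
    with open(path, encoding="utf-8") as fh:
        data.append(fh.read())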