An approach to evaluating a scikit-learn topic model's coherence with gensim, reusing an existing vocabulary.
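The trick, visible in the script below, is that gensim's `CoherenceModel` does not require an actual gensim model: it only consults `model.get_topics()` for the topic-word distributions and a `Dictionary`-like object for the token/id mappings. So scikit-learn's fitted `components_` and the existing vocabulary are wrapped in two small duck-typed stand-ins.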
from collections import Counter
from typing import List

import numpy as np
from scipy import sparse
import pandas as pd
import spacy
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import LatentDirichletAllocation
from gensim.models.coherencemodel import CoherenceModel

nlp = spacy.load('en_core_web_md', disable=['ner', 'parser'])
corpus = fetch_20newsgroups(remove=('headers', 'footers', 'quotes')).data
def map_filter_words(doc):
    """Filtering and lemmatization."""
    for word in doc:
        if word.is_alpha and not word.is_stop:
            yield word.lemma_
texts = []  # tokenized corpus
tf = Counter()  # global term frequency
for doc in map(nlp, corpus):
    # tokenize each document once and reuse the token list
    tokens = list(map_filter_words(doc))
    tf.update(tokens)
    texts.append(tokens)
vocab = sorted(tf)  # the vocabulary
doc_word = sparse.lil_matrix((len(texts), len(vocab)), dtype=int)
for i, doc in enumerate(texts):
    doc_tf = Counter(doc)  # term frequency per document
    r = pd.Series(doc_tf).reindex(vocab).fillna(0).astype(int)
    doc_word[i] = r.to_numpy()
doc_word = doc_word.tocsr()  # CSR is efficient for fitting
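# Hedged sanity check (not in the original gist): the document-term matrix
# should line up with the tokenized corpus and the vocabulary built above.
assert doc_word.shape == (len(texts), len(vocab))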
lda = LatentDirichletAllocation()  # default: n_components=10 topics
lda.fit(doc_word)
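# Optional sanity check before scoring (not part of the original gist):
# print each topic's ten most probable words using the `vocab` built above.
for k, row in enumerate(lda.components_):
    top_idx = row.argsort()[-10:][::-1]
    print('topic %d:' % k, ' '.join(vocab[j] for j in top_idx))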
### NOTE HERE
class DummyTopicModel:
    """Fake a topic model for gensim."""

    def __init__(self, lam):
        """lam: the variational parameters of the topic-word distributions."""
        # normalize each row so that `get_topics` returns proper
        # probability distributions over the vocabulary
        self.lam = lam / np.sum(lam, axis=1, keepdims=True)

    def get_topics(self):
        return self.lam
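# Hedged note: when a `model` is passed, CoherenceModel obtains the topics by
# calling `model.get_topics()`, expecting an (n_topics, n_terms) array of
# word probabilities, which is exactly what this wrapper provides.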
### NOTE HERE
class DummyDictionary:
    """Fake a gensim Dictionary backed by an existing vocabulary."""

    def __init__(self, vocab: List[str]):
        self.token2id = {w: j for j, w in enumerate(vocab)}
        self.id2token = vocab.copy()

    def __getitem__(self, item):
        return self.id2token[item]

    def __contains__(self, item):
        if isinstance(item, int):
            return 0 <= item < len(self.id2token)
        return False
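# Hedged note: CoherenceModel appears to consult only the `token2id` /
# `id2token` mappings (plus integer indexing) of whatever is passed as
# `dictionary=`, so this minimal stand-in suffices.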
cm = CoherenceModel(
    model=DummyTopicModel(lda.components_),
    texts=texts,
    dictionary=DummyDictionary(vocab),
    coherence='c_npmi',
)
coh = np.asarray(cm.get_coherence_per_topic())
print('average topic coherence:', np.mean(coh))
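# A sketch (not in the original gist): the same stand-ins work for gensim's
# other text-based coherence measures, e.g. 'c_v' or 'c_uci'; only the
# `coherence` argument changes.
cm_cv = CoherenceModel(
    model=DummyTopicModel(lda.components_),
    texts=texts,
    dictionary=DummyDictionary(vocab),
    coherence='c_v',
)
print('average c_v coherence:', cm_cv.get_coherence())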