Last active
August 2, 2019 13:51
-
-
Save bsolomon1124/4b75d08030b4e41b203c59b09224c8ab to your computer and use it in GitHub Desktop.
TFIDF -> NMF Topic-Term Extraction
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import logging | |
| from typing import List, Tuple | |
| import numpy as np | |
| from scipy import sparse | |
| from sklearn.decomposition import NMF | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
# Configure the root logger once at import time: emit everything from DEBUG
# upwards, prefixing each record with its creation timestamp and level name.
_LOG_FORMAT = "%(created)f [%(levelname)s] %(message)s"
logging.basicConfig(format=_LOG_FORMAT, level=logging.DEBUG)
def top_words(
    model, feature_names: List[str], n_top_words: int
) -> List[List[str]]:
    """Return the highest-weighted terms of every topic in ``model``.

    Args:
        model: Object exposing a ``components_`` attribute that is iterated
            row by row; each row must support ``argsort()`` and holds one
            topic's weight for every term (e.g. a fitted ``NMF``).
        feature_names: Terms of the vocabulary, indexable by the column
            positions of ``model.components_``.
        n_top_words: Maximum number of terms to return per topic. If it
            exceeds the vocabulary size, every term is returned.

    Returns:
        One list per topic, in the order of ``model.components_``, each
        containing up to ``n_top_words`` terms sorted from the largest
        weight to the smallest.

    Raises:
        ValueError: If ``n_top_words`` is negative.
    """
    if n_top_words < 0:
        # A negative count would otherwise produce a meaningless slice
        # (e.g. -1 returned every term except the lowest-weighted one).
        raise ValueError(
            "n_top_words must be non-negative, got %r" % (n_top_words,)
        )
    return [
        # argsort() is ascending; reverse it so the heaviest terms come
        # first, then keep only the requested number of them.
        [feature_names[i] for i in topic.argsort()[::-1][:n_top_words]]
        for topic in model.components_
    ]
def main(corpus, *, n_components=4, n_top_words=3, random_state=444):
    """Extract NMF topics from ``corpus`` and print them with their documents.

    The corpus is vectorized with TF-IDF (English stop words removed), an NMF
    model is fitted on the result, and each document is assigned to the topic
    for which it has the highest score.

    Args:
        corpus: Sequence of document strings. It must support ``len()`` and
            integer indexing, because documents are looked up by position
            when they are printed.
        n_components: Number of topics passed to ``NMF``.
        n_top_words: Number of top terms printed for each topic.
        random_state: Seed passed to ``NMF`` so that runs are repeatable.

    Returns:
        None. Results are written to standard output and one INFO record
        is logged with the shape of the TF-IDF matrix.
    """
    tfidf_vectorizer = TfidfVectorizer(stop_words="english")
    tfidf: sparse.csr_matrix = tfidf_vectorizer.fit_transform(corpus)
    logging.info(
        "Corpus contains %d documents; Vocabulary contains %d terms",
        *tfidf.shape
    )

    nmf = NMF(n_components=n_components, random_state=random_state).fit(tfidf)

    # `get_feature_names` was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer its replacement and fall back only on older releases.
    get_names = getattr(tfidf_vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = tfidf_vectorizer.get_feature_names
    topics = top_words(
        model=nmf,
        feature_names=list(get_names()),
        n_top_words=n_top_words,
    )

    # First pass: a compact overview of all topics.
    for topic_idx, words in enumerate(topics):
        print("Topic %2d:\t%s" % (topic_idx, words))

    # Map documents back to their membership in the topic space.
    # The array `pdist` is (n_samples, n_components); the rows are documents
    # in the corpus, and the columns are topics. The values are scores
    # of membership likelihood to that topic.
    pdist: np.ndarray = nmf.transform(tfidf)
    # Each document belongs to the single topic with its highest score.
    topic_membership: np.ndarray = pdist.argmax(axis=1)

    # Second pass: every topic again, followed by the documents assigned
    # to it.
    docs_seen: int = 0
    for topic_idx, words in enumerate(topics):
        print("Topic %2d:\t%s" % (topic_idx, words))
        print("Documents associated with this topic:")
        # Indices of documents in corpus for given topic
        doc_idxs: np.ndarray = np.nonzero(topic_membership == topic_idx)[0]
        for doc_idx in doc_idxs:
            print(corpus[doc_idx])
        docs_seen += len(doc_idxs)
        print("-" * 79)

    # Internal sanity check: argmax assigns each document to exactly one
    # topic, so every document must have been printed exactly once.
    assert docs_seen == len(corpus)
if __name__ == "__main__":
    # Small demo corpus of news sentences used to exercise `main`.
    # Annotated as a variable-length homogeneous tuple: `Tuple[str]` would
    # describe a tuple holding exactly one string.
    corpus: Tuple[str, ...] = (
        "It is the second round of sanctions by the administration after a botched attempt in March 2018 to fatally poison a former Russian military intelligence officer, Sergei Skripal, in the British town of Salisbury.",
        "The attack put Mr. Skripal and his daughter, Yulia, into a coma, and sickened at least three others. One of them, a British woman named Dawn Sturgess, died.",
        "On Monday, the top Democrat and Republican on the House Foreign Affairs Committee sent a joint letter to the White House threatening new congressional action to force the administration’s hand.",
        "But Mr. Skripal was significant in the eyes of one man — Vladimir V. Putin, an intelligence officer of the same age and training.",
        "That was Mr. Skripal’s story, he said: Always looking for side hustles. “By his psychological type, he was a materialist,” Mr. Ivanov said. “He simply loved money.”",
        "Vladimir V. Putin, another midcareer intelligence officer, was living through the same loss of status.",
        "“Moscow is silent,” an officer told him. He would recall that phrase again and again in the years that followed.",
        "“I think it was our Russia people at the C.I.A. who came up with his name,” Mr. Panetta said. “And he was added to the list.”",
        "Mr. Putin was becoming impatient with Mr. Medvedev’s cooperation with Mr. Obama.",
        "By that time the two men were boarding a train at Salisbury station, the first leg of their escape back to Moscow.",
        "As Yulia Skripal went through customs at Heathrow Airport and waited for her luggage, the two men, according to British investigators, were already in Salisbury, carrying out surveillance ahead of the attack.",
        "Surveillance footage of Mr. Skripal in a convenience store the month before he was poisoned in Salisbury, England.",
        "Yulia Skripal had something important to do in England.",
        # NOTE: identical to an earlier entry in this tuple; kept as-is so
        # the demo's input is unchanged.
        "By that time the two men were boarding a train at Salisbury station, the first leg of their escape back to Moscow.",
        "Then last March, Mr. Skripal and his daughter Yulia were poisoned with a rare and deadly strain of nerve agent, known as Novichok, at their home in Britain. It nearly killed them both.",
        "In the years after his resettlement in England, Mr. Skripal lived openly in his adopted city of Salisbury, drinking at the local pubs and grilling sausages in his yard.",
        "Mr. Skripal’s Russian colleagues, though, might have viewed things differently.",
        "The investigation into the Skripal poisoning, known as Operation Wedana, will stand as a high-profile test of an investigative technique Britain has pioneered: accumulating mounds of visual data and sifting through it.",
        "Beyond that, Mr. Bayliss said, “there is a satisfaction of getting to the truth, to be able to prove to the Western world that the Russians did this.”",
        "“It’s a bit like a funnel, the top of the funnel has a vast amount going in, and by the time the liquid comes out at the bottom, it narrows down to a tiny stream,” Mr. Bayliss said.",
    )
    main(corpus)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment