Last active
September 25, 2020 01:25
-
-
Save ZechCodes/8cb3ee3f8cc7ba08637d1f3c5182882f to your computer and use it in GitHub Desktop.
Messing around with NLP and extracting topics.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Messing around with NLP and extracting topics. | |
It works acceptably. Results are mediocre, many legal texts get categorized as religious. Generally identifies | |
sports but they often get categorized as hockey. Likely too small of a training set. | |
Referenced and based heavily on this article | |
https://towardsdatascience.com/nlp-extracting-the-main-topics-from-your-dataset-using-lda-in-minutes-21486f5aa925 | |
""" | |
import os.path

import gensim
import nltk
import numpy as np
from gensim.parsing.preprocessing import STOPWORDS
from gensim.utils import simple_preprocess
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from sklearn.datasets import fetch_20newsgroups
# Shared lemmatizer instance: the original built a new WordNetLemmatizer()
# on every call, i.e. once per token in the preprocessing hot loop.
# NOTE(review): requires the 'wordnet' corpus to have been downloaded
# via nltk.download('wordnet') before first use.
_LEMMATIZER = WordNetLemmatizer()


def lemmatize_stemming(text, stemmer):
    """Lemmatize a single token as a verb, then stem the result.

    Args:
        text: One token (word) to normalize.
        stemmer: An NLTK stemmer (e.g. SnowballStemmer) whose ``stem``
            method is applied to the lemmatized token.

    Returns:
        The stemmed, lemmatized token as a string.
    """
    return stemmer.stem(_LEMMATIZER.lemmatize(text, pos='v'))
# Tokenize and lemmatize
def preprocess(text, stemmer):
    """Tokenize a document and normalize each surviving token.

    Tokens of three characters or fewer, and gensim stop words, are
    discarded; everything else is lemmatized and stemmed.

    Args:
        text: Raw document text.
        stemmer: An NLTK stemmer forwarded to ``lemmatize_stemming``.

    Returns:
        A list of normalized token strings.
    """
    # Use the names imported at the top of the file directly; the
    # original inconsistently spelled out the fully-qualified gensim
    # paths despite importing simple_preprocess and STOPWORDS by name.
    return [
        lemmatize_stemming(token, stemmer)
        for token in simple_preprocess(text)
        if len(token) > 3 and token not in STOPWORDS
    ]
# Fixed seed so repeated runs train/score deterministically.
np.random.seed(400)
stemmer = SnowballStemmer("english")

# WordNet data is required by the lemmatizer on BOTH paths (retraining
# AND loading a saved model — see the preprocess() call at the bottom),
# so fetch it unconditionally. The original only downloaded it inside
# the retraining branch, which broke the load-from-disk path on a
# machine without the corpus. nltk.download is a no-op if already present.
nltk.download('wordnet')

# Retrain only if either artifact is missing; otherwise reuse the saved
# dictionary and LDA model.
if not os.path.exists("dictionary.txt") or not os.path.exists("lda-model.txt"):
    print("Get newsgroups training data")
    newsgroups_train = fetch_20newsgroups(subset="train", shuffle=True)
    print("Process Documents")
    processed_docs = [preprocess(doc, stemmer) for doc in newsgroups_train.data]
    print("Building Dictionary")
    dictionary = gensim.corpora.Dictionary(processed_docs)
    print("- Filter Extremes")
    # Drop tokens appearing in <15 documents or in >10% of documents,
    # then cap the vocabulary at the 100k most frequent survivors.
    dictionary.filter_extremes(no_below=15, no_above=0.1, keep_n=100000)
    print("- Save Dictionary")
    with open("dictionary.txt", "wb") as write_to:
        dictionary.save(write_to)
    print("- Saved Dictionary")
    print("Building Model")
    bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
    lda_model = gensim.models.LdaMulticore(
        bow_corpus,
        num_topics=8,
        id2word=dictionary,
        passes=10,
        workers=2
    )
    print("- Save Model")
    lda_model.save("lda-model.txt")
    print("- Model Saved")
else:
    print("Loading Dictionary")
    dictionary = gensim.corpora.Dictionary.load("dictionary.txt")
    print("- Dictionary Loaded")
    print("Loading Model")
    lda_model = gensim.models.LdaMulticore.load("lda-model.txt")
    print("- Loaded Model")

print("\nAnalyze Document")
# Text from https://en.wikipedia.org/wiki/2019_Cricket_World_Cup_knockout_stage
unseen_document = ("The knockout stage of the 2019 Cricket World Cup will see "
                   "two semi-finals, with the winners of each progressing to "
                   "the final at Lord's. The first semi-final will be held at "
                   "Old Trafford in Manchester and the second semi-final will "
                   "be held at Edgbaston in Birmingham just as they did back "
                   "in 1999, with all of the knockout games having a reserve "
                   "day. It will be the third time Edgbaston has hosted a "
                   "World Cup semi-final and the fourth semi-final to be held "
                   "at Old Trafford - a record for a World Cup venue. The "
                   "final will be held at Lord's in London for a record fifth "
                   "time.\n\nOn 25 June 2019, Australia became the first team "
                   "to qualify for the semi-finals, after beating England at "
                   "Lord's. India became the second team to qualify for the "
                   "semi-finals, after they defeated Bangladesh at Edgbaston "
                   "on 2 July 2019. The following day saw tournament hosts "
                   "England become the third team to qualify for the "
                   "semi-finals, after they beat New Zealand at the Riverside "
                   "Ground. New Zealand were the fourth and final team to "
                   "qualify for the semi-finals, after Pakistan were unable to "
                   "increase their net run rate sufficiently enough in their "
                   "match against Bangladesh at Lord's.")
bow_vector = dictionary.doc2bow(preprocess(unseen_document, stemmer))
print("Document\n'", unseen_document, "'\n", sep="")
# Score the document against every topic and report them best-match first.
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
boto==2.49.0
boto3==1.9.183
botocore==1.12.183
certifi==2019.6.16
chardet==3.0.4
docutils==0.14
gensim==3.7.3
idna==2.8
jmespath==0.9.4
joblib==0.13.2
nltk==3.4.4
numpy==1.16.4
pandas==0.24.2
python-dateutil==2.8.0
pytz==2019.1
requests==2.22.0
s3transfer==0.2.1
scikit-learn==0.21.2
scipy==1.3.0
six==1.12.0
sklearn==0.0
smart-open==1.8.4
urllib3==1.25.3
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.