Skip to content

Instantly share code, notes, and snippets.

@ZechCodes
Last active September 25, 2020 01:25
Show Gist options
  • Save ZechCodes/8cb3ee3f8cc7ba08637d1f3c5182882f to your computer and use it in GitHub Desktop.
Save ZechCodes/8cb3ee3f8cc7ba08637d1f3c5182882f to your computer and use it in GitHub Desktop.
Messing around with NLP and extracting topics.
"""
Messing around with NLP and extracting topics.
It works acceptably. Results are mediocre, many legal texts get categorized as religious. Generally identifies
sports but they often get categorized as hockey. Likely too small of a training set.
Referenced and based heavily on this article
https://towardsdatascience.com/nlp-extracting-the-main-topics-from-your-dataset-using-lda-in-minutes-21486f5aa925
"""
from sklearn.datasets import fetch_20newsgroups
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
import nltk
import os.path
# One shared lemmatizer: the original built a new WordNetLemmatizer() on every
# call, and this function runs once per token over the whole corpus.
_LEMMATIZER = WordNetLemmatizer()


def lemmatize_stemming(text, stemmer):
    """Lemmatize *text* as a verb, then reduce it with *stemmer*.

    :param text: a single token (str).
    :param stemmer: an NLTK stemmer instance (e.g. SnowballStemmer).
    :return: the stemmed form of the verb-lemmatized token.
    """
    return stemmer.stem(_LEMMATIZER.lemmatize(text, pos='v'))
# Tokenize and lemmatize
def preprocess(text, stemmer):
    """Turn a raw document into a list of normalized tokens.

    Tokenizes with gensim's ``simple_preprocess``, drops stop words and tokens
    of 3 characters or fewer, then lemmatizes and stems what remains.

    :param text: raw document text (str).
    :param stemmer: an NLTK stemmer instance passed through to
        ``lemmatize_stemming``.
    :return: list of processed token strings.
    """
    # simple_preprocess and STOPWORDS are already imported by name at the top
    # of the file; use them directly instead of the fully-qualified paths.
    return [
        lemmatize_stemming(token, stemmer)
        for token in simple_preprocess(text)
        if len(token) > 3 and token not in STOPWORDS
    ]
np.random.seed(400)
stemmer = SnowballStemmer("english")

# WordNet data is required by the lemmatizer on EVERY run — both when training
# and when classifying with a previously saved model — so fetch it before the
# branch below. (The original only downloaded it on the training path, which
# crashes a fresh environment that already has saved artifacts.)
nltk.download('wordnet')

if not os.path.exists("dictionary.txt") or not os.path.exists("lda-model.txt"):
    # --- No saved artifacts: train a model from the 20-newsgroups corpus ---
    print("Get newsgroups training data")
    newsgroups_train = fetch_20newsgroups(subset="train", shuffle=True)
    # NOTE(review): the test split is fetched but never used below.
    newsgroups_test = fetch_20newsgroups(subset="test", shuffle=True)

    print("Process Documents")
    processed_docs = [preprocess(doc, stemmer) for doc in newsgroups_train.data]

    print("Building Dictionary")
    dictionary = gensim.corpora.Dictionary(processed_docs)
    print("- Filter Extremes")
    # Drop tokens appearing in fewer than 15 docs or more than 10% of docs,
    # then keep at most the 100k most frequent of what remains.
    dictionary.filter_extremes(no_below=15, no_above=0.1, keep_n=100000)
    print("- Save Dictionary")
    # Dictionary.save accepts a path directly; no need to open a handle.
    dictionary.save("dictionary.txt")
    print("- Saved Dictionary")

    print("Building Model")
    bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
    lda_model = gensim.models.LdaMulticore(
        bow_corpus,
        num_topics=8,
        id2word=dictionary,
        passes=10,
        workers=2,
    )
    print("- Save Model")
    lda_model.save("lda-model.txt")
    print("- Model Saved")
else:
    # --- Artifacts exist on disk: reuse the trained dictionary and model ---
    print("Loading Dictionary")
    dictionary = gensim.corpora.Dictionary.load("dictionary.txt")
    print("- Dictionary Loaded")
    print("Loading Model")
    lda_model = gensim.models.LdaMulticore.load("lda-model.txt")
    print("- Loaded Model")

print("\nAnalyze Document")
# Text from https://en.wikipedia.org/wiki/2019_Cricket_World_Cup_knockout_stage
unseen_document = ("The knockout stage of the 2019 Cricket World Cup will see "
                   "two semi-finals, with the winners of each progressing to "
                   "the final at Lord's. The first semi-final will be held at "
                   "Old Trafford in Manchester and the second semi-final will "
                   "be held at Edgbaston in Birmingham just as they did back "
                   "in 1999, with all of the knockout games having a reserve "
                   "day. It will be the third time Edgbaston has hosted a "
                   "World Cup semi-final and the fourth semi-final to be held "
                   "at Old Trafford - a record for a World Cup venue. The "
                   "final will be held at Lord's in London for a record fifth "
                   "time.\n\nOn 25 June 2019, Australia became the first team "
                   "to qualify for the semi-finals, after beating England at "
                   "Lord's. India became the second team to qualify for the "
                   "semi-finals, after they defeated Bangladesh at Edgbaston "
                   "on 2 July 2019. The following day saw tournament hosts "
                   "England become the third team to qualify for the "
                   "semi-finals, after they beat New Zealand at the Riverside "
                   "Ground. New Zealand were the fourth and final team to "
                   "qualify for the semi-finals, after Pakistan were unable to "
                   "increase their net run rate sufficiently enough in their "
                   "match against Bangladesh at Lord's.")

# Convert the unseen document into the model's bag-of-words space, then print
# the inferred topic mixture, best-scoring topic first.
bow_vector = dictionary.doc2bow(preprocess(unseen_document, stemmer))
print("Document\n'", unseen_document, "'\n", sep="")
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))
boto==2.49.0
boto3==1.9.183
botocore==1.12.183
certifi==2019.6.16
chardet==3.0.4
docutils==0.14
gensim==3.7.3
idna==2.8
jmespath==0.9.4
joblib==0.13.2
nltk==3.4.4
numpy==1.16.4
pandas==0.24.2
python-dateutil==2.8.0
pytz==2019.1
requests==2.22.0
s3transfer==0.2.1
scikit-learn==0.21.2
scipy==1.3.0
six==1.12.0
sklearn==0.0
smart-open==1.8.4
urllib3==1.25.3
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment