Skip to content

Instantly share code, notes, and snippets.

@ZechCodes
Last active September 25, 2020 01:25
Show Gist options
  • Save ZechCodes/8cb3ee3f8cc7ba08637d1f3c5182882f to your computer and use it in GitHub Desktop.
Save ZechCodes/8cb3ee3f8cc7ba08637d1f3c5182882f to your computer and use it in GitHub Desktop.
Messing around with NLP and extracting topics.
"""
Messing around with NLP and extracting topics.
It works acceptably. Results are mediocre, many legal texts get categorized as religious. Generally identifies
sports but they often get categorized as hockey. Likely too small of a training set.
Referenced and based heavily on this article
https://towardsdatascience.com/nlp-extracting-the-main-topics-from-your-dataset-using-lda-in-minutes-21486f5aa925
"""
from sklearn.datasets import fetch_20newsgroups
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
import nltk
import os.path
# One shared lemmatizer: the original built a new WordNetLemmatizer() on every
# call, and this function runs once per token over the whole corpus.
_LEMMATIZER = WordNetLemmatizer()


def lemmatize_stemming(text, stemmer):
    """Lemmatize *text* as a verb, then reduce it with *stemmer*.

    :param text: a single token (str).
    :param stemmer: an NLTK stemmer instance (e.g. SnowballStemmer).
    :return: the stemmed form of the verb-lemmatized token.
    """
    return stemmer.stem(_LEMMATIZER.lemmatize(text, pos='v'))
# Tokenize and lemmatize
def preprocess(text, stemmer):
    """Turn a raw document into a list of normalized tokens.

    Tokenizes with gensim's ``simple_preprocess``, drops stop words and tokens
    of 3 characters or fewer, then lemmatizes and stems what remains.

    :param text: raw document text (str).
    :param stemmer: an NLTK stemmer instance passed through to
        ``lemmatize_stemming``.
    :return: list of processed token strings.
    """
    # simple_preprocess and STOPWORDS are already imported by name at the top
    # of the file; use them directly instead of the fully-qualified paths.
    return [
        lemmatize_stemming(token, stemmer)
        for token in simple_preprocess(text)
        if len(token) > 3 and token not in STOPWORDS
    ]
np.random.seed(400)
stemmer = SnowballStemmer("english")

# WordNet data is required by the lemmatizer on EVERY run — both when training
# and when classifying with a previously saved model — so fetch it before the
# branch below. (The original only downloaded it on the training path, which
# crashes a fresh environment that already has saved artifacts.)
nltk.download('wordnet')

if not os.path.exists("dictionary.txt") or not os.path.exists("lda-model.txt"):
    # --- No saved artifacts: train a model from the 20-newsgroups corpus ---
    print("Get newsgroups training data")
    newsgroups_train = fetch_20newsgroups(subset="train", shuffle=True)
    # NOTE(review): the test split is fetched but never used below.
    newsgroups_test = fetch_20newsgroups(subset="test", shuffle=True)

    print("Process Documents")
    processed_docs = [preprocess(doc, stemmer) for doc in newsgroups_train.data]

    print("Building Dictionary")
    dictionary = gensim.corpora.Dictionary(processed_docs)
    print("- Filter Extremes")
    # Drop tokens appearing in fewer than 15 docs or more than 10% of docs,
    # then keep at most the 100k most frequent of what remains.
    dictionary.filter_extremes(no_below=15, no_above=0.1, keep_n=100000)
    print("- Save Dictionary")
    # Dictionary.save accepts a path directly; no need to open a handle.
    dictionary.save("dictionary.txt")
    print("- Saved Dictionary")

    print("Building Model")
    bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
    lda_model = gensim.models.LdaMulticore(
        bow_corpus,
        num_topics=8,
        id2word=dictionary,
        passes=10,
        workers=2,
    )
    print("- Save Model")
    lda_model.save("lda-model.txt")
    print("- Model Saved")
else:
    # --- Artifacts exist on disk: reuse the trained dictionary and model ---
    print("Loading Dictionary")
    dictionary = gensim.corpora.Dictionary.load("dictionary.txt")
    print("- Dictionary Loaded")
    print("Loading Model")
    lda_model = gensim.models.LdaMulticore.load("lda-model.txt")
    print("- Loaded Model")

print("\nAnalyze Document")
# Text from https://en.wikipedia.org/wiki/2019_Cricket_World_Cup_knockout_stage
unseen_document = ("The knockout stage of the 2019 Cricket World Cup will see "
                   "two semi-finals, with the winners of each progressing to "
                   "the final at Lord's. The first semi-final will be held at "
                   "Old Trafford in Manchester and the second semi-final will "
                   "be held at Edgbaston in Birmingham just as they did back "
                   "in 1999, with all of the knockout games having a reserve "
                   "day. It will be the third time Edgbaston has hosted a "
                   "World Cup semi-final and the fourth semi-final to be held "
                   "at Old Trafford - a record for a World Cup venue. The "
                   "final will be held at Lord's in London for a record fifth "
                   "time.\n\nOn 25 June 2019, Australia became the first team "
                   "to qualify for the semi-finals, after beating England at "
                   "Lord's. India became the second team to qualify for the "
                   "semi-finals, after they defeated Bangladesh at Edgbaston "
                   "on 2 July 2019. The following day saw tournament hosts "
                   "England become the third team to qualify for the "
                   "semi-finals, after they beat New Zealand at the Riverside "
                   "Ground. New Zealand were the fourth and final team to "
                   "qualify for the semi-finals, after Pakistan were unable to "
                   "increase their net run rate sufficiently enough in their "
                   "match against Bangladesh at Lord's.")

# Convert the unseen document into the model's bag-of-words space, then print
# the inferred topic mixture, best-scoring topic first.
bow_vector = dictionary.doc2bow(preprocess(unseen_document, stemmer))
print("Document\n'", unseen_document, "'\n", sep="")
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))
boto==2.49.0
boto3==1.9.183
botocore==1.12.183
certifi==2019.6.16
chardet==3.0.4
docutils==0.14
gensim==3.7.3
idna==2.8
jmespath==0.9.4
joblib==0.13.2
nltk==3.4.4
numpy==1.16.4
pandas==0.24.2
python-dateutil==2.8.0
pytz==2019.1
requests==2.22.0
s3transfer==0.2.1
scikit-learn==0.21.2
scipy==1.3.0
six==1.12.0
sklearn==0.0
smart-open==1.8.4
urllib3==1.25.3
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment