Latent Dirichlet Allocation using gensim
import nltk
import pandas as pd
import re
import pprint
import operator
import csv
import logging
from stop_words import get_stop_words
from collections import defaultdict
from gensim import corpora
from gensim.models import ldamodel
from nltk.stem import WordNetLemmatizer

# constants
STOPWORDS = set(get_stop_words('en'))
CUSTOM_STOPWORDS = {'light', 'lights', 'sky', 'object', 'bright', 'ufo', 'quot'}
pp = pprint.PrettyPrinter(indent=4)
regex_filter = re.compile('[a-z]{2,}')

# put your custom path here if you so choose
nltk.data.path.append('')
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
def tokenize_and_clean(document, stopwords=(), regex=(), lemmatizer=WordNetLemmatizer()):
    """
    :param document: a string representing a single document
    :param stopwords: a set of stopwords
    :param regex: additional regular expressions to use as a filter, compiled prior to the call
    :param lemmatizer: an instance of an nltk lemmatizer
    :return: a tokenized and filtered document
    """
    raw_tokenized = nltk.tokenize.wordpunct_tokenize(document)
    tokenized = []
    for word in raw_tokenized:
        w = word.lower()
        # keep a token only once, if it is not a stopword and matches at least one filter pattern
        if w not in stopwords and any(re.match(exp, w) for exp in regex):
            tokenized.append(lemmatizer.lemmatize(w) if lemmatizer else w)
    return tokenized
def word_frequency(corpus):
    """
    :param corpus: a list of lists representing tokenized documents
    :return: a dict containing the frequency of each word in the corpus, sorted by descending count
    """
    frequency = defaultdict(int)
    for doc in corpus:
        for w in doc:
            frequency[w] += 1
    return dict(sorted(frequency.items(), key=operator.itemgetter(1), reverse=True))
def write_dict_to_csv(data, filepath):
    """
    Encapsulating this in a function - writes a dict to a csv
    :param data: a dict containing your data
    :param filepath: the filepath for your csv file
    """
    with open(filepath, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        for key, value in data.items():
            writer.writerow([key, value])
# reading in the raw file - there are other interesting data that we won't analyze at this time
raw = pd.read_csv('./data/raw.csv', usecols=[7], names=['description'])

# a list holding our document corpus
corpus = []
for i, row in raw.iterrows():
    corpus.append(row.iloc[0])

tokenized_corpus = []
for doc in corpus:
    try:
        tokenized_corpus.append(tokenize_and_clean(document=doc,
                                                   stopwords=STOPWORDS.union(CUSTOM_STOPWORDS),
                                                   regex=[regex_filter]))
    except Exception:
        # skip rows whose description is missing or not a string
        pass
freq = word_frequency(tokenized_corpus)
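# Optional step (an addition, not in the original gist): persist the raw word frequencies
# for manual inspection using the helper defined above; the output path is an assumption.
write_dict_to_csv(freq, 'data/word_frequency.csv')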
# filtering out low-frequency tokens (10 or fewer instances: misspellings, rare words);
# high-frequency words that don't discriminate between documents are handled above via the custom stopword list
tokenized_final = [[token for token in doc if freq[token] > 10] for doc in tokenized_corpus]
# creating a vocabulary of words from this corpus for streaming use
vocabulary = corpora.Dictionary(tokenized_final)
# save to disk
vocabulary.save('data/vocabulary.dict')
print(vocabulary)

# creating a bag-of-words corpus and serializing it to disk in Matrix Market (mm) format
corpus = [vocabulary.doc2bow(text) for text in tokenized_final]
corpora.MmCorpus.serialize('data/ufo.mm', corpus)
ufo_corpus = corpora.MmCorpus('data/ufo.mm')

# train the LDA model in batch mode (update_every=0) with 20 passes over the corpus
lda = ldamodel.LdaModel(corpus=ufo_corpus, alpha='auto', id2word=vocabulary,
                        num_topics=20, update_every=0, passes=20)
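# A rough sketch (an addition, not in the original gist) of one way to sanity-check the
# num_topics=20 choice: gensim's CoherenceModel scores topic interpretability. The 'c_v'
# measure chosen here is an assumption, and availability depends on the installed gensim version.
from gensim.models import CoherenceModel
coherence = CoherenceModel(model=lda, texts=tokenized_final,
                           dictionary=vocabulary, coherence='c_v')
print('topic coherence (c_v): %.4f' % coherence.get_coherence())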
# write the discovered topics to disk, one per line, and log them via the logging config above
with open('data/lda_topics', 'w') as topics_file:
    for topic in lda.print_topics(-1):
        topics_file.write(str(topic) + '\n')
lda.print_topics(-1)
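# A minimal sketch (an addition, not part of the original gist) of how the trained model
# might be reused later: save it to disk, reload it alongside the saved dictionary, and
# infer the topic mixture of a new, unseen sighting description. The file paths and the
# example sentence below are assumptions.
lda.save('data/ufo.lda')

loaded_vocabulary = corpora.Dictionary.load('data/vocabulary.dict')
loaded_lda = ldamodel.LdaModel.load('data/ufo.lda')

new_doc = "a silent triangular craft hovered over the field for several minutes"
new_tokens = tokenize_and_clean(document=new_doc,
                                stopwords=STOPWORDS.union(CUSTOM_STOPWORDS),
                                regex=[regex_filter])
new_bow = loaded_vocabulary.doc2bow(new_tokens)
# get_document_topics returns (topic_id, probability) pairs for this document
pp.pprint(loaded_lda.get_document_topics(new_bow))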