# splunk search -> data -> topic modeling
# xander johnson
from __future__ import division  # python 2, so old school
# Splunk SDK
import splunklib.client as client
import splunklib.results as results
import numpy as np
import pandas as pd
# Topic Modeling
import gensim
import pyLDAvis
import pyLDAvis.gensim as pg
# Just for the stop words
import nltk
# Standard Libraries
import collections
import re
HOST = "conf-dcr4.splunkoxygen.com"
PORT = 8089
USERNAME = "Johnson"
PASSWORD = "clear text passwords are the best"
# Create a Service instance and log in
service = client.connect(
    host=HOST,
    port=PORT,
    username=USERNAME,
    password=PASSWORD)
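# Optional sanity check (not in the original gist): confirm the login worked
# by listing the apps this account can see.
# for app in service.apps:
#     print app.name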
kwargs_export = {"earliest_time": "-1mon",
                 "latest_time": "now",
                 "search_mode": "normal"}
# Triple-quoted string so the SPL query can span multiple lines.
# xmlkv extracts fields from the XML payload; the rex/sed pass strips
# XML/HTML entities (anything between "&" and ";") out of the detail field.
searchquery_export = """search sourcetype=*session*
| xmlkv
| search detail=*
| rex mode=sed field=detail "s/&.+?;//g"
| stats count by detail"""
exportsearch_results = service.jobs.export(searchquery_export, **kwargs_export)
# Get the results and display them using the ResultsReader
reader = results.ResultsReader(exportsearch_results)
# Print whether results are a preview from a running search
print "is_preview = %s " % reader.is_preview
# change detail to the name of the field with the text
data = [r['detail'] for r in reader
        if isinstance(r, dict)]
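# Optional (not in the original): quick look at how much came back
# before cleaning it up.
# print "retrieved %d events" % len(data)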
# scrub a dub dub
#
# Stopwords, tokenization, lemmatization, oh my!
# Stop words are words like "of", "the", "an", "a", etc.
# We're going to want to remove them.
# Stop list file from http://www.ranks.nl/stopwords + nltk
with open('./stopword.txt', 'r') as stopword_file:
    stopwords_raw = stopword_file.read()
stopwords_list = stopwords_raw.split()
stopwords_list = stopwords_list + nltk.corpus.stopwords.words('english')
stopwords = list(set(stopwords_list))
# leftovers that somehow made it through in the end (e.g. the lemmatizer
# turns 'was' into 'wa'), plus people misspell things
stopwords.append('wa')
stopwords.append('ha')
stopwords.append('le')
stopwords.append('u')
stopwords.append('splunk')  # every topic would have this
stopwords.append('customer')
stopwords.append('data')
# len(stopwords)  # yum
def scrub(text):
    lines = text.splitlines()
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(' '.join(lines))
    # Lemmatization turns things like "running" into "run"
    lemmatizer = nltk.stem.WordNetLemmatizer()
    clean_tokens = [lemmatizer.lemmatize(token.lower())
                    for token in tokens if re.search(ur'^[a-zA-Z]+', token)]
    # remove those stop words
    clean = [w for w in clean_tokens if w not in stopwords]
    return clean
clean_tokens = scrub(' '.join(data))
df = pd.DataFrame(data, columns=['abstract'])
df['clean'] = df.abstract.map(lambda x: ' '.join(scrub(x)))
# positional args here: Python 2's str.decode doesn't accept keyword arguments
df.clean = df.clean.map(lambda x: x.decode('utf8', 'ignore'))
# the actual topic modeling part
all_text = [d.split() for d in df.clean]  # aka "texts"
gensim_d = gensim.corpora.Dictionary(all_text)  # aka "dictionary"
corpus = [gensim_d.doc2bow(text) for text in all_text]
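# Optional tweak (an assumption, not part of the original run): pruning very
# rare and very common tokens often sharpens the topics. If you use it,
# rebuild the corpus afterwards.
# gensim_d.filter_extremes(no_below=5, no_above=0.5)
# corpus = [gensim_d.doc2bow(text) for text in all_text]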
# try changing num_topics around a bit
lda = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=gensim_d,
    num_topics=8,
    update_every=1,
    chunksize=100,
    passes=1)
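# Optional: peek at the learned topics as plain text before firing up pyLDAvis.
# show_topics() returns formatted "weight*word + ..." strings; the exact return
# shape varies a bit across gensim versions, so treat this as a sketch.
# for topic in lda.show_topics(num_topics=8, num_words=10):
#     print topic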
vis = pg.prepare(lda, corpus, gensim_d)
# jupyter notebook display
# import matplotlib.pyplot as plt
# %matplotlib inline
# pyLDAvis.display(vis)
# or just save it as a file
with open('./conftopics_detail.html', 'w+') as f:
    pyLDAvis.save_html(vis, f)