Skip to content

Instantly share code, notes, and snippets.

@dmd

dmd/-

Created October 5, 2016 16:59
Show Gist options
  • Select an option

  • Save dmd/38070a18397c9fabeb45cf8bdfd3f572 to your computer and use it in GitHub Desktop.

Select an option

Save dmd/38070a18397c9fabeb45cf8bdfd3f572 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# coding: utf-8
# Read in all the word frequencies by person.
import re
import sys
from collections import defaultdict, Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
import numpy as np
from itertools import chain
import codecs
import time
# Source IRC log to learn from.
filename = 'allmefitapes.txt'

# nick -> {word: count} frequency table.
freqs = defaultdict(dict)

# regexp to parse irclog lines like "12:34 <@nick> message text"
# group(1) = nick, group(2) = message body.
r = re.compile(r'\d\d:\d\d *<[@ ]?(.*?)> (.*)', re.UNICODE)

# regexp to clean lines into tokens: any run of non-word chars (plus '_').
tokenizer = re.compile(r'[\W_]+')

# Create the frequency table.
with open(filename) as infile:
    for line in infile:
        try:
            nick, text = r.search(line.rstrip()).group(1, 2)
            # BUG FIX: original cleaned `text` into `nick`, clobbering the
            # nick with the whole message body; clean the nick itself.
            nick = tokenizer.sub('', nick.lower())
            text = tokenizer.sub(' ', text.lower())
            for word in text.split():
                freqs[nick][word] = freqs[nick].get(word, 0) + 1
        except AttributeError:
            # r.search() returned None: not a chat line (join/part/etc.).
            pass
# Prune rare vocabulary: keep only words a person used strictly more than
# MINUSE times (words used MINUSE times or fewer are dropped as noise).
MINUSE = 50
freqs = {
    person: {word: count for word, count in counts.items() if count > MINUSE}
    for person, counts in freqs.items()
}
# sklearn expects one "document" per class, so expand each nick's frequency
# table back into a space-separated bag of words (Counter.elements repeats
# each word `count` times). One document per nick, labelled by the nick.
nicks = sorted(freqs)
documents = [' '.join(Counter(freqs[nick]).elements()) for nick in nicks]
trainlines = '\n'.join(documents)

word_vectorizer = CountVectorizer(analyzer='word', stop_words='english')
trainset = word_vectorizer.fit_transform(trainlines.split('\n'))

# Linear SVM, one-vs-rest: learns which words separate each nick from the rest.
svc = svm.LinearSVC()
svc.fit(trainset, nicks)
def most_informative_feature_for_class(vectorizer, classifier, classlabel, n=10):
    """Print the n features with the largest coefficients for classlabel.

    One line per feature: "<label>\\t<feature>\\t<coefficient>", highest
    coefficient first.
    """
    labelid = list(classifier.classes_).index(classlabel)
    feature_names = vectorizer.get_feature_names()
    ranked = sorted(zip(classifier.coef_[labelid], feature_names), reverse=True)
    for coef, feat in ranked[:n]:
        print('<{}>\t{}\t{}'.format(classlabel, feat, coef))
# Show the 2 most distinctive words for every nick, blank line between nicks.
for nick in nicks:
    most_informative_feature_for_class(word_vectorizer, svc, nick, 2)
    # BUG FIX: bare `print` is a Python 2 statement; under Python 3 it is a
    # silent no-op expression. print('') emits a blank line under both.
    print('')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment