-
-
Save dmd/38070a18397c9fabeb45cf8bdfd3f572 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| # coding: utf-8 | |
| # Read in all the word frequencies by person. | |
| import re | |
| import sys | |
| from collections import defaultdict, Counter | |
| from sklearn.feature_extraction.text import CountVectorizer | |
| from sklearn.naive_bayes import MultinomialNB | |
| from sklearn import svm | |
| import numpy as np | |
| from itertools import chain | |
| import codecs | |
| import time | |
# Build a per-person word-frequency table from an IRC log.
filename = 'allmefitapes.txt'

# freqs[nick][word] -> number of times `nick` used `word`.
freqs = defaultdict(dict)

# Parse irclog lines of the form "HH:MM <nick> message".
r = re.compile(r'\d\d:\d\d *<[@ ]?(.*?)> (.*)', re.UNICODE)
# Collapse runs of non-word characters (underscore included) when tokenizing.
tokenizer = re.compile(r'[\W_]+')

# Create the frequency table.
with open(filename) as infile:
    for line in infile:
        m = r.search(line.rstrip())
        if m is None:
            # Not a chat line (join/part/topic/etc.) -- skip it.
            continue
        nick, text = m.group(1, 2)
        # BUG FIX: the original normalized `text` into `nick`, collapsing
        # every speaker into garbage keys; normalize the nick itself.
        nick = tokenizer.sub('', nick.lower())
        text = tokenizer.sub(' ', text.lower())
        for word in text.split():
            freqs[nick][word] = freqs[nick].get(word, 0) + 1
# Prune rare words: keep only words a person used MORE than MINUSE times.
# (Words used exactly MINUSE times are dropped too -- the filter is strict.)
MINUSE = 50
freqs = {person: {word: count for word, count in words.items() if count > MINUSE}
         for person, words in freqs.items()}
# sklearn's CountVectorizer wants one document per person.  Rebuild each
# person's "document" by repeating every surviving word `count` times via
# Counter.elements() -- no need for the join('\n')/split('\n') round-trip.
nicks = sorted(freqs.keys())
traindocs = [' '.join(Counter(freqs[nick]).elements()) for nick in nicks]

word_vectorizer = CountVectorizer(analyzer='word', stop_words='english')
trainset = word_vectorizer.fit_transform(traindocs)

# One class per nick: a linear SVM distinguishes people by vocabulary usage.
svc = svm.LinearSVC()
svc.fit(trainset, nicks)
def most_informative_feature_for_class(vectorizer, classifier, classlabel, n=10):
    """Print the n highest-weighted features for one class of a linear model.

    vectorizer:  fitted vectorizer supplying the feature (word) names.
    classifier:  fitted linear classifier exposing .classes_ and .coef_.
    classlabel:  the class (nick) whose strongest indicator words to show.
    n:           how many features to print (default 10).

    Output: one line per feature, "<label>\tword\tweight", highest weight first.
    """
    labelid = list(classifier.classes_).index(classlabel)
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement get_feature_names_out(), falling back for old versions.
    try:
        feature_names = vectorizer.get_feature_names_out()
    except AttributeError:
        feature_names = vectorizer.get_feature_names()
    topn = sorted(zip(classifier.coef_[labelid], feature_names), reverse=True)[:n]
    for coef, feat in topn:
        print('<{}>\t{}\t{}'.format(classlabel, feat, coef))
# Show the two strongest indicator words for every participant.
for person in nicks:
    most_informative_feature_for_class(word_vectorizer, svc, person, n=2)
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.