Skip to content

Instantly share code, notes, and snippets.

@dmd

dmd/-

Created October 5, 2016 16:59
Show Gist options
  • Select an option

  • Save dmd/38070a18397c9fabeb45cf8bdfd3f572 to your computer and use it in GitHub Desktop.

Select an option

Save dmd/38070a18397c9fabeb45cf8bdfd3f572 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# coding: utf-8
# Read in all the word frequencies by person.
import re
import sys
from collections import defaultdict, Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
import numpy as np
from itertools import chain
import codecs
import time
# Source IRC log to learn from.
filename = 'allmefitapes.txt'

# nick -> {word: count} frequency table.
freqs = defaultdict(dict)

# regexp to parse irclog lines like "12:34 <@nick> message text"
# group(1) = nick, group(2) = message body.
r = re.compile(r'\d\d:\d\d *<[@ ]?(.*?)> (.*)', re.UNICODE)

# regexp to clean lines into tokens: any run of non-word chars (plus '_').
tokenizer = re.compile(r'[\W_]+')

# Create the frequency table.
with open(filename) as infile:
    for line in infile:
        try:
            nick, text = r.search(line.rstrip()).group(1, 2)
            # BUG FIX: original cleaned `text` into `nick`, clobbering the
            # nick with the whole message body; clean the nick itself.
            nick = tokenizer.sub('', nick.lower())
            text = tokenizer.sub(' ', text.lower())
            for word in text.split():
                freqs[nick][word] = freqs[nick].get(word, 0) + 1
        except AttributeError:
            # r.search() returned None: not a chat line (join/part/etc.).
            pass
# Prune rare vocabulary: keep only words a person used strictly more than
# MINUSE times (words used MINUSE times or fewer are dropped as noise).
MINUSE = 50
freqs = {
    person: {word: count for word, count in counts.items() if count > MINUSE}
    for person, counts in freqs.items()
}
# sklearn expects one "document" per class, so expand each nick's frequency
# table back into a space-separated bag of words (Counter.elements repeats
# each word `count` times). One document per nick, labelled by the nick.
nicks = sorted(freqs)
documents = [' '.join(Counter(freqs[nick]).elements()) for nick in nicks]
trainlines = '\n'.join(documents)

word_vectorizer = CountVectorizer(analyzer='word', stop_words='english')
trainset = word_vectorizer.fit_transform(trainlines.split('\n'))

# Linear SVM, one-vs-rest: learns which words separate each nick from the rest.
svc = svm.LinearSVC()
svc.fit(trainset, nicks)
def most_informative_feature_for_class(vectorizer, classifier, classlabel, n=10):
    """Print the n features with the largest coefficients for classlabel.

    One line per feature: "<label>\\t<feature>\\t<coefficient>", highest
    coefficient first.
    """
    labelid = list(classifier.classes_).index(classlabel)
    feature_names = vectorizer.get_feature_names()
    ranked = sorted(zip(classifier.coef_[labelid], feature_names), reverse=True)
    for coef, feat in ranked[:n]:
        print('<{}>\t{}\t{}'.format(classlabel, feat, coef))
# Show the 2 most distinctive words for every nick, blank line between nicks.
for nick in nicks:
    most_informative_feature_for_class(word_vectorizer, svc, nick, 2)
    # BUG FIX: bare `print` is a Python 2 statement; under Python 3 it is a
    # silent no-op expression. print('') emits a blank line under both.
    print('')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment