Last active
January 30, 2016 07:30
-
-
Save zackmdavis/35010d361a4884534b03 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
import logging | |
import bleach | |
import textblob | |
import gensim | |
from sklearn.decomposition import PCA | |
from numpy import dot | |
from numpy.linalg import norm | |
logging.basicConfig(level=logging.INFO) | |
def sentencize(comment):
    """Strip all HTML from a raw comment and yield each of its sentences
    as a list of word tokens."""
    stripped = bleach.clean(comment, strip=True, tags=[])
    blob = textblob.TextBlob(stripped)
    for sentence in blob.sentences:
        yield list(sentence.words)
class SentenceIterable:
    """Re-iterable stream of tokenized sentences read from 'LWComments.csv'.

    Implemented as a class with __iter__ (rather than a one-shot generator)
    so the corpus can be traversed more than once, e.g. by Word2Vec training.
    """

    def __init__(self, limit=None):
        # Optional cap on the number of comments to read; None means read all.
        self.limit = limit

    def __iter__(self):
        with open('LWComments.csv') as lw_comments_csv:
            reader = csv.reader(lw_comments_csv)
            # discard the CSV header row—
            next(reader)
            # columns: ['author', 'body', 'id', 'net_votes', 'time',
            #           'url', 'vote_ratio', 'dum', 'avgKarma']
            for comment_count, row in enumerate(reader, start=1):
                body = row[1]
                yield from sentencize(body)
                if comment_count % 1000 == 0:
                    logging.info("extracted sentences from %s comments so far",
                                 comment_count)
                if self.limit is not None and comment_count >= self.limit:
                    break
def build_model(limit=None):
    """Train a Word2Vec model over the sentence corpus, optionally reading
    only the first `limit` comments."""
    corpus = SentenceIterable(limit=limit)
    return gensim.models.Word2Vec(corpus)
def principal_components_analysis(model):
    """Fit and return a PCA over the model's word-vector matrix.

    NOTE(review): `syn0` is the raw word-vector matrix in older gensim
    releases — newer versions expose it as `model.wv.vectors`; confirm
    against the gensim version in use.
    """
    analysis = PCA()
    analysis.fit(model.syn0)
    return analysis
def principal_components(model):
    """Return every word vector expressed in principal-component coordinates."""
    fitted = principal_components_analysis(model)
    return fitted.transform(model.syn0)
def word_spectra(model, how_many, real_word_threshold):
    """For each of the first `how_many` principal components, return the
    vocabulary sorted by its coordinate along that component.

    Words appearing fewer than `real_word_threshold` times are excluded,
    filtering out typos and rare tokens. Each spectrum is a list of
    (word, coordinate) pairs in ascending coordinate order.
    """
    components = principal_components(model)
    frequent_words = [word for word, info in model.vocab.items()
                      if info.count >= real_word_threshold]
    spectra = []
    for i in range(how_many):
        spectrum = [(word, components[model.vocab[word].index][i])
                    for word in frequent_words]
        spectrum.sort(key=lambda pair: pair[1])
        spectra.append(spectrum)
    return spectra
def vector_project(v, onto): | |
return (dot(v, onto) / dot(onto, onto)) * onto | |
def scalar_project(v, onto):
    """Return the scalar projection (signed length of the projection)
    of `v` onto the direction of `onto`."""
    alignment = dot(v, onto)
    return alignment / norm(onto)
def bipolar_spectrum(model, one_word, another_word):
    """Rank the whole vocabulary along the axis running from `one_word`
    to `another_word`.

    Returns (word, projected_vector) pairs sorted by each word's signed
    position along that axis.
    """
    axis = model[another_word] - model[one_word]
    pairs = [(word, vector_project(model[word], onto=axis))
             for word in model.vocab]

    def position(pair):
        # Sort by where the projected vector falls along the axis.
        return scalar_project(pair[1], axis)

    return sorted(pairs, key=position)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment