This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# cluster users based on feature vectors | |
import argparse | |
import io | |
import numpy as np | |
import pickle | |
import re | |
import string | |
import sys | |
from sklearn.cluster import MiniBatchKMeans |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Here are the steps I took to give Emacs code completion and PEP8 checking:
1. Install [Marmalade](http://marmalade-repo.org) by adding the following to your `.emacs` file:
(require 'package) | |
(add-to-list 'package-archives | |
'("marmalade" . | |
"http://marmalade-repo.org/packages/")) | |
(package-initialize) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
''' An example of how to use Celery to manage a mix of serial and parallel
tasks. This depends on a running instance of a RabbitMQ messaging server to
keep track of task statuses. The server can be launched on our EC2 instance with:
~/rabbitmq/rabbitmq_server-3.1.3/sbin/rabbitmq-server | |
For this script to work, you first need to run a celery worker process to | |
await orders: | |
$ celery -A tasks worker --loglevel=info | |
Then, you can call any of the functions below (see main for an example). |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
from sklearn import linear_model | |
from sklearn.datasets import fetch_20newsgroups | |
from sklearn.feature_extraction.text import CountVectorizer | |
def print_features(coef, names):
    """Print the non-zero features and their weights, largest weight first.

    Each feature is written on its own line as ``name/weight``, with the
    weight formatted to two decimal places. Features whose weight is exactly
    zero are skipped.

    Args:
        coef: Sequence of numeric weights, indexed in parallel with `names`.
        names: Sequence of feature names; ``names[j]`` labels ``coef[j]``.

    Returns:
        None. The formatted list is written to standard output.
    """
    # argsort is ascending, so reverse it to list the largest weights first.
    order = np.argsort(coef)[::-1]
    lines = ['%s/%.2f' % (names[j], coef[j]) for j in order if coef[j] != 0]
    # A parenthesised single argument prints identically under Python 2 and
    # Python 3, whereas the bare `print "..."` statement is a SyntaxError on 3.
    print("\n".join(lines))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" Example using GenSim's LDA and sklearn. """ | |
import numpy as np | |
from gensim import matutils | |
from gensim.models.ldamodel import LdaModel | |
from sklearn import linear_model | |
from sklearn.datasets import fetch_20newsgroups | |
from sklearn.feature_extraction.text import CountVectorizer |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# show that max_depth affects floating point precision of predict_proba in RandomForest | |
from collections import Counter | |
from sklearn.ensemble import RandomForestClassifier | |
from sklearn.datasets import make_classification | |
# Build a synthetic classification dataset; the fixed random_state seeds the
# generator so repeated runs use the same seed.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=5,
    n_redundant=10,
    random_state=42,
)