Skip to content

Instantly share code, notes, and snippets.

View glouppe's full-sized avatar

Gilles Louppe glouppe

View GitHub Profile
import numpy
import random
from sklearn.datasets import fetch_mldata
# Load the 'MNIST original' dataset through scikit-learn's fetch_mldata.
mnist = fetch_mldata('MNIST original')
# Define training and testing sets
# A random 10% of the sample indices is held out for testing; the remaining
# indices form the training set. No seed is set in this snippet, so the split
# differs between runs unless the caller seeds the `random` module first.
inds = numpy.arange(len(mnist.data))
# `range` (not the Python 2-only `xrange`) keeps this line working on both
# Python 2 and Python 3; random.sample accepts either.
test_i = random.sample(range(len(inds)), int(0.1*len(inds)))
# inds is 0..n-1, so deleting at positions test_i leaves exactly the indices
# that were not drawn for the test set.
train_i = numpy.delete(inds, test_i)
# Features and labels of the training rows, both cast to double precision.
X_train = mnist.data[train_i].astype(numpy.double)
y_train = mnist.target[train_i].astype(numpy.double)
@glouppe
glouppe / nearest_developers.py
Last active December 23, 2015 21:39
Generate a sparse matrix such that rows=users, columns=filenames and data[i, j]=number of commits of user i on file j, and then find the 3 nearest neighbors of each scikit-learn contributor.
import numpy as np
import os
from collections import defaultdict
from git import Repo
from scipy.sparse import csc_matrix
# Absolute path to a local checkout of scikit-learn's `sklearn/` package.
# NOTE(review): hard-coded to one developer's home directory; presumably the
# tree whose commit history is analysed (usage not visible in this excerpt).
path = "/home/gilles/Sources/scikit-learn/sklearn/"
# File extensions of interest: Python plus Cython source/declaration files.
# Presumably used to filter which files are counted -- TODO confirm against
# the rest of the script, which is not shown here.
extensions = ["py", "pyx", "pxd"]
@glouppe
glouppe / beard_disambiguation.py
Created January 7, 2015 15:49
Disambiguation prototype
import numpy as np
import argparse
import cPickle
import scipy.cluster.hierarchy as hac
from itertools import groupby
from itertools import product
from scipy.sparse import lil_matrix
from scipy.sparse import issparse
from scipy.spatial.distance import squareform
import sys
sys.path.append("/usr/lib/python2.7/dist-packages/")
sys.path.append("/usr/local/lib/python2.7/dist-packages/")
import string
import re
from joblib import Parallel, delayed
from invenio.dbquery import run_sql
from invenio.bibauthorid_dbinterface import get_title_of_paper
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.datasets import fetch_mldata
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from rep.estimators import TMVAClassifier
from functools import partial
import multiprocessing as mp
import queue
import threading
def buffered_gen_mp(source_gen, buffer_size=2):
"""
Generator that runs a slow source generator in a separate process.
buffer_size: the maximal number of items to pre-generate (length of the buffer)
"""
if buffer_size < 2:
We can't make this file beautiful and searchable because it's too large.
SMILES,CHEM_ID
COC1:C:C(C2C3=C(CCCC3=O)N(C3:C:C:C(C):C:C:3)C3=C2C(=O)CCC3):C([N+](=O)[O-]):C:C:1OC,Chem_1
O=C1NC(N2CCCCC2)=NC1=CC1:C:C:C:S:1,Chem_2
COC1:C:C(C=C2C(=O)N(C(=O)C3:C:C:C(Cl):C:C:3)N=C2C):C:C(OC):C:1OC,Chem_3
CC#CC(O)(C(=O)OC1CCN(C)CC1)C1CCCCC1,Chem_4
COC1:C:C:C(N=C(C)C(C)=NC2:C:C:C(OC):C:C:2):C:C:1,Chem_5
CSC1:N:C(O):C(C#N):C(C2:C:C:C(C):C:C:2):N:1,Chem_6
CSC1:N:C(C2:C:C:C:C:C:2):N:C(N2CCOCC2):[S+]:1.[IH2+],Chem_7
CC1:C:C:C(C=C2N=C(NN=CC(O)C(O)C(O)CO)NC2=O):C:C:1,Chem_8
CN(C)C(C1=C(O)C(C2:C:C:C:C:C:2)N(C2:C:C:C:C:C:2)C1=O)N1CCOCC1,Chem_9