A personal diary of DataFrame munging over the years.
Convert Series datatype to numeric (will error if column has non-numeric values)
(h/t @makmanalp)
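A minimal sketch of that conversion, assuming a recent pandas; the DataFrame and column name below are hypothetical.
import pandas as pd

df = pd.DataFrame({"col": ["1", "2", "3"]})
# pd.to_numeric raises ValueError by default if any value cannot be parsed.
df["col"] = pd.to_numeric(df["col"])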
from sklearn.datasets import fetch_20newsgroups, load_digits
from sklearn.feature_extraction.text import TfidfVectorizer
# sklearn.cross_validation was removed in scikit-learn 0.20; use model_selection
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn import metrics

newsgroups_train = fetch_20newsgroups(subset='train')
vectorizer = TfidfVectorizer(encoding='latin-1', max_features=10000)
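A sketch of how these imports might be used, assuming the standard 20 newsgroups workflow; the continuation below is illustrative, not from the original snippet.
# Vectorize the training and test splits, fit a Naive Bayes baseline, and score.
X_train = vectorizer.fit_transform(newsgroups_train.data)
y_train = newsgroups_train.target

newsgroups_test = fetch_20newsgroups(subset='test')
X_test = vectorizer.transform(newsgroups_test.data)

clf = MultinomialNB(alpha=0.01)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
print(metrics.f1_score(newsgroups_test.target, pred, average='macro'))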
""" | |
A deep neural network with or w/o dropout in one file. | |
""" | |
import numpy | |
import theano | |
import sys | |
import math | |
from theano import tensor as T | |
from theano import shared |
""" | |
A deep neural network with or w/o dropout in one file. | |
License: Do What The Fuck You Want to Public License http://www.wtfpl.net/ | |
""" | |
import numpy, theano, sys, math | |
from theano import tensor as T | |
from theano import shared | |
from theano.tensor.shared_randomstreams import RandomStreams |
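Not the original network, but a minimal sketch of the dropout mechanism the header describes, using the RandomStreams import above.
srng = RandomStreams(seed=42)

def dropout(x, p=0.5):
    # Keep each unit with probability 1 - p and rescale (inverted dropout),
    # so no rescaling is needed at test time.
    mask = srng.binomial(n=1, p=1 - p, size=x.shape,
                         dtype=theano.config.floatX)
    return x * mask / (1 - p)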
For the Forest Cover Type Prediction competition on Kaggle, the goal is to predict the predominant type of tree cover in a given section of forest. The score is based on average classification accuracy over the 7 tree cover classes.
To beat the all-fir/spruce benchmark I obviously tried a random forest. Using the default settings of scikit-learn's RandomForestClassifier, I was able to beat the benchmark with an accuracy score of 0.72718 on the competition leaderboard. By using 100 estimators (versus the default of 10), I was able to raise that score to 0.75455.
Using pandas, I loaded the train and test data sets into Python.
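A sketch of that workflow; the file names and column layout are assumptions based on the standard Kaggle download, not taken from the original post.
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

train = pd.read_csv("train.csv")  # assumed file names from the Kaggle download
test = pd.read_csv("test.csv")

X = train.drop(["Id", "Cover_Type"], axis=1)
y = train["Cover_Type"]

clf = RandomForestClassifier(n_estimators=100)  # the default was 10 at the time
clf.fit(X, y)
pred = clf.predict(test.drop("Id", axis=1))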
import numpy as np
#from scipy.special import chdtrc
from scipy.sparse import spdiags
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelBinarizer

def _chisquare(f_obs, f_exp, reduce):
    """Replacement for scipy.stats.chisquare with custom reduction."""
    # The original body is truncated; the lines below are an assumed minimal
    # completion that computes the per-cell statistic and applies the caller's
    # reduction in place of scipy's sum.
    f_obs = np.asarray(f_obs, dtype=np.float64)
    chisq = (f_obs - f_exp) ** 2 / f_exp
    return reduce(chisq)
import seaborn as sns | |
from scipy.optimize import curve_fit | |
# Function for linear fit | |
def func(x, a, b): | |
return a + b * x | |
# Seaborn conveniently provides the data for | |
# Anscombe's quartet. | |
df = sns.load_dataset("anscombe") |
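A plausible continuation, fitting the linear model to each of the four Anscombe datasets with curve_fit; the grouping assumes seaborn's standard 'dataset', 'x', 'y' columns.
# Fit the same linear model to each dataset; the famous result is that all
# four give nearly identical coefficients despite very different shapes.
for name, group in df.groupby("dataset"):
    params, cov = curve_fit(func, group["x"], group["y"])
    print("Dataset %s: a = %.2f, b = %.2f" % (name, params[0], params[1]))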
# Alec Radford, Indico, Kyle Kastner | |
# License: MIT | |
""" | |
Convolutional VAE in a single file. | |
Bringing in code from IndicoDataSolutions and Alec Radford (NewMu) | |
Additionally converted to use default conv2d interface instead of explicit cuDNN | |
""" | |
import theano | |
import theano.tensor as T | |
# theano.compat.python2x was removed in newer Theano; collections is equivalent
from collections import OrderedDict
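Not from the original file, but a minimal sketch of the reparameterization step at the heart of any VAE, in the same Theano style.
from theano.sandbox.rng_mrg import MRG_RandomStreams

t_rng = MRG_RandomStreams(seed=1234)

def sample_z(mu, log_sigma):
    # z = mu + sigma * eps with eps ~ N(0, 1), so gradients can flow
    # through mu and log_sigma while z stays stochastic.
    eps = t_rng.normal(size=mu.shape, dtype=theano.config.floatX)
    return mu + T.exp(0.5 * log_sigma) * eps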
""" | |
preprocess-twitter.py | |
python preprocess-twitter.py "Some random text with #hashtags, @mentions and http://t.co/kdjfkdjf (links). :)" | |
Script for preprocessing tweets by Romain Paulus | |
with small modifications by Jeffrey Pennington | |
with translation to Python by Motoki Wu | |
Translation of Ruby script to create features for GloVe vectors for Twitter data. |
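A minimal sketch of the kind of substitutions the script performs, using the <url>/<user>-style tokens of the GloVe Twitter conventions; the patterns below are illustrative, not the full original rule set.
import re
import sys

def tokenize(text):
    # Replace URLs, user mentions, numbers, and hashtags with GloVe-style tokens.
    text = re.sub(r"https?://\S+", " <url> ", text)
    text = re.sub(r"@\w+", " <user> ", text)
    text = re.sub(r"[-+]?[.\d]*[\d]+[:,.\d]*", " <number> ", text)
    text = re.sub(r"#(\S+)", r" <hashtag> \1 ", text)
    return text.lower()

if __name__ == "__main__":
    print(tokenize(sys.argv[1]))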
from lasagne.layers import Layer
from lasagne import nonlinearities

class HighwayLayer(Layer):
    def __init__(self, incoming, layer_class, gate_nonlinearity=None,
                 **kwargs):
        super(HighwayLayer, self).__init__(incoming)
        # H is the plain transform; the gate decides how much of it to use.
        self.H_layer = layer_class(incoming, **kwargs)
        if gate_nonlinearity:
            self.gate_nonlinearity = gate_nonlinearity
        else:
            # Assumed completion of the truncated branch: highway gates
            # conventionally default to a sigmoid.
            self.gate_nonlinearity = nonlinearities.sigmoid
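For context, a highway layer combines the transform H with a learned gate T as y = H(x)*T(x) + x*(1 - T(x)). The method below is an assumed continuation sketch, presuming a self.T_layer gate constructed analogously to H_layer; it is not the original code.
    def get_output_for(self, input, **kwargs):
        # Gate between the transformed signal and the untouched input
        # (assumes self.T_layer was built like self.H_layer above).
        h = self.H_layer.get_output_for(input, **kwargs)
        t = self.gate_nonlinearity(self.T_layer.get_output_for(input, **kwargs))
        return h * t + input * (1 - t)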