Created
May 9, 2016 20:09
-
-
Save glamp/d8c1fc3ef48ca87f614aa8c2f284801d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import division | |
import pandas as pd | |
import numpy as np | |
# get churn.csv from here: https://raw.githubusercontent.com/EricChiang/churn/master/data/churn.csv | |
churn_df = pd.read_csv('churn.csv') | |
col_names = churn_df.columns.tolist() | |
print "Column names:" | |
print col_names | |
to_show = col_names[:6] + col_names[-6:] | |
print "\nSample data:" | |
churn_df[to_show].head(6) | |
# Isolate target data | |
churn_result = churn_df['Churn?'] | |
y = np.where(churn_result == 'True.',1,0) | |
# We don't need these columns | |
to_drop = ['State','Area Code','Phone','Churn?'] | |
churn_feat_space = churn_df.drop(to_drop,axis=1) | |
# 'yes'/'no' has to be converted to boolean values | |
# NumPy converts these from boolean to 1. and 0. later | |
yes_no_cols = ["Int'l Plan","VMail Plan"] | |
churn_feat_space[yes_no_cols] = churn_feat_space[yes_no_cols] == 'yes' | |
# Pull out features for future use | |
features = churn_feat_space.columns | |
X = churn_feat_space.as_matrix().astype(np.float) | |
# This is important | |
from sklearn.preprocessing import StandardScaler | |
scaler = StandardScaler() | |
X = scaler.fit_transform(X) | |
print "Feature space holds %d observations and %d features" % X.shape | |
print "Unique target labels:", np.unique(y) | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np
# The pylab interface is discouraged by matplotlib; import pyplot directly.
import matplotlib.pyplot as plt

# Plot 1000 noisy samples of log(x) against the true log curve.
# x ~ Uniform(1, 100); y = log(x) plus Gaussian noise (sd = 0.3).
x = np.random.uniform(1, 100, 1000)
y = np.log(x) + np.random.normal(0, .3, 1000)

plt.scatter(x, y, s=1, label="log(x) with noise")
plt.plot(np.arange(1, 100), np.log(np.arange(1, 100)), c="b",
         label="log(x) true function")
plt.xlabel("x")
plt.ylabel("f(x) = log(x)")
plt.legend(loc="best")
plt.title("A Basic Log Function")
plt.show()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np

# Train a random forest on a random ~75% split of the iris data and
# cross-tabulate predictions against the held-out species labels.
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)

# Randomly mark roughly 75% of the rows as training data.
df['is_train'] = np.random.uniform(0, 1, len(df)) <= .75

# pd.Factor was removed from pandas long ago; Categorical.from_codes is the
# modern way to map the integer target codes to species names.
df['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)
df.head()

# Boolean masks index directly; no need to compare against True/False.
train, test = df[df['is_train']], df[~df['is_train']]

features = df.columns[:4]
clf = RandomForestClassifier(n_jobs=2)

# Encode species labels as integers for the classifier.
y, _ = pd.factorize(train['species'])
clf.fit(train[features], y)

# Map predicted integer codes back to species names.
preds = iris.target_names[clf.predict(test[features])]
pd.crosstab(test['species'], preds, rownames=['actual'], colnames=['preds'])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from ggplot import *
# BUG FIX: `meat` was used below before it was imported (the import sat
# mid-script in the original); bring it in with the other imports.
from ggplot import meat

# Drop columns that have fewer than 800 non-null observations.
meat = meat.dropna(thresh=800, axis=1)
ts = meat.set_index(['date'])

print(ts.groupby(ts.index.year).sum().head(10))

# .ix was removed from pandas; after grouping by ts.index.year the index is
# integer years, so slice with .loc and integer labels.
the1940s = ts.groupby(ts.index.year).sum().loc[1940:1949]
the1940s


def floor_decade(date_value):
    """Take a date. Return the decade it falls in (e.g. 1947 -> 1940)."""
    return (date_value.year // 10) * 10


# `_` (last result) only exists in interactive sessions; bind it explicitly.
example_date = pd.to_datetime('2013-10-09')
floor_decade(example_date)

ts.groupby(floor_decade).sum()
the1940s.sum().reset_index(name='meat sums in the 1940s')

by_decade = ts.groupby(floor_decade).sum()
by_decade.index.name = 'year'
by_decade = by_decade.reset_index()

print(ggplot(by_decade, aes('year', weight='beef')) +
      geom_bar() +
      scale_y_continuous(labels='comma') +
      ggtitle('Head of Cattle Slaughtered by Decade'))

# Reshape wide -> long so each livestock type becomes a 'variable' value.
by_decade_long = pd.melt(by_decade, id_vars="year")
print(ggplot(aes(x='year', weight='value', colour='variable'), data=by_decade_long) +
      geom_bar() +
      ggtitle("Meat Production by Decade"))

meat_lng = pd.melt(meat, id_vars=['date'])
print(ggplot(aes(x='date', y='value', colour='variable'), data=meat_lng) + geom_line())
print(ggplot(aes(x='date', y='value', colour='variable'), data=meat_lng) +
      stat_smooth(span=0.10) +
      ggtitle("Smoothed Livestock Production"))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
https://raw.githubusercontent.com/EricChiang/churn/master/data/churn.csv is a broken link. Do you have a live link to the data?