Skip to content

Instantly share code, notes, and snippets.

@glamp
Created May 9, 2016 20:09
Show Gist options
  • Save glamp/d8c1fc3ef48ca87f614aa8c2f284801d to your computer and use it in GitHub Desktop.
Save glamp/d8c1fc3ef48ca87f614aa8c2f284801d to your computer and use it in GitHub Desktop.
from __future__ import division
import pandas as pd
import numpy as np
# get churn.csv from here: https://raw.githubusercontent.com/EricChiang/churn/master/data/churn.csv
# Load the churn data set and build a standardized feature matrix X and a
# 0/1 target vector y from it.
churn_df = pd.read_csv('churn.csv')
col_names = churn_df.columns.tolist()
print("Column names:")
print(col_names)
to_show = col_names[:6] + col_names[-6:]
print("\nSample data:")
# A bare expression is only echoed by a notebook/REPL; print it so the sample
# actually follows the heading when this runs as a script.
print(churn_df[to_show].head(6))

# Isolate target data: 1 where the 'Churn?' column holds 'True.', else 0.
churn_result = churn_df['Churn?']
y = np.where(churn_result == 'True.', 1, 0)

# We don't need these columns
to_drop = ['State', 'Area Code', 'Phone', 'Churn?']
churn_feat_space = churn_df.drop(to_drop, axis=1)

# 'yes'/'no' has to be converted to boolean values;
# the float cast below turns the booleans into 1. and 0.
yes_no_cols = ["Int'l Plan", "VMail Plan"]
churn_feat_space[yes_no_cols] = churn_feat_space[yes_no_cols] == 'yes'

# Pull out feature names for future use
features = churn_feat_space.columns
# DataFrame.as_matrix() and the np.float alias have both been removed from
# current pandas/NumPy; .values and the builtin float are the equivalents
# that work on old and new versions alike.
X = churn_feat_space.values.astype(float)

# Standardize every feature column so that differently scaled columns are
# comparable before modelling.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)
print("Feature space holds %d observations and %d features" % X.shape)
print("Unique target labels: {}".format(np.unique(y)))
import numpy as np
import pylab as pl
# Draw 1000 x values uniformly between 1 and 100, then build noisy log(x)
# samples by adding Gaussian noise (mean 0, standard deviation 0.3).
n_samples = 1000
x = np.random.uniform(1, 100, n_samples)
noise = np.random.normal(0, .3, n_samples)
y = np.log(x) + noise

# Scatter the noisy samples and overlay the noise-free curve evaluated at the
# integers 1..99.
pl.scatter(x, y, s=1, label="log(x) with noise")
curve_x = np.arange(1, 100)
pl.plot(curve_x, np.log(curve_x), c="b", label="log(x) true function")

# Decorate and display the figure.
pl.title("A Basic Log Function")
pl.xlabel("x")
pl.ylabel("f(x) = log(x)")
pl.legend(loc="best")
pl.show()
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
# Fit a random forest on a random ~75% sample of the iris data and
# cross-tabulate actual vs. predicted species on the remaining rows.
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
# Each row lands in the training set with probability 0.75.
df['is_train'] = np.random.uniform(0, 1, len(df)) <= .75
# pd.Factor no longer exists in pandas; Categorical.from_codes builds the same
# labelled column from integer codes plus the category names.
df['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)
# Bare expressions are discarded outside a notebook, so print the preview.
print(df.head())

# The boolean column is already a mask; no comparison against True/False needed.
train, test = df[df['is_train']], df[~df['is_train']]
features = df.columns[:4]
clf = RandomForestClassifier(n_jobs=2)
# Use the categorical codes as the target so that label i always corresponds
# to iris.target_names[i]. pd.factorize numbers labels by order of first
# appearance, which matches only if every species shows up in that order.
y = train['species'].cat.codes
clf.fit(train[features], y)
# Map the predicted integer codes back to species names.
preds = iris.target_names[clf.predict(test[features])]
print(pd.crosstab(test['species'], preds, rownames=['actual'], colnames=['preds']))
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from ggplot import *
# NOTE(review): `meat` is presumably the example data set exported by
# `from ggplot import *` above - confirm.
# Drop columns that have fewer than 800 observations.
meat = meat.dropna(thresh=800, axis=1)
ts = meat.set_index(['date'])
# Sum every column per calendar year; the result is indexed by integer year.
yearly_totals = ts.groupby(ts.index.year).sum()
print(yearly_totals.head(10))
# The .ix indexer has been removed from pandas, and the index here holds
# integer years rather than dates, so select 1940-1949 by label with .loc
# (both slice ends are inclusive).
the1940s = yearly_totals.loc[1940:1949]
# Bare expressions are discarded outside a notebook, so print the table.
print(the1940s)
def floor_decade(date_value):
    """Return the decade that a date falls in.

    Args:
        date_value: Any object exposing an integer ``year`` attribute
            (for example a ``datetime.date`` or a pandas ``Timestamp``).

    Returns:
        The year rounded down to a multiple of ten, e.g. 2013 -> 2010.
    """
    return (date_value.year // 10) * 10
# Sanity-check floor_decade on a single timestamp. The original passed `_`,
# which is the previous result only in an interactive session; in this file
# `_` has already been bound to something else, so name the value explicitly.
sample_date = pd.to_datetime('2013-10-09')
print(floor_decade(sample_date))

# Group the rows of ts by the decade of their index label and sum every
# column. Computed once and reused below.
by_decade = ts.groupby(floor_decade).sum()
print(by_decade)

# Column totals over the 1940s, as a two-column frame.
print(the1940s.sum().reset_index(name='meat sums in the 1940s'))

# Turn the decade index into an ordinary 'year' column for plotting.
by_decade.index.name = 'year'
by_decade = by_decade.reset_index()
# Bar chart of the 'beef' column per decade.
# The Python 2 print statement is a syntax error on Python 3; print(<expr>)
# with a single parenthesized expression behaves the same on both.
# NOTE(review): printing a ggplot object presumably renders it via its repr -
# confirm for the installed ggplot version.
beef_chart = (
    ggplot(by_decade, aes('year', weight='beef'))
    + geom_bar()
    + scale_y_continuous(labels='comma')
    + ggtitle('Head of Cattle Slaughtered by Decade')
)
print(beef_chart)

# Reshape to long form (one row per year/variable pair) so every column can
# be drawn in one chart, coloured by variable.
by_decade_long = pd.melt(by_decade, id_vars="year")
all_meat_chart = (
    ggplot(aes(x='year', weight='value', colour='variable'), data=by_decade_long)
    + geom_bar()
    + ggtitle("Meat Production by Decade")
)
print(all_meat_chart)
from ggplot import meat
# Reshape the meat data to long form: one row per (date, variable) pair.
meat_lng = pd.melt(meat, id_vars=['date'])

# Raw time series, one line per variable.
# print(<expr>) replaces the Python 2-only print statement and behaves the
# same on Python 2 and 3.
print(ggplot(aes(x='date', y='value', colour='variable'), data=meat_lng) + geom_line())

# The same series drawn with stat_smooth (span=0.10) instead of raw lines.
print(
    ggplot(aes(x='date', y='value', colour='variable'), data=meat_lng)
    + stat_smooth(span=0.10)
    + ggtitle("Smoothed Livestock Production")
)
@bdoohan
Copy link

bdoohan commented Apr 9, 2018

https://raw.githubusercontent.com/EricChiang/churn/master/data/churn.csv is a broken link. Do you have a live link to the data?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment