Created
May 9, 2016 20:09
-
-
Save glamp/d8c1fc3ef48ca87f614aa8c2f284801d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import division | |
import pandas as pd | |
import numpy as np | |
# get churn.csv from here: https://raw.githubusercontent.com/EricChiang/churn/master/data/churn.csv | |
churn_df = pd.read_csv('churn.csv') | |
col_names = churn_df.columns.tolist() | |
print "Column names:" | |
print col_names | |
to_show = col_names[:6] + col_names[-6:] | |
print "\nSample data:" | |
churn_df[to_show].head(6) | |
# Isolate target data | |
churn_result = churn_df['Churn?'] | |
y = np.where(churn_result == 'True.',1,0) | |
# We don't need these columns | |
to_drop = ['State','Area Code','Phone','Churn?'] | |
churn_feat_space = churn_df.drop(to_drop,axis=1) | |
# 'yes'/'no' has to be converted to boolean values | |
# NumPy converts these from boolean to 1. and 0. later | |
yes_no_cols = ["Int'l Plan","VMail Plan"] | |
churn_feat_space[yes_no_cols] = churn_feat_space[yes_no_cols] == 'yes' | |
# Pull out features for future use | |
features = churn_feat_space.columns | |
X = churn_feat_space.as_matrix().astype(np.float) | |
# This is important | |
from sklearn.preprocessing import StandardScaler | |
scaler = StandardScaler() | |
X = scaler.fit_transform(X) | |
print "Feature space holds %d observations and %d features" % X.shape | |
print "Unique target labels:", np.unique(y) | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np
# The pylab interface is discouraged by matplotlib; import pyplot directly.
import matplotlib.pyplot as plt

# Plot 1000 noisy samples of log(x) against the true log curve.
# x ~ Uniform(1, 100); y = log(x) plus Gaussian noise (sd = 0.3).
x = np.random.uniform(1, 100, 1000)
y = np.log(x) + np.random.normal(0, .3, 1000)

plt.scatter(x, y, s=1, label="log(x) with noise")
plt.plot(np.arange(1, 100), np.log(np.arange(1, 100)), c="b",
         label="log(x) true function")
plt.xlabel("x")
plt.ylabel("f(x) = log(x)")
plt.legend(loc="best")
plt.title("A Basic Log Function")
plt.show()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np

# Train a random forest on a random ~75% split of the iris data and
# cross-tabulate predictions against the held-out species labels.
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)

# Randomly mark roughly 75% of the rows as training data.
df['is_train'] = np.random.uniform(0, 1, len(df)) <= .75

# pd.Factor was removed from pandas long ago; Categorical.from_codes is the
# modern way to map the integer target codes to species names.
df['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)
df.head()

# Boolean masks index directly; no need to compare against True/False.
train, test = df[df['is_train']], df[~df['is_train']]

features = df.columns[:4]
clf = RandomForestClassifier(n_jobs=2)

# Encode species labels as integers for the classifier.
y, _ = pd.factorize(train['species'])
clf.fit(train[features], y)

# Map predicted integer codes back to species names.
preds = iris.target_names[clf.predict(test[features])]
pd.crosstab(test['species'], preds, rownames=['actual'], colnames=['preds'])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from ggplot import *
# BUG FIX: `meat` was used below before it was imported (the import sat
# mid-script in the original); bring it in with the other imports.
from ggplot import meat

# Drop columns that have fewer than 800 non-null observations.
meat = meat.dropna(thresh=800, axis=1)
ts = meat.set_index(['date'])

print(ts.groupby(ts.index.year).sum().head(10))

# .ix was removed from pandas; after grouping by ts.index.year the index is
# integer years, so slice with .loc and integer labels.
the1940s = ts.groupby(ts.index.year).sum().loc[1940:1949]
the1940s


def floor_decade(date_value):
    """Take a date. Return the decade it falls in (e.g. 1947 -> 1940)."""
    return (date_value.year // 10) * 10


# `_` (last result) only exists in interactive sessions; bind it explicitly.
example_date = pd.to_datetime('2013-10-09')
floor_decade(example_date)

ts.groupby(floor_decade).sum()
the1940s.sum().reset_index(name='meat sums in the 1940s')

by_decade = ts.groupby(floor_decade).sum()
by_decade.index.name = 'year'
by_decade = by_decade.reset_index()

print(ggplot(by_decade, aes('year', weight='beef')) +
      geom_bar() +
      scale_y_continuous(labels='comma') +
      ggtitle('Head of Cattle Slaughtered by Decade'))

# Reshape wide -> long so each livestock type becomes a 'variable' value.
by_decade_long = pd.melt(by_decade, id_vars="year")
print(ggplot(aes(x='year', weight='value', colour='variable'), data=by_decade_long) +
      geom_bar() +
      ggtitle("Meat Production by Decade"))

meat_lng = pd.melt(meat, id_vars=['date'])
print(ggplot(aes(x='date', y='value', colour='variable'), data=meat_lng) + geom_line())
print(ggplot(aes(x='date', y='value', colour='variable'), data=meat_lng) +
      stat_smooth(span=0.10) +
      ggtitle("Smoothed Livestock Production"))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
https://raw.githubusercontent.com/EricChiang/churn/master/data/churn.csv is a broken link. Do you have a live link to the data?