Created
February 23, 2012 14:43
-
-
Save fcostin/1893136 to your computer and use it in GitHub Desktop.
test_ridge_crime.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Download the data file named by CRIME_DATA_URL and run the ridge
# regression script on it.
CRIME_DATA_URL := http://archive.ics.uci.edu/ml/machine-learning-databases/communities/communities.data
CRIME_DATA := crime.data

# wget -O creates the output file even when the download fails. Without
# this, a failed fetch would leave an empty or partial $(CRIME_DATA) behind
# and later runs of make would consider it up to date.
.DELETE_ON_ERROR:

# Default goal: time the script, passing the data file as its only argument.
test: $(CRIME_DATA)
	time python test_ridge_crime.py $^

.PHONY: test

# Fetch the data file on demand.
$(CRIME_DATA):
	wget -O $@ $(CRIME_DATA_URL)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
import sys | |
import numpy | |
numpy.random.seed(12345) | |
def parse_value(x):
    """Convert one CSV field to a float.

    Args:
        x: The field text.

    Returns:
        ``float(x)`` when the text is a valid number, otherwise
        ``numpy.nan`` so the field can be recognised as missing later.
    """
    try:
        value = float(x)
    except ValueError:
        # Anything that does not parse as a number counts as missing.
        value = numpy.nan
    return value
def get_crime_data(lines):
    """Parse CSV text into per-column lists of floats.

    Args:
        lines: Iterable of CSV-formatted text lines (e.g. an open file).

    Returns:
        A list with one entry per retained CSV column. Each entry is a list
        of that column's values as floats, with ``numpy.nan`` in place of
        every field that is not a valid number.
    """
    # csv.reader yields an empty row for a blank line (a trailing newline is
    # common); dropping those keeps the table rectangular so it can be
    # transposed.
    rows = [row for row in csv.reader(lines) if row]
    cols = numpy.asarray(rows).T
    # The first five columns are discarded -- presumably non-numeric
    # identifier fields of the data set; TODO confirm against its description.
    cols = cols[5:, :]
    # Build real lists rather than nested ``map`` calls: on Python 3 ``map``
    # is lazy, and callers need concrete sequences to form an array.
    return [[parse_value(field) for field in col] for col in cols]
def fill_missing_values(cols):
    """Replace non-finite entries of each column with that column's median.

    Args:
        cols: Iterable of columns, each an iterable of numbers. Lists,
            arrays and lazy iterators (such as Python 3 ``map`` objects)
            are all accepted. The input is never modified.

    Returns:
        A float array with one row per input column, in which every NaN or
        infinite entry has been replaced by the median of the finite
        entries of the same column. A column with no finite entries at all
        is filled with NaN.
    """
    # Materialise every column before building the array: numpy would wrap a
    # lazy iterator as a 0-d object array instead of a 2-D table. This also
    # guarantees a fresh copy, so filling in place below is safe.
    table = numpy.array([list(col) for col in cols], dtype=float)
    for col in table:
        finite = numpy.isfinite(col)
        # Skip the median when nothing is finite; it would only emit a
        # warning and produce NaN anyway.
        fill_value = numpy.median(col[finite]) if finite.any() else numpy.nan
        # ``col`` is a view onto ``table``, so this writes into the result.
        col[~finite] = fill_value
    return table
def main():
    """Fit a cross-validated ridge regression to a CSV data set.

    Reads the data file named by the single command-line argument, imputes
    missing values, trains on the first 20% of the rows and prints the mean
    squared error measured on the remaining 80%. The last column of the
    data is used as the regression target and all other retained columns as
    features.

    Exits with status 1 after printing a usage line when the argument count
    is wrong.
    """
    if len(sys.argv) != 2:
        print('usage: data.csv')
        sys.exit(1)

    print('reading data')
    with open(sys.argv[1]) as f:
        cols = get_crime_data(f)

    print('filling missing values')
    filled_cols = fill_missing_values(cols)
    # Columns were stored as rows; transpose so each row is one example.
    data = filled_cols.T

    print('preparing ridge regression')
    # Imported from the public package path: the private
    # ``sklearn.linear_model.ridge`` module is not available in current
    # scikit-learn releases.
    from sklearn.linear_model import RidgeCV

    # ``normalize`` is left at its default; the original passed False, which
    # was the default, and newer releases no longer accept the keyword.
    learner = RidgeCV(
        alphas=2 ** numpy.linspace(-10, 11, 21),
        fit_intercept=True,
    )

    n_examples = data.shape[0]
    # One deterministic split: the leading 20% of rows train the model and
    # the rest test it. (A randomised multi-split evaluation was sketched
    # here originally but never enabled.)
    n_train = int(n_examples * 0.2)
    splits = [(numpy.arange(n_train), numpy.arange(n_train, n_examples))]

    for i, (train_indices, test_indices) in enumerate(splits):
        print('split %d' % i)
        train = data[train_indices, :]
        test = data[test_indices, :]
        print('\ttraining examples: %d' % train.shape[0])
        print('\ttesting examples: %d' % test.shape[0])

        x_train = train[:, :-1]
        y_train = train[:, -1]
        print('\tfitting ridge model')
        fit = learner.fit(x_train, y_train)

        # The selected penalty is exposed as ``alpha_`` in current
        # scikit-learn; very old releases called it ``best_alpha``.
        best_alpha = getattr(fit, 'alpha_', None)
        if best_alpha is None:
            best_alpha = fit.best_alpha
        print('\t\tbest_alpha = %e' % best_alpha)

        x_test = test[:, :-1]
        y_test = test[:, -1]
        print('\tmaking predictions')
        predictions = fit.predict(x_test)
        mse = numpy.mean((predictions - y_test) ** 2)
        print('\ttest mse = %e' % mse)
# Run the experiment only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment