Created
April 22, 2014 02:14
-
-
Save eob/11163203 to your computer and use it in GitHub Desktop.
Polynomial Fitting
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from scipy.optimize import curve_fit | |
from collections import Counter, defaultdict | |
import numpy as np | |
import time | |
def f(xss, a, b, c, d, e, f): | |
""" | |
Polynomial function up to order 5 | |
""" | |
ret = [] | |
for x in xss: | |
y = 0 | |
y = a + b*x + c*x**2 + d*x**3 + e*x**4 + f*x**5 | |
ret.append(y) | |
return ret | |
def super_f(xss, coeffs): | |
ret = [] | |
for x in xss: | |
y = 0 | |
for (idx, coeff) in enumerate(coeffs): | |
y += coeff * (x**idx) | |
ret.append(y) | |
return ret | |
def mse(xs, ys): | |
if len(xs) == 0: | |
print "HEY! Trying to mse empty list, you fool!" | |
return 0 | |
ret = 0 | |
for (x,y) in zip(xs, ys): | |
ret += (x-y)*(x-y) | |
ret /= len(xs) | |
def mse_at_order(xs, ys, degree, folds, train_percent): | |
""" | |
Returns the average mean squared error | |
""" | |
errors = [] | |
for fold in range(len(folds)): | |
# Todo: shuffle and divide | |
x_copy = xs[:] | |
y_copy = ys[:] | |
shuffle(x_copy) | |
shuffle(y_copy) | |
train_percent * len(x_copy) | |
xs_train = xs | |
ys_train = ys | |
xs_test = xs | |
ys_test = ys | |
tup = np.polyfit(xs_train, ys_train, degree, full=True) | |
coeffs = tup[0] | |
estimated_costs = super_f(xs_test, coeffs) | |
error = mse(estimated_costs, ys_test) | |
errors.append(error) | |
return np.mean(errors) | |
def learn_polynomial(feature, datapoints, maxpersize=-1): | |
""" | |
Given a feature and a set of different _sized_ datapoints | |
learn the extraction-cost polynomial | |
Args: | |
datapoints: list of datapoint objects that have a .size attr | |
maxpersize: max number of extractions for a given data point size | |
""" | |
vals = defaultdict(list) | |
for datapoint in datapoints: | |
x = datapoint.size | |
if maxpersize == -1: | |
if len(vals[x]) > 10: continue | |
start = time.time() | |
feature.extract(datapoint) | |
stop = time.time() | |
y = stop - start | |
vals[x].append(y) | |
# xs -> sizes, ys -> avg cost for size | |
xs = np.array(vals.keys()) | |
ys = np.array([np.median(vals[x]) for x in xs]) | |
# try to use lower degrees if possible | |
cost_per_coeffs = [] | |
deg = 0 | |
while deg < 6: | |
tup = np.polyfit(xs, ys, deg, full=True) | |
coeffs = tup[0] | |
res = tup[1] | |
cost_per_coeffs.append((sum(res) / (.95 ** deg), coeffs)) | |
if abs(sum(res)) <= 1e-4: | |
break | |
deg += 1 | |
# Take the minimum cost list of coeffs. | |
# Min of a list of tuples uses the first slow. | |
coeffs = list(min(cost_per_coeffs)[1]) | |
coeffs.reverse() | |
# Plotting | |
# ----------------------------------------------------- | |
if False: | |
allxs, allys = [], [] | |
from util import * | |
poly = create_polynomial(coeffs) | |
from matplotlib import pyplot as plt | |
for key, vs in vals.iteritems(): | |
for v in vs: | |
allxs.append(key) | |
allys.append(v) | |
plt.clf() | |
plt.scatter(allxs, allys, color='red', alpha=0.5, lw=0) | |
plt.plot(xs, ys, color='red', alpha=0.4) | |
plt.plot(xs, map(poly, xs), color='green', alpha=0.4) | |
plt.show() | |
import pdb | |
pdb.set_trace() | |
return coeffs | |
# ignoreme | |
#popt, pcov = curve_fit(f, xs, ys) | |
#return map(float, popt) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment