eob · April 22, 2014 02:14
diff --git a/Polynomial Fitting b/Polynomial Fitting
 from scipy.optimize import curve_fit
 from collections import Counter, defaultdict
 import numpy as np
 import time

 def f(xss, a, b, c, d, e, f):
  """
  Polynomial function up to order 5
  """
  ret = []
  for x in xss:
    y = 0
    y = a + b*x + c*x**2 + d*x**3 + e*x**4 + f*x**5
    ret.append(y)
  return ret

 def super_f(xss, coeffs):
  ret = []
  for x in xss:
    y = 0
    for (idx, coeff) in enumerate(coeffs):
      y += coeff * (x**idx)
    ret.append(y)
  return ret

 def mse(xs, ys):
  if len(xs) == 0:
    print "HEY! Trying to mse empty list, you fool!"
    return 0
  ret = 0
  for (x,y) in zip(xs, ys):
    ret += (x-y)*(x-y)
  ret /= len(xs)

 def mse_at_order(xs, ys, degree, folds, train_percent):
  """
  Returns the average mean squared error
  """
  errors = []
  for fold in range(len(folds)):
    # Todo: shuffle and divide
    x_copy = xs[:]
    y_copy = ys[:]
    shuffle(x_copy)
    shuffle(y_copy)
    train_percent * len(x_copy)

    xs_train = xs
    ys_train = ys
    xs_test = xs
    ys_test = ys
    tup = np.polyfit(xs_train, ys_train, degree, full=True)
    coeffs = tup[0]
    estimated_costs = super_f(xs_test, coeffs)
    error = mse(estimated_costs, ys_test)
    errors.append(error)
  return np.mean(errors)

 def learn_polynomial(feature, datapoints, maxpersize=-1):
  """
  Given a feature and a set of different _sized_ datapoints
  learn the extraction-cost polynomial

  Args:
    datapoints: list of datapoint objects that have a .size attr
    maxpersize: max number of extractions for a given data point size
  """
  vals = defaultdict(list)

  for datapoint in datapoints:
    x = datapoint.size
    if maxpersize == -1:
      if len(vals[x]) > 10: continue

    start = time.time()
    feature.extract(datapoint)
    stop = time.time()
    y = stop - start
    vals[x].append(y)

  # xs -> sizes, ys -> avg cost for size
  xs = np.array(vals.keys())
  ys = np.array([np.median(vals[x]) for x in xs])


  # try to use lower degrees if possible
  cost_per_coeffs = []
  deg = 0
  while deg < 6:
    tup = np.polyfit(xs, ys, deg, full=True)
    coeffs = tup[0]
    res = tup[1]
    cost_per_coeffs.append((sum(res) / (.95 ** deg), coeffs))
    if abs(sum(res)) <= 1e-4:
      break
    deg += 1
  # Take the minimum cost list of coeffs.
  # Min of a list of tuples uses the first slow.
  coeffs = list(min(cost_per_coeffs)[1])
  coeffs.reverse()

  # Plotting
  # -----------------------------------------------------
  if False:
    allxs, allys = [], []
    from util import *
    poly = create_polynomial(coeffs)
    from  matplotlib import pyplot as plt
    for key, vs in vals.iteritems():
      for v in vs:
        allxs.append(key)
        allys.append(v)
    plt.clf()
    plt.scatter(allxs, allys, color='red', alpha=0.5, lw=0)
    plt.plot(xs, ys, color='red', alpha=0.4)
    plt.plot(xs, map(poly, xs), color='green', alpha=0.4)
    plt.show()
    import pdb
    pdb.set_trace()

  return coeffs
  # ignoreme
  #popt, pcov = curve_fit(f, xs, ys)
  #return map(float, popt)
	from scipy.optimize import curve_fit
	from collections import Counter, defaultdict
	import numpy as np
	import time

	def f(xss, a, b, c, d, e, f):
	"""
	Polynomial function up to order 5
	"""
	ret = []
	for x in xss:
	y = 0
	y = a + bx + cx*2 + dx*3 + ex*4 + fx**5
	ret.append(y)
	return ret

	def super_f(xss, coeffs):
	ret = []
	for x in xss:
	y = 0
	for (idx, coeff) in enumerate(coeffs):
	y += coeff * (x**idx)
	ret.append(y)
	return ret

	def mse(xs, ys):
	if len(xs) == 0:
	print "HEY! Trying to mse empty list, you fool!"
	return 0
	ret = 0
	for (x,y) in zip(xs, ys):
	ret += (x-y)*(x-y)
	ret /= len(xs)

	def mse_at_order(xs, ys, degree, folds, train_percent):
	"""
	Returns the average mean squared error
	"""
	errors = []
	for fold in range(len(folds)):
	# Todo: shuffle and divide
	x_copy = xs[:]
	y_copy = ys[:]
	shuffle(x_copy)
	shuffle(y_copy)
	train_percent * len(x_copy)

	xs_train = xs
	ys_train = ys
	xs_test = xs
	ys_test = ys
	tup = np.polyfit(xs_train, ys_train, degree, full=True)
	coeffs = tup[0]
	estimated_costs = super_f(xs_test, coeffs)
	error = mse(estimated_costs, ys_test)
	errors.append(error)
	return np.mean(errors)

	def learn_polynomial(feature, datapoints, maxpersize=-1):
	"""
	Given a feature and a set of different _sized_ datapoints
	learn the extraction-cost polynomial

	Args:
	datapoints: list of datapoint objects that have a .size attr
	maxpersize: max number of extractions for a given data point size
	"""
	vals = defaultdict(list)

	for datapoint in datapoints:
	x = datapoint.size
	if maxpersize == -1:
	if len(vals[x]) > 10: continue

	start = time.time()
	feature.extract(datapoint)
	stop = time.time()
	y = stop - start
	vals[x].append(y)

	# xs -> sizes, ys -> avg cost for size
	xs = np.array(vals.keys())
	ys = np.array([np.median(vals[x]) for x in xs])


	# try to use lower degrees if possible
	cost_per_coeffs = []
	deg = 0
	while deg < 6:
	tup = np.polyfit(xs, ys, deg, full=True)
	coeffs = tup[0]
	res = tup[1]
	cost_per_coeffs.append((sum(res) / (.95 ** deg), coeffs))
	if abs(sum(res)) <= 1e-4:
	break
	deg += 1
	# Take the minimum cost list of coeffs.
	# Min of a list of tuples uses the first slow.
	coeffs = list(min(cost_per_coeffs)[1])
	coeffs.reverse()

	# Plotting
	# -----------------------------------------------------
	if False:
	allxs, allys = [], []
	from util import *
	poly = create_polynomial(coeffs)
	from matplotlib import pyplot as plt
	for key, vs in vals.iteritems():
	for v in vs:
	allxs.append(key)
	allys.append(v)
	plt.clf()
	plt.scatter(allxs, allys, color='red', alpha=0.5, lw=0)
	plt.plot(xs, ys, color='red', alpha=0.4)
	plt.plot(xs, map(poly, xs), color='green', alpha=0.4)
	plt.show()
	import pdb
	pdb.set_trace()

	return coeffs
	# ignoreme
	#popt, pcov = curve_fit(f, xs, ys)
	#return map(float, popt)