Skip to content

Instantly share code, notes, and snippets.

@suzaku
Created December 21, 2012 02:23
Show Gist options
  • Save suzaku/4350271 to your computer and use it in GitHub Desktop.
Save suzaku/4350271 to your computer and use it in GitHub Desktop.
normalize data with dpark
"""Compute mean and variance to normalize data"""
import dpark
import plac
import math
def prepare(s):
    """Parse one comma-separated line of text into a list of floats.

    Every field has surrounding whitespace removed before conversion, so
    ``"1, 2 ,3"`` becomes ``[1.0, 2.0, 3.0]``.  A field that ``float``
    cannot convert (an empty field included) raises ``ValueError``.
    """
    values = []
    for field in s.split(","):
        values.append(float(field.strip()))
    return values
def add_squared_list(vals):
    """Pair a row with the element-wise squares of its values.

    Returns a 2-tuple ``(vals, squares)``: the first item is the input
    object itself (not a copy) and the second is a new list holding
    ``v * v`` for each ``v`` in ``vals``.
    """
    squares = []
    for value in vals:
        squares.append(value * value)
    return (vals, squares)
def row_add(X, Y):
    """Return the element-wise sum of two rows as a new list.

    Elements are paired by position.  When the rows differ in length the
    result is as long as the shorter one, because pairing uses ``zip``.
    """
    total = []
    for left, right in zip(X, Y):
        total.append(left + right)
    return total
def main(filename):
    """Print per-column statistics and the normalized rows of a CSV file.

    Each line of ``filename`` is parsed with ``prepare`` into a row of
    floats.  The per-column mean and standard deviation are computed from
    the column sums and sums of squares, printed, and then every row is
    printed with each value centred on its column mean and divided by its
    column standard deviation.

    Args:
        filename: path handed to ``dpark.textFile``.

    Raises:
        ValueError: if the input contains no rows, since the statistics
            would require dividing by a zero row count.
    """
    lines = dpark.textFile(filename)
    n = lines.count()
    if n == 0:
        raise ValueError("no rows to normalize in %r" % (filename,))

    matrix = lines.map(prepare)
    rows_sum, squared_sum = matrix.map(add_squared_list).reduce(_merge_sums)
    means, stddevs = _column_stats(n, rows_sum, squared_sum)

    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("Mean: %s" % means)
    # NOTE: the label says "Variance" but the values are standard
    # deviations (square root applied); the label is kept unchanged so
    # existing consumers of this output are not broken.
    print("Variance: %s" % stddevs)

    normalized = matrix.map(lambda row: _normalize_row(row, means, stddevs))
    for r in normalized.collect():
        print(r)


def _merge_sums(left, right):
    """Combine two ``(row_sum, squared_sum)`` pairs element-wise."""
    # Indexing replaces the tuple-unpacking lambda parameters, which are
    # a syntax error on Python 3.
    return (row_add(left[0], right[0]), row_add(left[1], right[1]))


def _column_stats(n, rows_sum, squared_sum):
    """Return per-column ``(means, stddevs)`` from sums over ``n`` rows."""
    count = float(n)
    means = [total / count for total in rows_sum]
    # sum(x^2) - n * mean^2 can land a hair below zero through rounding
    # for constant columns, which would make math.sqrt raise; clamp at 0.
    stddevs = [
        math.sqrt(max(0.0, (sq - count * m * m) / count))
        for sq, m in zip(squared_sum, means)
    ]
    return means, stddevs


def _normalize_row(row, means, stddevs):
    """Centre and scale one row; zero-spread columns are only centred."""
    # A zero standard deviation would divide by zero, so divide by 1.0
    # for such columns instead.
    return [
        (x - m) / (sd if sd else 1.0)
        for x, m, sd in zip(row, means, stddevs)
    ]
# Script entry point: only when this file is executed directly (not
# imported) is main handed to plac.call, which is what invokes it.
if __name__ == "__main__":
    plac.call(main)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment