Skip to content

Instantly share code, notes, and snippets.

@suzaku
Last active December 10, 2015 00:19
Show Gist options
  • Save suzaku/4350410 to your computer and use it in GitHub Desktop.
Save suzaku/4350410 to your computer and use it in GitHub Desktop.
"""Compute mean and variance to normalize data"""
import dpark
import plac
import numpy as np
def prepare(s):
    """Parse one comma-separated line of numbers into a 1-D float array.

    Args:
        s: a text line such as ``"1.0, 2, 3.5"``.

    Returns:
        A numpy array holding one float per comma-separated field.

    Raises:
        ValueError: if any field is not a valid float literal (this
            includes empty fields, e.g. from a blank line or a trailing
            comma).
    """
    # float() already ignores leading/trailing whitespace, so no explicit
    # strip() is needed on each field.
    return np.array([float(field) for field in s.split(",")])
def main(filename):
    """Print column means and variances of a CSV file, then its z-scored rows.

    The file is read with ``dpark.textFile`` and every line is parsed by
    ``prepare``. Each column is normalized as ``(x - mean) / std`` using the
    population standard deviation (divisor ``n``).

    Args:
        filename: path handed to ``dpark.textFile``.

    Raises:
        ValueError: if the input contains no lines.
    """
    lines = dpark.textFile(filename)
    n = lines.count()
    if n == 0:
        # Without rows there is nothing to reduce and the mean is undefined;
        # fail with a clear message instead of an obscure reduce/unpack error.
        raise ValueError("no rows to normalize in %r" % (filename,))

    matrix = lines.map(prepare)
    # Pair every row with its element-wise square so that a single reduce
    # yields both sum(x) and sum(x^2) per column.
    data = matrix.map(lambda vs: np.array([vs, vs ** 2]))
    rows_sum, squared_sum = data.reduce(lambda X, Y: X + Y)

    means = rows_sum / n
    print("Mean: %s" % means)

    # sum(x^2) - n * mean^2 can come out slightly negative through floating
    # point rounding; clamp at zero so the square root below never yields NaN.
    variance = np.maximum((squared_sum - n * (means ** 2)) / n, 0.0)
    print("Variance: %s" % variance)

    std = np.sqrt(variance)
    # A constant column has zero spread; divide by 1 there so its values
    # normalize to 0 instead of producing inf/NaN.
    scale = np.where(std > 0, std, 1.0)

    normalized = matrix.map(lambda X: (X - means) / scale)
    for r in normalized.collect():
        print(r)
if __name__ == "__main__":
    # plac builds the command-line interface from main's signature, so the
    # script is invoked with the input file name as its positional argument.
    plac.call(main)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment