For executing long-running ML jobs
import gensim
from job import Job
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
# Load Google's pre-trained Word2Vec model.
# model = gensim.models.Word2Vec.load_word2vec_format('./data/GoogleNews-vectors-negative300.bin', binary=True)
def pca_plot(points, training_points=None):
    # Fit a 2-D PCA on the training words' vectors, project the given words,
    # and return an annotated scatter plot of the projections.
    if training_points is None:
        training_points = points
    media_pca = PCA(n_components=2, random_state=0)
    training_vecs = [model[source] for source in training_points]
    vecs = [model[source] for source in points]
    media_pca.fit(training_vecs)
    low_vecs = np.array(media_pca.transform(vecs))
    fig, ax = plt.subplots()
    ax.scatter(low_vecs[:, 0], low_vecs[:, 1])
    for i, txt in enumerate(points):
        ax.annotate(txt, (low_vecs[i, 0], low_vecs[i, 1]))
    return fig
def pca_plot_data(points, training_points=None):
    # Same projection as pca_plot, but return (word, 2-D vector) pairs instead
    # of a figure, so the coordinates can be pickled by Job.data and replotted
    # later without the model.
    if training_points is None:
        training_points = points
    media_pca = PCA(n_components=2, random_state=0)
    training_vecs = [model[source] for source in training_points]
    vecs = [model[source] for source in points]
    media_pca.fit(training_vecs)
    low_vecs = np.array(media_pca.transform(vecs))
    return zip(points, low_vecs)
    # Plotting code carried over from pca_plot, left commented out here:
    # fig, ax = plt.subplots()
    # ax.scatter(low_vecs[:, 0], low_vecs[:, 1])
    # for i, txt in enumerate(points):
    #     ax.annotate(txt, (low_vecs[i, 0], low_vecs[i, 1]))
    # return fig
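# Illustrative sketch (not part of the original gist): pca_plot_data returns
# (word, 2-D vector) pairs, so a figure can be rebuilt from them later without
# the word2vec model in memory. The helper name replot_pairs is hypothetical.
def replot_pairs(pairs):
    fig, ax = plt.subplots()
    for word, vec in pairs:
        ax.scatter(vec[0], vec[1])
        ax.annotate(word, (vec[0], vec[1]))
    return fig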
job = Job()
model = job.run(lambda: gensim.models.Word2Vec.load('./data/v5/word_vectors.word2vec'), log_output=False, abort_on_fail=True)
job.run(lambda: model.most_similar(positive=['fox', 'democrat'], negative=['republican']))
job.run(lambda: model.most_similar(positive=['fox'], negative=['republican']))
job.run(lambda: model.most_similar(positive=['fox'], negative=['democrat']))
job.run(lambda: model.most_similar(positive=['fox', 'democrat'], negative=[]))
job.run(lambda: model.most_similar(positive=['clinton'], topn=10))
job.run(lambda: model.most_similar(positive=['hillary', 'republican'], negative=['democrat'], topn=10))
job.run(lambda: model.most_similar(positive=['clinton', 'donald'], negative=['hillary'], topn=10))
job.data(lambda: pca_plot_data(["trump", "clinton", "republican", "democrat", "obama", "sanders", "johnson", "bush", "pence"]), name="Politicians Plot 1")
job.data(lambda: pca_plot_data(["trump", "clinton", "republican", "democrat", "obama", "sanders", "johnson", "bush", "pence", "smart"]), name="Politicians Plot 2")
job.data(lambda: pca_plot_data(["fox", "msnbc", "republican", "democrat"]), name="Media Plot")
job.data(lambda: pca_plot_data(["trump", "clinton", "republican", "democrat"]), "Candidates Plot")
job.plot(lambda: pca_plot(["trump", "clinton", "republican", "democrat"]), "Candidates Plot")
job.finish()
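# Illustrative sketch (not part of the original gist): finish() archives the
# job directory as <id>.job.tar.gz, and job.data(...) pickles each result
# inside it as data_<safe name>.cpkl, so e.g. the "Candidates Plot" pairs can
# be reloaded later without re-running the model.
import tarfile
import cPickle
with tarfile.open("%s.job.tar.gz" % job.id, "r:gz") as tar:
    tar.extractall()
with open("%s.job/data_CandidatesPlot.cpkl" % job.id) as f:
    candidate_pairs = cPickle.load(f)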
# job.py -- the Job helper imported above: logs each step, pickles data,
# saves figures, and archives everything when the job finishes.
import inspect
import uuid
import os
import tarfile
import shutil
import sys
import traceback
import cPickle

def safe_filename(s):
    # Keep only alphanumeric characters so names are safe to use in filenames.
    return "".join(x for x in s if x.isalnum())
class Job:
    # Counters used to auto-name runs, figures, and data dumps.
    d = 1
    fig = 1
    dataf = 1

    def __init__(self, job_id=None, print_err_to_stderr=True):
        if job_id is None:
            self.id = str(uuid.uuid1())
        else:
            self.id = job_id
        self.dir = "%s.job" % self.id
        os.makedirs(self.dir)
        self.log = self.output_file("output.log")
        self.error = [self.output_file("error.log")]
        if print_err_to_stderr:
            self.error.append(sys.stderr)
        self.finished = False

    def output_file(self, name):
        return open(os.path.join(self.dir, name), "w")

    def run(self, f, name=None, log_output=True, abort_on_fail=False):
        # Execute f(), logging its source and (optionally) its return value.
        if name is None:
            name = str(self.d)
        assert not self.finished
        self.log.write("%s: %s\n" % (name, inspect.getsource(f).strip()))
        try:
            out = f()
            if log_output:
                self.log.write("%s\n" % str(out))
        except:
            self.log.write("Failed. See error.log for details.\n")
            out = None
            stacktrace = traceback.format_exc()
            for err in self.error:
                err.write("Run %s:\n%s\n\n" % (name, stacktrace))
            if abort_on_fail:
                self.finish()
                assert False
        self.log.write("\n")
        self.d += 1
        return out
    def data(self, contents_f, name=None, abort_on_fail=False):
        # Evaluate contents_f() and pickle the result into the job directory.
        if name is None:
            name = str(self.dataf)
        try:
            contents = contents_f()
            with self.output_file("data_%s.cpkl" % safe_filename(name)) as f:
                cPickle.dump(contents, f)
        except:
            stacktrace = traceback.format_exc()
            for err in self.error:
                err.write("Data %s:\n%s\n\n" % (name, stacktrace))
            if abort_on_fail:
                self.finish()
                assert False
        self.dataf += 1
    def plot(self, fig_f, name=None, abort_on_fail=False):
        # Evaluate fig_f() and save the returned matplotlib figure as a PNG.
        if name is None:
            name = str(self.fig)
        try:
            fig = fig_f()
            fig.savefig(os.path.join(self.dir, "fig_%s.png" % safe_filename(name)),
                        format='png', dpi=600)
        except:
            stacktrace = traceback.format_exc()
            for err in self.error:
                err.write("Figure %s:\n%s\n\n" % (name, stacktrace))
            if abort_on_fail:
                self.finish()
                assert False
        self.fig += 1
    def finish(self):
        # Close the logs, archive the job directory as a .tar.gz, and clean up.
        assert not self.finished
        self.log.close()
        for f in self.error:
            if f is not sys.stderr:  # don't close the process's stderr
                f.close()
        self.finished = True
        tar = tarfile.open("%s.tar.gz" % self.dir, "w:gz")
        tar.add(self.dir)
        tar.close()
        shutil.rmtree(self.dir)
        print "Job %s completed!" % self.id
        print "Outputs saved to %s.tar.gz" % self.dir