Last active
November 25, 2016 04:01
-
-
Save kkleidal/e8635593af3277af82c2ea5cf1a539f6 to your computer and use it in GitHub Desktop.
For executing long-running ML jobs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import gensim | |
from job import Job | |
import numpy as np | |
import matplotlib.pyplot as plt | |
from sklearn.decomposition import PCA | |
# Load Google's pre-trained Word2Vec model. | |
# model = gensim.models.Word2Vec.load_word2vec_format('./data/GoogleNews-vectors-negative300.bin', binary=True) | |
def pca_plot(points, training_points=None):
    """Return a labelled scatter plot of word vectors projected onto 2 PCA axes.

    Each entry of ``points`` is looked up in the module-level ``model`` (the
    object assigned from ``Word2Vec.load`` in the script section of this
    file), a 2-component PCA is fitted on the vectors of ``training_points``,
    and the vectors of ``points`` are projected onto the fitted axes.

    Args:
        points: iterable of keys accepted by ``model[...]``; each one becomes
            a labelled dot in the figure.
        training_points: iterable of keys whose vectors define the PCA axes.
            Defaults to ``points`` itself.

    Returns:
        The matplotlib figure holding the scatter plot (not saved or shown
        here; the caller decides what to do with it).

    Any exception raised by a ``model[...]`` lookup or by PCA propagates to
    the caller.
    """
    # ``points`` is traversed twice (vector lookup, then labelling), so a
    # one-shot iterable must be materialised or the labels silently vanish.
    points = list(points)
    if training_points is None:
        training_points = points

    media_pca = PCA(n_components=2, random_state=0)
    training_vecs = [model[source] for source in training_points]
    vecs = [model[source] for source in points]

    # Only the fitted axes are needed from the training set, so fit() is
    # enough; fit_transform() would also project data that is then discarded.
    media_pca.fit(training_vecs)
    low_vecs = np.asarray(media_pca.transform(vecs))

    fig, ax = plt.subplots()
    ax.scatter(low_vecs[:, 0], low_vecs[:, 1])
    # Rows of low_vecs are in the same order as points (n_components=2, so
    # each row unpacks into exactly an x and a y coordinate).
    for txt, (x, y) in zip(points, low_vecs):
        ax.annotate(txt, (x, y))
    return fig
def pca_plot_data(points, training_points=None):
    """Return the 2-D PCA coordinates of word vectors, paired with their keys.

    This is the data-only counterpart of a scatter plot: each entry of
    ``points`` is looked up in the module-level ``model`` (the object assigned
    from ``Word2Vec.load`` in the script section of this file), a 2-component
    PCA is fitted on the vectors of ``training_points``, and the vectors of
    ``points`` are projected onto the fitted axes.

    Args:
        points: iterable of keys accepted by ``model[...]``.
        training_points: iterable of keys whose vectors define the PCA axes.
            Defaults to ``points`` itself.

    Returns:
        A list of ``(key, coords)`` pairs in the order of ``points``, where
        ``coords`` is the row of the projected array for that key.

    Any exception raised by a ``model[...]`` lookup or by PCA propagates to
    the caller.
    """
    # ``points`` is traversed twice (vector lookup, then pairing), so a
    # one-shot iterable must be materialised or the result would be empty.
    points = list(points)
    if training_points is None:
        training_points = points

    media_pca = PCA(n_components=2, random_state=0)
    training_vecs = [model[source] for source in training_points]
    vecs = [model[source] for source in points]

    # Only the fitted axes are needed from the training set, so fit() is
    # enough; fit_transform() would also project data that is then discarded.
    media_pca.fit(training_vecs)
    low_vecs = np.asarray(media_pca.transform(vecs))

    # Return a concrete list so the result can be iterated repeatedly and
    # serialised the same way regardless of how the interpreter implements zip.
    return list(zip(points, low_vecs))
# Run the word-vector experiments through a Job so that every result, error
# and artefact is captured on disk.
#
# NOTE: Job.run writes the source text of the callable it is given into the
# log, so each job.run(...) call is deliberately kept on a single line.
job = Job()
try:
    # Every later step needs the model, so a failed load aborts the whole job.
    # log_output=False keeps the str() of the model object out of the log.
    model = job.run(lambda: gensim.models.Word2Vec.load('./data/v5/word_vectors.word2vec'), log_output=False, abort_on_fail=True)

    # Analogy / nearest-neighbour queries; results are written to the log.
    job.run(lambda: model.most_similar(positive=['fox', 'democrat'], negative=['republican']))
    job.run(lambda: model.most_similar(positive=['fox'], negative=['republican']))
    job.run(lambda: model.most_similar(positive=['fox'], negative=['democrat']))
    job.run(lambda: model.most_similar(positive=['fox', 'democrat'], negative=[]))
    job.run(lambda: model.most_similar(positive=['clinton'], topn=10))
    job.run(lambda: model.most_similar(positive=['hillary', 'republican'], negative=['democrat'], topn=10))
    job.run(lambda: model.most_similar(positive=['clinton', 'donald'], negative=['hillary'], topn=10))

    # 2-D PCA coordinates stored as data files, one per named word set.
    job.data(lambda: pca_plot_data(["trump", "clinton", "republican", "democrat", "obama", "sanders", "johnson", "bush", "pence"]), name="Politicians Plot 1")
    job.data(lambda: pca_plot_data(["trump", "clinton", "republican", "democrat", "obama", "sanders", "johnson", "bush", "pence", "smart"]), name="Politicians Plot 2")
    job.data(lambda: pca_plot_data(["fox", "msnbc", "republican", "democrat"]), name="Media Plot")
    job.data(lambda: pca_plot_data(["trump", "clinton", "republican", "democrat"]), name="Candidates Plot")

    # The same candidate word set, rendered as a figure as well.
    job.plot(lambda: pca_plot(["trump", "clinton", "republican", "democrat"]), name="Candidates Plot")
finally:
    # Always close and archive the job, even if a step raised unexpectedly.
    # An abort_on_fail failure has already finished the job, hence the guard.
    if not job.finished:
        job.finish()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import inspect | |
import uuid | |
import os | |
import tarfile | |
import shutil | |
import sys | |
import traceback | |
import cPickle | |
def safe_filename(s):
    """Return ``s`` with every non-alphanumeric character removed.

    Spaces, punctuation and path separators are all dropped, so the result
    can be embedded in a file name. The result may be empty.
    """
    kept = []
    for ch in s:
        if ch.isalnum():
            kept.append(ch)
    return "".join(kept)
class Job:
    """Capture the outputs of a multi-step job in a single ``.tar.gz`` archive.

    Creating a Job makes a working directory ``<id>.job`` containing
    ``output.log`` and ``error.log``. Steps are executed through ``run``
    (result logged), ``data`` (result pickled) and ``plot`` (figure saved as
    PNG). A failing step is recorded in the error sinks and, unless
    ``abort_on_fail`` is set, the job carries on with the next step.
    ``finish`` closes the logs, packs the directory into ``<id>.job.tar.gz``
    and removes the directory.
    """

    # Starting values of the counters used to build default step names. The
    # first ``+=`` on an instance shadows them with per-instance attributes.
    d = 1
    fig = 1
    dataf = 1

    def __init__(self, job_id=None, print_err_to_stderr=True):
        """Create the working directory and open the log files.

        Args:
            job_id: identifier used for the directory and archive names; a
                fresh ``uuid.uuid1()`` string is used when omitted.
            print_err_to_stderr: when true, failure reports are also written
                to ``sys.stderr`` in addition to ``error.log``.

        ``os.makedirs`` is called on ``<id>.job``, so an already existing
        directory with that name makes construction fail.
        """
        if job_id is None:
            self.id = str(uuid.uuid1())
        else:
            self.id = job_id
        self.dir = "%s.job" % self.id
        os.makedirs(self.dir)
        self.log = self.output_file("output.log")
        self.error = [self.output_file("error.log")]
        # Remember the borrowed stream so finish() never closes it.
        self._stderr = None
        if print_err_to_stderr:
            self._stderr = sys.stderr
            self.error.append(sys.stderr)
        self.finished = False

    def output_file(self, name, mode="w"):
        """Open ``name`` inside the job directory and return the file object.

        Args:
            name: file name relative to the job directory.
            mode: mode passed to ``open``; defaults to text write.
        """
        return open(os.path.join(self.dir, name), mode)

    def _require_open(self):
        """Raise AssertionError if the job has already been finished."""
        # Explicit raise instead of ``assert`` so the check survives ``-O``.
        if self.finished:
            raise AssertionError("job %s has already been finished" % self.id)

    def _describe(self, f):
        """Return the source text of ``f``, or its repr when unavailable."""
        try:
            return inspect.getsource(f).strip()
        except (IOError, OSError, TypeError):
            # No retrievable source (e.g. builtins, interactively defined
            # callables); a missing description must not stop the step.
            return repr(f)

    def _report_failure(self, kind, name, abort_on_fail):
        """Write the active exception to every error sink; optionally abort.

        Must be called from inside an ``except`` block. When
        ``abort_on_fail`` is true the job is finished (archived) and an
        AssertionError is raised.
        """
        stacktrace = traceback.format_exc()
        for sink in self.error:
            sink.write("%s %s:\n%s\n\n" % (kind, name, stacktrace))
        if abort_on_fail:
            self.finish()
            raise AssertionError("%s %s failed; job aborted" % (kind, name))

    def run(self, f, name=None, log_output=True, abort_on_fail=False):
        """Call ``f()`` and record the call and its result in ``output.log``.

        Args:
            f: zero-argument callable to execute.
            name: label used in the logs; defaults to the run counter.
            log_output: when true, ``str()`` of the result is logged.
            abort_on_fail: when true, a failure finishes the job and raises.

        Returns:
            The value returned by ``f``, or ``None`` if it raised.

        Raises:
            AssertionError: if the job is already finished, or if the step
                failed and ``abort_on_fail`` is true.
        """
        if name is None:
            name = str(self.d)
        self._require_open()
        self.log.write("%s: %s\n" % (name, self._describe(f)))
        try:
            out = f()
            if log_output:
                self.log.write("%s\n" % str(out))
        except Exception:
            # KeyboardInterrupt / SystemExit are deliberately not caught so
            # the user can still stop a long-running job.
            self.log.write("Failed. See error.log for details.\n")
            out = None
            self._report_failure("Run", name, abort_on_fail)
        self.log.write("\n")
        self.d += 1
        return out

    def data(self, contents_f, name=None, abort_on_fail=False):
        """Call ``contents_f()`` and pickle its result into the job directory.

        The file is named ``data_<safe name>.cpkl``.

        Args:
            contents_f: zero-argument callable producing a picklable object.
            name: label for the file and error reports; defaults to the data
                counter.
            abort_on_fail: when true, a failure finishes the job and raises.

        Raises:
            AssertionError: if the job is already finished, or if the step
                failed and ``abort_on_fail`` is true.
        """
        if name is None:
            name = str(self.dataf)
        self._require_open()
        try:
            contents = contents_f()
            # Binary mode: pickle output is bytes and must not go through
            # newline translation.
            with self.output_file("data_%s.cpkl" % safe_filename(name), "wb") as out:
                cPickle.dump(contents, out)
        except Exception:
            self._report_failure("Data", name, abort_on_fail)
        self.dataf += 1

    def plot(self, fig_f, name=None, abort_on_fail=False):
        """Call ``fig_f()`` and save the returned figure as a 600-dpi PNG.

        The file is named ``fig_<safe name>.png``; the returned object only
        needs a ``savefig`` method.

        Args:
            fig_f: zero-argument callable returning the figure.
            name: label for the file and error reports; defaults to the
                figure counter.
            abort_on_fail: when true, a failure finishes the job and raises.

        Raises:
            AssertionError: if the job is already finished, or if the step
                failed and ``abort_on_fail`` is true.
        """
        if name is None:
            name = str(self.fig)
        self._require_open()
        try:
            fig = fig_f()
            fig.savefig(os.path.join(self.dir, "fig_%s.png" % safe_filename(name)), format='png', dpi=600)
        except Exception:
            self._report_failure("Figure", name, abort_on_fail)
        self.fig += 1

    def finish(self):
        """Close the logs, archive the job directory and delete it.

        Raises:
            AssertionError: if the job has already been finished.
        """
        self._require_open()
        self.log.close()
        for sink in self.error:
            # Close only what this job opened; closing the process-wide
            # stderr would silence every later traceback.
            if sink is not self._stderr:
                sink.close()
        self.finished = True
        with tarfile.open("%s.tar.gz" % self.dir, "w:gz") as tar:
            tar.add(self.dir)
        shutil.rmtree(self.dir)
        print("Job %s completed!" % self.id)
        print("Outputs saved to %s.tar.gz" % self.dir)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment