Skip to content

Instantly share code, notes, and snippets.

@cbare
Last active August 29, 2015 14:10
Show Gist options
  • Select an option

  • Save cbare/6429bdace1f03065b8e6 to your computer and use it in GitHub Desktop.

Select an option

Save cbare/6429bdace1f03065b8e6 to your computer and use it in GitHub Desktop.
Read leaderboards and submission queue, build submission tarballs and metadata as .csv
################################################################
## get submissions and metadata for Synthetic challenge 4
################################################################
import synapseclient
from synapseclient import Activity
from synapseclient import Entity, Project, Folder, File
from synapseclient import Evaluation, Submission, SubmissionStatus
import os
import pandas as pd
import re
import tarfile
this_script = "https://gist.github.com/cbare/6429bdace1f03065b8e6"
scoring_code = "https://github.com/Sage-Bionetworks/DREAM-WGSMutationBenchmark"
def get_submissions(id, status=None):
    """
    Generate one 9-tuple per submission bundle of an evaluation queue.

    `id` and `status` are handed unchanged to `syn._getSubmissionBundles`
    (`syn` is the module-level Synapse client). Each yielded tuple holds, in
    order: id, name, entityId, versionNumber, submitterAlias, userId and
    createdOn taken from the bundle's 'submission' record, followed by
    modifiedOn and status taken from its 'submissionStatus' record. Keys that
    are absent come back as None.
    """
    submission_keys = ('id', 'name', 'entityId', 'versionNumber',
                       'submitterAlias', 'userId', 'createdOn')
    status_keys = ('modifiedOn', 'status')
    for bundle in syn._getSubmissionBundles(id, status):
        record = bundle['submission']
        # Distinct name so the `status` filter argument is not shadowed.
        status_record = bundle['submissionStatus']
        yield (tuple(record.get(key) for key in submission_keys) +
               tuple(status_record.get(key) for key in status_keys))
def submissions2df(id, status="SCORED"):
    """
    Build a pandas DataFrame from the tuples produced by
    `get_submissions(id, status)`, one row per submission, with the columns
    named after the fields those tuples carry.
    """
    rows = get_submissions(id, status)
    return pd.DataFrame(
        data=rows,
        columns=[
            'id',
            'name',
            'entityId',
            'versionNumber',
            'submitterAlias',
            'userId',
            'createdOn',
            'modifiedOn',
            'status',
        ])
def unescape(a):
    """
    Return `a` with every backslash-underscore pair replaced by a plain
    underscore.
    """
    # The backslash is written as an explicit escape: a bare backslash before
    # "_" is an invalid escape sequence that newer interpreters warn about.
    return a.replace("\\_", "_")
def remove_synapse_project_links(a):
    """
    Replace each markdown link of the form `[text](#!Synapse:synNNN)` found in
    `a` with just its `text`; anything else is returned unchanged.
    """
    synapse_link = re.compile(r"\[(.*?)\]\(\#\!Synapse:syn\d+\)")
    return synapse_link.sub("\\1", a)
def lb2df(lb):
    """
    After cutting-n-pasting the leaderboard from the wiki page, this converts
    the text into a data frame.

    `lb` is pipe-delimited text. Empty lines are ignored. The first remaining
    line supplies the column names (one trailing "|" is dropped and escaped
    underscores are unescaped), the second remaining line is skipped
    (presumably the markdown header separator -- confirm against the wiki
    export), and every later line becomes one row whose cells are unescaped
    and stripped of Synapse project links.

    Returns a pandas DataFrame of strings. Raises IndexError when `lb`
    contains no non-empty line.
    """
    lines = [line for line in lb.split("\n") if line]
    header = re.sub(r"\|$", "", lines[0])
    colnames = [unescape(a) for a in header.split("|")]
    expected_width = len(colnames)
    data = []
    for line in lines[2:]:
        cells = line.split("|")
        # The header tolerates a trailing "|"; give data rows the same
        # treatment. Only drop the last cell when it is empty AND surplus, so
        # a legitimately empty final column is still preserved.
        if len(cells) == expected_width + 1 and cells[-1] == "":
            cells.pop()
        data.append([remove_synapse_project_links(unescape(a)) for a in cells])
    return pd.DataFrame.from_records(data=data, columns=colnames)
## Maybe it's better to just get a list of synapse IDs and then get the submissions
## off the scoring machine?
def download_submissions_as_submission_id(id, downloadLocation='.', status=None):
submissions = syn.getSubmissions(id, status)
for submission in submissions:
print submission.id, submission.name,
submission = syn.getSubmission(submission.id, downloadFile=True, downloadLocation=downloadLocation)
print submission.filePath,
_, ext = os.path.splitext(submission.filePath)
os.rename(submission.filePath, os.path.join(downloadLocation, "%s%s" % (submission.id, ext)))
print "=>", "%s%s" % (submission.id, ext)
## synthetic challenge 4
## Synapse IDs that get interpolated into the `synapse store` command lines
## printed by the loop below. That loop looks them up BY NAME through
## `.format(**locals())`, so renaming any of these requires editing the
## format strings as well.
truth = "syn2495457"
normal = "syn2495934"
tumor = "syn2495935"
smc_dna = "syn312572"
## One entry per sub-challenge:
##   type             - suffix used to build the output .csv / .tgz file names
##   leaderboard_file - local file holding the leaderboard text fed to lb2df
##   eval_id          - evaluation queue id passed to submissions2df
##   leaderboard_url  - wiki page URL; not read anywhere in this script
subchals = [
    {"type" : "snv",
     "leaderboard_file" : "leaderboard_is4_snv.md",
     "eval_id" : 2495938,
     "leaderboard_url" : "https://www.synapse.org/#!Synapse:syn312572/wiki/64937"},
    {"type" : "indel",
     "leaderboard_file" : "leaderboard_is4_indel.md",
     "eval_id" : 2495940,
     "leaderboard_url" : "https://www.synapse.org/#!Synapse:syn312572/wiki/64937"},
    {"type" : "sv",
     "leaderboard_file" : "leaderboard_is4_sv.md",
     "eval_id" : 2495936,
     "leaderboard_url" : "https://www.synapse.org/#!Synapse:syn312572/wiki/64937"}]
## Machine-specific directory; besides being created here it is only used by
## the commented-out download code further down.
submissions_download_location = '/Users/chris/Documents/work/projects/mutation_calling_challenge/submissions/synth_4'
## os.mkdir creates a single directory level, so the parent must already exist.
if not os.path.exists(submissions_download_location):
    os.mkdir(submissions_download_location)
## Module-level Synapse client shared by get_submissions and
## download_submissions_as_submission_id; login() is called without arguments.
syn = synapseclient.Synapse()
syn.login()
## For every sub-challenge: join the pasted leaderboard with the SCORED
## submissions' metadata, write the result as a CSV, then print the shell
## commands to run by hand to build and upload the artifacts.
for subchal in subchals:
    ## parse the leaderboard text saved from the wiki page
    with open(subchal['leaderboard_file']) as f:
        lbdf = lb2df(f.read())
    ## metadata of the scored submissions in this sub-challenge's queue
    subdf = submissions2df(subchal['eval_id'], status="SCORED")
    ## left join: keep every leaderboard row, matching leaderboard 'ID'
    ## against submission 'id'
    df = pd.merge(lbdf, subdf, how='left', left_on='ID', right_on='id')
    leaderboard_filename = "leaderboard_is4_"+subchal['type']+".csv"
    df.to_csv(leaderboard_filename, encoding='utf-8')
    submissions_tar_filename = "submissions_is4_"+subchal['type']+".tgz"
    ## Nothing below executes a command; the command lines are only printed.
    print "\n"
    print "#" * 90
    print "Run these commands in /home/ubuntu/DREAM-WGSMutationBenchmark/status_db on the scoring machine:"
    print "-" * 90
    ## one "<ID>.*" glob per leaderboard row
    print "tar -czvf", submissions_tar_filename, " ".join([ "%s.*" % i for i in df['ID'].values ])
    ## The two adjacent string literals are joined before .format() is applied.
    ## The {placeholders} are filled from variables in scope via locals(), so
    ## the variable names used in this script must match them exactly.
    print "synapse -u [username] store --file {submissions_tar_filename} --parentId syn2758129 " \
          " --used {normal} {tumor} {smc_dna} --executed \"{this_script}\"".format(**locals())
    print "\nRun this commands in %s on the local machine:" % os.getcwd()
    print "-" * 90
    ## [submissions_SynID] is printed literally; it is not a format placeholder
    ## (presumably to be replaced by hand with the uploaded tarball's ID).
    print "synapse -u [username] store --file {leaderboard_filename} --parentId syn2758129 " \
          " --used [submissions_SynID] {truth} {smc_dna} --executed \"{scoring_code}\" \"{this_script}\"".format(**locals())
## This finally worked after a struggle with some bugs, but it might be better
## to just get the files out of the scoring machine's status_db directory.
# submissions_dir = os.path.join(submissions_download_location,subchal['type'])
# if not os.path.exists(submissions_download_location):
# os.mkdir(submissions_dir)
# download_submissions_as_submission_id(subchal['eval_id'], downloadLocation=submissions_dir, status="SCORED")
# submissions_tar_filename = "submissions_is4_"+subchal['type']+".tgz"
# with tarfile.open(submissions_tar_filename, "w|gz") as tar:
# tar.add(submissions_dir)
## upload submissions tarballs to Synapse
# synapse -u chris.bare store --file submissions_is4_snv.tgz --parentId syn2758129 --used syn2495934 syn2495935 syn312572 --executed "https://gist.github.com/cbare/6429bdace1f03065b8e6"
# synapse -u chris.bare store --file submissions_is4_indel.tgz --parentId syn2758129 --used syn2495934 syn2495935 syn312572 --executed "https://gist.github.com/cbare/6429bdace1f03065b8e6"
# synapse -u chris.bare store --file submissions_is4_sv.tgz --parentId syn2758129 --used syn2495934 syn2495935 syn312572 --executed "https://gist.github.com/cbare/6429bdace1f03065b8e6"
## upload leaderboards in markdown format
# synapse -u chris.bare store --file leaderboard_is4_snv.md --parentId syn2758129 --used syn2866432 syn2495457 syn312572 --executed "https://github.com/Sage-Bionetworks/DREAM-WGSMutationBenchmark"
# synapse -u chris.bare store --file leaderboard_is4_indel.md --parentId syn2758129 --used syn2866440 syn2495457 syn312572 --executed "https://github.com/Sage-Bionetworks/DREAM-WGSMutationBenchmark"
# synapse -u chris.bare store --file leaderboard_is4_sv.md --parentId syn2758129 --used syn2866443 syn2495457 syn312572 --executed "https://github.com/Sage-Bionetworks/DREAM-WGSMutationBenchmark"
## upload leaderboards as CSVs
# synapse -u chris.bare store --file leaderboard_is4_snv.csv --parentId syn2758129 --used syn2866447 --executed "https://gist.github.com/cbare/6429bdace1f03065b8e6"
# synapse -u chris.bare store --file leaderboard_is4_indel.csv --parentId syn2758129 --used syn2866449 --executed "https://gist.github.com/cbare/6429bdace1f03065b8e6"
# synapse -u chris.bare store --file leaderboard_is4_sv.csv --parentId syn2758129 --used syn2866451 --executed "https://gist.github.com/cbare/6429bdace1f03065b8e6"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment