Last active
August 29, 2015 14:10
-
-
Save cbare/6429bdace1f03065b8e6 to your computer and use it in GitHub Desktop.
Read leaderboards and submission queue, build submission tarballs and metadata as .csv
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| ################################################################ | |
| ## get submissions and metadata for Synthetic challenge 4 | |
| ################################################################ | |
| import synapseclient | |
| from synapseclient import Activity | |
| from synapseclient import Entity, Project, Folder, File | |
| from synapseclient import Evaluation, Submission, SubmissionStatus | |
| import os | |
| import pandas as pd | |
| import re | |
| import tarfile | |
# Provenance URLs. Both are interpolated into the `--executed` argument of the
# `synapse store` commands this script prints.
this_script = "https://gist.github.com/cbare/6429bdace1f03065b8e6"
scoring_code = "https://github.com/Sage-Bionetworks/DREAM-WGSMutationBenchmark"
def get_submissions(id, status=None):
    """
    Yield one tuple of selected fields for every submission bundle of an
    evaluation.

    Uses the module-level ``syn`` client, which must be bound before the
    generator is first iterated.

    id     -- evaluation identifier, passed straight to
              ``syn._getSubmissionBundles`` (the name shadows the builtin but
              is kept so keyword callers keep working)
    status -- optional status filter, passed straight through as well

    Each tuple is ordered as: id, name, entityId, versionNumber,
    submitterAlias, userId, createdOn (all read from the bundle's
    'submission'), then modifiedOn and status (read from the bundle's
    'submissionStatus'). Missing keys come back as None.
    """
    for bundle in syn._getSubmissionBundles(id, status):
        submission = bundle['submission']
        # Kept under its own name so the `status` filter argument is not rebound.
        submission_status = bundle['submissionStatus']
        yield (
            submission.get('id'),
            submission.get('name'),
            submission.get('entityId'),
            submission.get('versionNumber', None),
            submission.get('submitterAlias', None),
            submission.get('userId'),
            submission.get('createdOn'),
            submission_status.get('modifiedOn'),
            submission_status.get('status'),
        )
def submissions2df(id, status="SCORED"):
    """
    Collect the tuples produced by get_submissions(id, status) into a data
    frame with one row per submission.

    Columns, in order: id, name, entityId, versionNumber, submitterAlias,
    userId, createdOn, modifiedOn, status.
    """
    colnames = [
        'id', 'name', 'entityId', 'versionNumber', 'submitterAlias',
        'userId', 'createdOn', 'modifiedOn', 'status',
    ]
    # Materialise the generator so the rows reach pandas as a plain list.
    rows = list(get_submissions(id, status))
    return pd.DataFrame(data=rows, columns=colnames)
def unescape(a):
    """
    Return `a` with every backslash-escaped underscore replaced by a plain
    underscore.
    """
    # "\\_" is the same two-character string the old "\_" literal produced,
    # written without relying on an invalid escape sequence.
    return a.replace("\\_", "_")
def remove_synapse_project_links(a):
    """
    Replace every markdown link of the form ``[text](#!Synapse:syn<digits>)``
    in `a` with just ``text``; any other text is left untouched.
    """
    return re.sub(r"\[(.*?)\]\(\#\!Synapse:syn\d+\)", r"\1", a)
def lb2df(lb):
    """
    After cutting-n-pasting the leaderboard from the wiki page, this converts
    the text into a data frame.

    Empty lines are dropped first. The first remaining line supplies the
    pipe-separated column names (one trailing "|" is removed). The second
    remaining line is skipped -- presumably the markdown header separator;
    confirm against the wiki export. Every later line becomes one row.

    Escaped underscores are undone in column names and cells, and Synapse
    project links in cells are reduced to their link text.

    Raises IndexError when `lb` contains no non-empty line.
    """
    lines = [line for line in lb.split("\n") if line]
    header = re.sub(r"\|$", "", lines[0])
    colnames = [unescape(name) for name in header.split("|")]
    ncols = len(colnames)

    data = []
    for line in lines[2:]:
        cells = line.split("|")
        # The header tolerates one trailing "|"; give rows the same treatment.
        # Without this, such a row has one empty cell more than there are
        # columns and cannot be loaded against `colnames`.
        if len(cells) == ncols + 1 and cells[-1] == "":
            cells.pop()
        data.append([remove_synapse_project_links(unescape(cell)) for cell in cells])
    return pd.DataFrame.from_records(data=data, columns=colnames)
| ## Maybe it's better to just get a list of synapse IDs and then get the submissions | |
| ## off the scoring machine? | |
def download_submissions_as_submission_id(id, downloadLocation='.', status=None):
    """
    Download the file of every submission to an evaluation and rename it to
    "<submission id><original extension>" inside `downloadLocation`.

    Uses the module-level ``syn`` client.

    id               -- evaluation identifier, passed to ``syn.getSubmissions``
    downloadLocation -- directory the files are downloaded to and renamed in
    status           -- optional status filter, passed to ``syn.getSubmissions``

    One progress line per submission is written to stdout:
    "<id> <name> <downloaded path> => <new file name>".
    """
    # Local import: the file's import block does not bring in `sys`.
    import sys

    for listed in syn.getSubmissions(id, status):
        # Written in pieces, without a newline, so the id and name are on
        # screen before the (possibly slow) download starts. stdout.write
        # behaves the same on Python 2 and 3, unlike the print statement.
        sys.stdout.write("%s %s " % (listed.id, listed.name))
        submission = syn.getSubmission(listed.id, downloadFile=True, downloadLocation=downloadLocation)
        sys.stdout.write("%s " % (submission.filePath,))
        _, ext = os.path.splitext(submission.filePath)
        new_name = "%s%s" % (submission.id, ext)
        os.rename(submission.filePath, os.path.join(downloadLocation, new_name))
        sys.stdout.write("=> %s\n" % (new_name,))
## synthetic challenge 4

# Synapse IDs interpolated into the `--used` arguments of the printed
# `synapse store` commands.
truth = "syn2495457"
normal = "syn2495934"
tumor = "syn2495935"
smc_dna = "syn312572"

# One entry per sub-challenge:
#   type             -- suffix used in the generated .csv / .tgz file names
#   leaderboard_file -- local markdown file holding the pasted leaderboard
#   eval_id          -- evaluation whose SCORED submissions are fetched
#   leaderboard_url  -- wiki page of the leaderboard (not read by the visible code)
subchals = [
    {"type": "snv",
     "leaderboard_file": "leaderboard_is4_snv.md",
     "eval_id": 2495938,
     "leaderboard_url": "https://www.synapse.org/#!Synapse:syn312572/wiki/64937"},
    {"type": "indel",
     "leaderboard_file": "leaderboard_is4_indel.md",
     "eval_id": 2495940,
     "leaderboard_url": "https://www.synapse.org/#!Synapse:syn312572/wiki/64937"},
    {"type": "sv",
     "leaderboard_file": "leaderboard_is4_sv.md",
     "eval_id": 2495936,
     "leaderboard_url": "https://www.synapse.org/#!Synapse:syn312572/wiki/64937"},
]
# Local directory for downloaded submissions; created on first run.
submissions_download_location = '/Users/chris/Documents/work/projects/mutation_calling_challenge/submissions/synth_4'
if not os.path.exists(submissions_download_location):
    # makedirs also creates missing parent directories, where mkdir would fail.
    os.makedirs(submissions_download_location)

# Module-level Synapse client used by the helper functions above; login()
# is called without arguments.
syn = synapseclient.Synapse()
syn.login()
# For each sub-challenge: join the pasted leaderboard with the SCORED
# submissions' metadata, write the result as a .csv, and print the shell
# commands that build and upload the submissions tarball and the leaderboard.
for subchal in subchals:
    with open(subchal['leaderboard_file']) as f:
        lbdf = lb2df(f.read())

    subdf = submissions2df(subchal['eval_id'], status="SCORED")
    # Left join: every leaderboard row is kept even if no submission matches.
    df = pd.merge(lbdf, subdf, how='left', left_on='ID', right_on='id')

    leaderboard_filename = "leaderboard_is4_"+subchal['type']+".csv"
    df.to_csv(leaderboard_filename, encoding='utf-8')

    submissions_tar_filename = "submissions_is4_"+subchal['type']+".tgz"

    # Each print() takes a single string, so the output is the same whether
    # the script runs under Python 2 or Python 3.
    print("\n")
    print("#" * 90)
    print("Run these commands in /home/ubuntu/DREAM-WGSMutationBenchmark/status_db on the scoring machine:")
    print("-" * 90)
    print("tar -czvf %s %s" % (
        submissions_tar_filename,
        " ".join(["%s.*" % i for i in df['ID'].values])))
    print(("synapse -u [username] store --file {submissions_tar_filename} --parentId syn2758129 "
           " --used {normal} {tumor} {smc_dna} --executed \"{this_script}\"").format(
               submissions_tar_filename=submissions_tar_filename,
               normal=normal,
               tumor=tumor,
               smc_dna=smc_dna,
               this_script=this_script))

    print("\nRun this commands in %s on the local machine:" % os.getcwd())
    print("-" * 90)
    # [submissions_SynID] is left as a literal placeholder in the printed command.
    print(("synapse -u [username] store --file {leaderboard_filename} --parentId syn2758129 "
           " --used [submissions_SynID] {truth} {smc_dna} --executed \"{scoring_code}\" \"{this_script}\"").format(
               leaderboard_filename=leaderboard_filename,
               truth=truth,
               smc_dna=smc_dna,
               scoring_code=scoring_code,
               this_script=this_script))
| ## This finally worked after a struggle with some bugs, but it might be better | |
| ## to just get the files out of the scoring machine's status_db directory. | |
| # submissions_dir = os.path.join(submissions_download_location,subchal['type']) | |
| # if not os.path.exists(submissions_download_location): | |
| # os.mkdir(submissions_dir) | |
| # download_submissions_as_submission_id(subchal['eval_id'], downloadLocation=submissions_dir, status="SCORED") | |
| # submissions_tar_filename = "submissions_is4_"+subchal['type']+".tgz" | |
| # with tarfile.open(submissions_tar_filename, "w|gz") as tar: | |
| # tar.add(submissions_dir) | |
| ## upload submissions tarballs to Synapse | |
| # synapse -u chris.bare store --file submissions_is4_snv.tgz --parentId syn2758129 --used syn2495934 syn2495935 syn312572 --executed "https://gist.github.com/cbare/6429bdace1f03065b8e6" | |
| # synapse -u chris.bare store --file submissions_is4_indel.tgz --parentId syn2758129 --used syn2495934 syn2495935 syn312572 --executed "https://gist.github.com/cbare/6429bdace1f03065b8e6" | |
| # synapse -u chris.bare store --file submissions_is4_sv.tgz --parentId syn2758129 --used syn2495934 syn2495935 syn312572 --executed "https://gist.github.com/cbare/6429bdace1f03065b8e6" | |
| ## upload leaderboards in markdown format | |
| # synapse -u chris.bare store --file leaderboard_is4_snv.md --parentId syn2758129 --used syn2866432 syn2495457 syn312572 --executed "https://github.com/Sage-Bionetworks/DREAM-WGSMutationBenchmark" | |
| # synapse -u chris.bare store --file leaderboard_is4_indel.md --parentId syn2758129 --used syn2866440 syn2495457 syn312572 --executed "https://github.com/Sage-Bionetworks/DREAM-WGSMutationBenchmark" | |
| # synapse -u chris.bare store --file leaderboard_is4_sv.md --parentId syn2758129 --used syn2866443 syn2495457 syn312572 --executed "https://github.com/Sage-Bionetworks/DREAM-WGSMutationBenchmark" | |
| ## upload leaderboards as CSVs | |
| # synapse -u chris.bare store --file leaderboard_is4_snv.csv --parentId syn2758129 --used syn2866447 --executed "https://gist.github.com/cbare/6429bdace1f03065b8e6" | |
| # synapse -u chris.bare store --file leaderboard_is4_indel.csv --parentId syn2758129 --used syn2866449 --executed "https://gist.github.com/cbare/6429bdace1f03065b8e6" | |
| # synapse -u chris.bare store --file leaderboard_is4_sv.csv --parentId syn2758129 --used syn2866451 --executed "https://gist.github.com/cbare/6429bdace1f03065b8e6" | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment