Skip to content

Instantly share code, notes, and snippets.

@jayrbolton
Last active January 14, 2019 22:55
Show Gist options
  • Save jayrbolton/38febac7c21cf29ff8e32a725f800ac7 to your computer and use it in GitHub Desktop.
Save jayrbolton/38febac7c21cf29ff8e32a725f800ac7 to your computer and use it in GitHub Desktop.
# Define a Docker workflow (independent of KBase), specifying:
# - input and output files
# - minimum node requirements for a job
# - whether to exit on any failure or continue on failure
# - environment variables to pass through
# - the execution backend (HTCondor, etc.)
# - automatic serial vs. concurrent execution, inferred from task inputs and outputs
def subsample():
    """Build the Task that runs ``reformat.sh`` in the ``jgi/subsample`` image.

    The task declares ``reads.fastq`` as its only input file, asks for a node
    with at least 16GB of memory, and writes ``subsample.fastq.gz`` together
    with several histogram text files.

    Returns:
        A ``Task`` describing the container image, inputs, node requirements
        and command line.
    """
    # Input/output arguments of the command.
    io_args = ['in=reads.fastq', 'out=subsample.fastq.gz']
    # One output file per requested histogram.
    histogram_args = [
        'bhist=bhist.txt',
        'gchist=gchist.txt',
        'bqhist=bqhist.txt',
        'obqhist=obqhist.txt',
    ]
    # Remaining flags are passed through to reformat.sh unchanged.
    option_args = [
        'samplerate=0.1',
        'qin=33',
        'ow=t',
        'gcplot=t',
        'gcbins=auto',
    ]
    full_command = ['reformat.sh'] + io_args + histogram_args + option_args
    memory_floor = {'min_memory': '16GB'}
    return Task(
        image='jgi/subsample',
        input_files=['reads.fastq'],
        node_requirements=memory_floor,
        command=full_command,
    )
def mer_sampling(env=None):
    """Build the Task that runs ``bbcountunique.sh`` in the ``jgi/mer_sampling`` image.

    The task declares ``reads.fastq`` as its only input file and asks for a
    node with at least 2 cores and 8GB of memory.

    Args:
        env: Accepted for backward compatibility but not used when building
            the task. It is optional so that the no-argument call made by
            ``jgi_read_qc`` does not raise ``TypeError``; that caller attaches
            the environment afterwards through ``.set_env(env)``.

    Returns:
        A ``Task`` describing the container image, inputs, node requirements
        and command line.
    """
    return Task(
        image='jgi/mer_sampling',
        input_files=['reads.fastq'],
        node_requirements={
            'min_cores': 2,
            'min_memory': '8GB',
        },
        command=[
            'bbcountunique.sh',
            'in=reads.fastq',
            'out=merSampler.m25.e25000_2',
            'k=25',
            'percent=t',
            'count=t',
            'cumulative=f',
            'ow=t',
        ],
    )
def dedupe():
    """Build the Task that consumes the ``subsample`` input file.

    The task runs ``reformat.sh`` in the ``jgi/subsample`` image, reading the
    declared input ``subsample`` and writing ``subsample.fastqc.gz`` together
    with several histogram text files. No node requirements are set.

    NOTE(review): apart from the in/out names, the command is the same as the
    one built by ``subsample`` (same image, same script, same flags). It looks
    like a placeholder for a real de-duplication command; confirm.

    Returns:
        A ``Task`` describing the container image, inputs and command line.
    """
    # Input/output arguments of the command.
    io_args = ['in=subsample', 'out=subsample.fastqc.gz']
    # One output file per requested histogram.
    histogram_args = [
        'bhist=bhist.txt',
        'gchist=gchist.txt',
        'bqhist=bqhist.txt',
        'obqhist=obqhist.txt',
    ]
    # Remaining flags are passed through to reformat.sh unchanged.
    option_args = [
        'samplerate=0.1',
        'qin=33',
        'ow=t',
        'gcplot=t',
        'gcbins=auto',
    ]
    full_command = ['reformat.sh'] + io_args + histogram_args + option_args
    return Task(
        image='jgi/subsample',
        input_files=['subsample'],
        command=full_command,
    )
def sketch(db):
    """Build the Task that runs ``sendsketch.sh`` in the ``jgi/sketch`` image.

    Args:
        db: Database name as a string. It is used twice: embedded in the
            output file name (``sketch_vs_<db>.txt``) and passed as the last
            command-line argument.

    Returns:
        A ``Task`` that declares ``infile`` as its only input file.
    """
    # join() raises TypeError for a non-string db, like '+' concatenation.
    out_arg = ''.join(['out=sketch_vs_', db, '.txt'])
    sketch_command = [
        'sendsketch.sh',
        'in=infile',
        out_arg,
        'ow=t',
        'colors=f',
        'printtaxa=t',
        'depth',
        'depth2',
        'unique2',
        'merge',
        db,
    ]
    return Task(
        image='jgi/sketch',
        input_files=['infile'],
        command=sketch_command,
    )
def jgi_read_qc(fastq_path, env):
    """Assemble the full Read QC workflow.

    Builds the subsample, mer-sampling, dedupe and per-database sketch tasks,
    binds their input files, applies ``env`` to each of them, and groups
    them into one ``Tasks`` collection.

    Args:
        fastq_path: Path of the reads file bound to the ``reads.fastq`` input
            of the subsample and mer-sampling tasks and to the ``infile``
            input of every sketch task.
        env: Environment passed to ``set_env`` on every task.

    Returns:
        A ``Tasks`` collection with the keys ``subsample``, ``dedupe``,
        ``mer_sampling`` and ``sketch_tasks``; the last is a dict keyed by
        ``'sketch_task_' + db``.
    """
    sketch_dbs = ["nt", "refseq", "silva"]

    # Multi-line method chains must be parenthesised; a continuation line
    # that starts with '.' is otherwise a syntax error.
    subsample_task = (
        subsample()
        .inputs({'reads.fastq': fastq_path})
        .set_env(env)
    )
    # Must match the 'out=' file name in the subsample command.
    subsample_outfile = subsample_task.outfile('subsample.fastq.gz')

    # mer_sampling declares an ``env`` parameter, so pass it explicitly.
    mer_sampling_task = (
        mer_sampling(env)
        .inputs({'reads.fastq': fastq_path})
        .set_env(env)
    )

    # Dict form of inputs(), consistent with the other tasks in this function.
    dedupe_task = (
        dedupe()
        .inputs({'subsample': subsample_outfile})
        .set_env(env)
    )

    sketch_tasks = {
        'sketch_task_' + db: sketch(db).inputs({'infile': fastq_path}).set_env(env)
        for db in sketch_dbs
    }

    return Tasks({
        'subsample': subsample_task,
        'dedupe': dedupe_task,
        'mer_sampling': mer_sampling_task,
        'sketch_tasks': sketch_tasks,
    })
# Interfacing with KBase
# - define kbase fields and forms using python
# - take values from the form and pass it to the task
# - take output from the task and make kbase objects and a final report
def narrative_form(params):
    """
    Define a form that can go in a KBase narrative cell.

    The form holds a single text field labelled 'Reads Input' whose type is
    'KBaseAssembly.PairedEndLibrary'.

    Args:
        params: Not used by this function.

    Returns:
        A ``KBaseForm`` wrapping the one field.
    """
    reads_field_spec = {
        'type': 'KBaseAssembly.PairedEndLibrary',
        'label': 'Reads Input',
        'description': 'This is a description',
    }
    reads_field = KBaseForm.text_field(reads_field_spec)
    return KBaseForm([reads_field])
def run_task(form_data, task, env):
    """
    Take the data from a narrative cell and create the jgi_read_qc task from it.

    Args:
        form_data: Mapping read as ``form_data['reads']['fastq_path']``.
        task: Not used by this function.
        env: Forwarded unchanged to ``jgi_read_qc``.

    Returns:
        Whatever ``jgi_read_qc`` returns for the extracted path.
    """
    # NOTE(review): nothing is downloaded here; the fastq path is presumably
    # already present in the form data by the time this runs -- confirm.
    reads_entry = form_data['reads']
    return jgi_read_qc(reads_entry['fastq_path'], env)
def task_output_to_kbase(results, form_data):
    """Create a report from the readqc task results.

    Args:
        results: Nested mapping of task results; only
            ``results['subsample']['bhist.txt']`` is read here.
        form_data: The original form data. Not used by this function.

    Returns:
        A ``KBaseReport`` with an HTML directory, one file entry and no
        object references.
    """
    base_histogram = {
        'title': 'Base Frequency Histogram',
        'data': results['subsample']['bhist.txt'],
    }
    report_files = {
        'bhist.txt': base_histogram,
        # etc..
    }
    object_refs = {
        # .. kbase object references
    }
    return KBaseReport(
        html='path/to/html/directory',
        files=report_files,
        objects=object_refs,
    )
# convert some output
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment