Skip to content

Instantly share code, notes, and snippets.

@mlin
Last active March 28, 2020 03:50
Show Gist options
  • Save mlin/6ec3a4bdea4fb0b215faa65ae0f22fd8 to your computer and use it in GitHub Desktop.
Save mlin/6ec3a4bdea4fb0b215faa65ae0f22fd8 to your computer and use it in GitHub Desktop.
paste_wdl_imports.py
#!/usr/bin/env python3
"""
Generate a standalone WDL document from a given workflow using imported tasks. Requires: miniwdl

    python3 paste_wdl_imports.py [-o STANDALONE.wdl] WORKFLOW.wdl

For each "call imported_namespace.task_name [as alias]" in the workflow, appends the task's source
code with the task name changed to "imported_namespace__task_name", and rewrites the call to refer
to this new name (keeping the original alias). Also blanks out the import statements.

This is a quick-and-dirty hack with salient limitations:
1. Can't handle imported structs
2. Can't handle imported sub-workflows (tasks only)
3. Each call, task, and import must begin on its own line of code with only whitespace preceding.

Notes on these limitations:
1. Structs would be tricky to get right, but probably doable. The problem is they can be imported
   recursively (including diamond imports) with different names in every document.
2. We can't do much about sub-workflows because a WDL document can only have one workflow.
3. Stems from a crappy regex-based approach used because the miniwdl WDL library lacks code
   generation.
"""
import re
import argparse
import WDL
def main():
    """
    Command-line entry point: paste imported tasks into a standalone copy of a workflow.

    Parses the command line (``[-o STANDALONE.wdl] WORKFLOW.wdl``), loads the workflow document,
    and for every call whose callee id has more than one component (i.e. an imported task):
    rewrites the call line to use the name "namespace__task" (keeping the call's name as alias)
    and appends the task's source, renamed accordingly, once per distinct new name. The lines of
    each import statement are then blanked, and the result is written to the -o file or, by
    default, to standard output.

    Raises AssertionError if the document has no workflow or an imported callee isn't a task.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-o",
        metavar="STANDALONE.wdl",
        help="output filename (default: standard output)",
        default=None,
    )
    parser.add_argument(
        "workflow_wdl", metavar="WORKFLOW.wdl", help="input workflow document with imports"
    )
    args = parser.parse_args()

    # load workflow document
    doc = WDL.load(args.workflow_wdl)
    if not doc.workflow:
        # raised explicitly (not via `assert`) so the check still runs under `python -O`
        raise AssertionError("document has no workflow")

    # run SetParents to facilitate getting from a called task to its containing document
    WDL.Walker.SetParents()(doc)

    # original document lines for surgery
    doc_lines = doc.source_text.split("\n")

    # for each call
    tasks_processed = set()
    for call in calls(doc.workflow):
        if len(call.callee_id) <= 1:
            continue  # not imported; leave the call untouched

        # create name for pasted task
        new_task_name = "__".join(call.callee_id)
        if not isinstance(call.callee, WDL.Task):
            raise AssertionError("can't import sub-workflows")

        # rewrite the call with the new task name; the pattern also swallows any existing
        # "as alias" so it isn't duplicated by the alias written here
        line_index = call.pos.line - 1
        doc_lines[line_index] = rewrite_line(
            doc_lines[line_index],
            "call",
            f"{new_task_name} as {call.name}",
            old_name="[0-9A-Za-z_\\.]+(\\s+as\\s+[0-9A-Za-z_]+)?",
        )

        # paste the task source only the first time this new name is seen
        if new_task_name not in tasks_processed:
            task_lines = task_source_lines(call.callee)
            task_lines[0] = rewrite_line(task_lines[0], "task", new_task_name)
            doc_lines += ["\n"] + task_lines + ["\n"]
            tasks_processed.add(new_task_name)

    # blank out the imports (done after the loop, so the line numbers used above stay valid)
    for imp in doc.imports:
        for ln in range(imp.pos.line - 1, imp.pos.end_line):
            doc_lines[ln] = ""

    # render once: every entry is followed by a newline, exactly as print() would emit it
    output = "".join(f"{line}\n" for line in doc_lines)
    if args.o:
        with open(args.o, "w") as outfile:
            outfile.write(output)
    else:
        print(output, end="")
def calls(element):
    """
    Yield each Call in the workflow, including those nested within scatter/conditional sections.

    element: a node exposing ``children``; main() passes the document's workflow, and the
             recursion passes each WDL.WorkflowSection found among those children

    Calls are yielded depth-first, in the order the children are listed. Children that are
    neither a WDL.Call nor a WDL.WorkflowSection are skipped.
    """
    for ch in element.children:
        if isinstance(ch, WDL.Call):
            yield ch
        elif isinstance(ch, WDL.WorkflowSection):
            yield from calls(ch)
def rewrite_line(line, keyword, new_name, old_name="[0-9A-Za-z_\\.]+"):
    """
    Replace the name that follows ``keyword`` at the start of a source line.

    Given a line like ``call OLD_NAME {`` (optionally preceded by whitespace), return the same
    line with OLD_NAME replaced by ``new_name``. The leading whitespace, the keyword and
    everything after the old name are kept verbatim.

    line: one line of source text, without its trailing newline
    keyword: word expected at the start of the line (e.g. "call" or "task"); it is interpolated
             into the regular expression as-is
    new_name: replacement text, inserted literally
    old_name: regular expression describing the text to be replaced

    Raises AssertionError if the line isn't optional whitespace, the keyword, whitespace, and
    then something matching old_name.
    """
    # old_name is wrapped in a non-capturing group so that a pattern containing top-level
    # alternation ("a|b") stays confined to the name position
    parts = re.fullmatch(f"(?P<front>\\s*{keyword}\\s+)(?:{old_name})(?P<back>.*)", line)
    if parts is None:
        # raised explicitly (not via `assert`) so the check still runs under `python -O`
        raise AssertionError(f"{keyword} should start on its own source line: {line}")
    return parts.group("front") + new_name + parts.group("back")
def task_source_lines(task):
    """
    Return a copy of the source lines spanned by the given task.

    The first returned line starts at the task's own column (anything preceding it on that line
    is dropped) and the last one is cut at the task's end column. The task name is NOT changed
    here; the caller rewrites the first line.

    task: a node with ``pos`` (line, column, end_line, end_column) whose ``parent`` chain leads
          to the WDL.Document that contains it

    Raises AssertionError if the extracted text doesn't start with "task", doesn't span more
    than one line, or doesn't end with a closing brace.
    """
    # follow the parent links up to the document that contains the task
    task_doc = task
    while not isinstance(task_doc, WDL.Document):
        task_doc = task_doc.parent

    # split the document's source once and cache the lines on the document object, so several
    # tasks from the same document share the work
    if not hasattr(task_doc, "source_lines"):
        task_doc.source_lines = task_doc.source_text.split("\n")
    source_lines = task_doc.source_lines

    # checks below are raised explicitly (not via `assert`) so they still run under `python -O`
    pos = task.pos
    first_line = source_lines[pos.line - 1][pos.column - 1 :]
    if not first_line.startswith("task"):
        raise AssertionError("expected task source to start with the task keyword")
    if not pos.end_line > pos.line:
        raise AssertionError("expected task to span multiple lines")

    # NOTE(review): the end slice is kept as originally written; confirm whether pos.end_column
    # is inclusive or one-past-the-end, since that decides whether a character following the
    # closing brace on the same line could be copied too
    ans = (
        [first_line]
        + source_lines[pos.line : (pos.end_line - 1)]
        + [source_lines[pos.end_line - 1][: pos.end_column]]
    )
    # endswith() rather than indexing [-1], so an empty last line fails this check cleanly
    # instead of raising IndexError
    if not ans[-1].endswith("}"):
        raise AssertionError("expected task source to end with a closing brace")
    return ans
# run the command-line tool only when executed as a script, not when imported
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment