Skip to content

Instantly share code, notes, and snippets.

@clee704
Created June 6, 2024 08:00
Show Gist options
  • Save clee704/011c4abca591fdd78561f968730f231c to your computer and use it in GitHub Desktop.
Save clee704/011c4abca591fdd78561f968730f231c to your computer and use it in GitHub Desktop.
rddviz.py
#!/bin/env python3
import argparse
import json
import os.path
# Characters that must be backslash-escaped inside a double-quoted DOT label.
_QUOTED_ESCAPES = {"\\": "\\\\", '"': '\\"'}
# Labels of ``shape=record`` nodes additionally treat these as field syntax.
_RECORD_ESCAPES = {**_QUOTED_ESCAPES, **{ch: "\\" + ch for ch in "{}|<>"}}
_QUOTED_TABLE = str.maketrans(_QUOTED_ESCAPES)
_RECORD_TABLE = str.maketrans(_RECORD_ESCAPES)


def _escape_label(text, *, record=False):
    """Return *text* made safe for use inside a double-quoted DOT label.

    With ``record=True`` the record-shape metacharacters ``{ } | < >`` are
    escaped as well, so they are shown literally instead of splitting the
    node into fields.
    """
    return str(text).translate(_RECORD_TABLE if record else _QUOTED_TABLE)


def load_rdd_infos(paths):
    """Load per-stage RDD infos from JSON files named ``stage_<N>.json``.

    Each file must contain a JSON array of objects that have an ``"id"`` key.

    Args:
        paths: Iterable of file paths. The stage id is taken from the file
            name by stripping the ``stage_`` prefix and ``.json`` suffix.

    Returns:
        A dict mapping stage id (int) to a dict of RDD id -> RDD info, in the
        order the paths were given. A later file with the same stage id
        replaces the earlier one.

    Raises:
        ValueError: If a stage id cannot be parsed from a file name.
        OSError: If a file cannot be opened.
    """
    rdd_infos_by_stage = {}
    for path in paths:
        file_name = os.path.basename(path)
        try:
            stage_id = int(file_name.removesuffix(".json").removeprefix("stage_"))
        except ValueError:
            raise ValueError(
                f"cannot derive a stage id from file name {file_name!r}; "
                "expected something like 'stage_<N>.json'"
            ) from None
        # JSON text is UTF-8; do not depend on the platform's locale encoding.
        with open(path, encoding="utf-8") as f:
            rdd_infos_by_stage[stage_id] = {info["id"]: info for info in json.load(f)}
    return rdd_infos_by_stage


def render_dot(rdd_infos_by_stage):
    """Render the RDD lineage as Graphviz DOT source.

    Every stage becomes a ``cluster_stage_<id>`` subgraph holding its RDD
    nodes. Inside a stage, RDDs that share a scope are grouped into a nested
    ``cluster_scope_<id>`` subgraph. After the clusters, one edge is emitted
    per (parent, child) pair; the edge is labelled with the parent's
    ``numPartitions`` when the parent is among the loaded RDDs.

    Args:
        rdd_infos_by_stage: Mapping of stage id -> {RDD id -> RDD info}. Each
            RDD info is a dict with ``id``, ``name``, ``parentIds`` and
            ``numPartitions`` keys, and a ``scope`` dict with ``id`` and
            ``name`` (a missing or null ``scope`` is tolerated).

    Returns:
        The DOT document as a string terminated by a newline.
    """
    # Flatten to one RDD id -> info map; when an RDD shows up in several
    # stages the info from the last stage wins but its position stays where
    # the id was first seen (regular dict.update semantics).
    rdd_infos = {}
    for stage_rdds in rdd_infos_by_stage.values():
        rdd_infos.update(stage_rdds)

    # Group RDD ids by scope, remembering the scope name of the first RDD
    # encountered for each scope; that name is used as the cluster label.
    rdd_ids_by_scope = {}
    scope_names = {}
    for rdd_id, rdd_info in rdd_infos.items():
        scope = rdd_info.get("scope")
        if not scope:
            # No scope recorded: the RDD simply stays directly in its stage.
            continue
        rdd_ids_by_scope.setdefault(scope["id"], []).append(rdd_id)
        scope_names.setdefault(scope["id"], scope["name"])

    lines = ["digraph G {", "node [shape=record, style=rounded];"]
    for stage_id, stage_rdds in rdd_infos_by_stage.items():
        lines.append(f"subgraph cluster_stage_{stage_id} {{")
        lines.append(f'label="Stage {stage_id}"')
        for rdd_id, rdd_info in stage_rdds.items():
            rdd_name = _escape_label(rdd_info["name"], record=True)
            lines.append(f'rdd_{rdd_id} [label="{rdd_name} ({rdd_id})"];')
        for scope_id, scoped_ids in rdd_ids_by_scope.items():
            members = [rdd_id for rdd_id in scoped_ids if rdd_id in stage_rdds]
            if not members:
                # This scope has no RDD in the current stage; an empty
                # cluster would contribute no nodes, so leave it out.
                continue
            scope_label = _escape_label(scope_names[scope_id])
            lines.append(f"subgraph cluster_scope_{scope_id} {{")
            lines.append(f'label="{scope_label}";')
            lines.append('style="rounded,filled";')
            lines.append("color=gray;")
            lines.extend(f"rdd_{rdd_id};" for rdd_id in members)
            lines.append("}")
        lines.append("}")

    for rdd_id, rdd_info in rdd_infos.items():
        for parent_id in rdd_info["parentIds"]:
            parent_rdd_info = rdd_infos.get(parent_id)
            if parent_rdd_info is not None:
                num_partitions = parent_rdd_info["numPartitions"]
                lines.append(f"rdd_{parent_id} -> rdd_{rdd_id} [label={num_partitions}];")
            else:
                # The parent is not in any loaded stage file, so its
                # partition count is unknown; emit an unlabelled edge.
                lines.append(f"rdd_{parent_id} -> rdd_{rdd_id};")
    lines.append("}")
    return "\n".join(lines) + "\n"


def main(argv=None):
    """Parse command-line arguments and print the DOT graph to stdout.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("files", nargs="+", help="JSON files containing RDD infos per stage")
    args = parser.parse_args(argv)
    print(render_dot(load_rdd_infos(args.files)), end="")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment