Created
June 6, 2024 08:00
-
-
Save clee704/011c4abca591fdd78561f968730f231c to your computer and use it in GitHub Desktop.
rddviz.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
"""Render per-stage RDD info dumps as a Graphviz DOT graph on stdout.

Each input file is expected to be named ``stage_<N>.json`` and to hold a JSON
array of RDD info objects.  The fields read from each object are ``id``,
``name``, ``numPartitions``, ``parentIds`` and ``scope`` (itself an object
with ``id`` and ``name``).
"""
import argparse
import json
import os.path


def stage_id_from_path(path):
    """Return the integer stage id encoded in a stage file's name.

    The ``.json`` suffix and ``stage_`` prefix are stripped from the base name
    (each only if present) and the remainder is parsed as an integer.

    Raises:
        ValueError: if the remainder is not an integer.
    """
    stem = os.path.basename(path).removesuffix(".json").removeprefix("stage_")
    try:
        return int(stem)
    except ValueError:
        # Same exception type as a bare int() failure, but naming the file.
        raise ValueError(
            f"cannot derive a stage id from {path!r}; expected a name like stage_<N>.json"
        ) from None


def load_stages(paths):
    """Read the stage files and index their RDD infos.

    Returns:
        A dict mapping stage id to ``{rdd id: rdd info}``, in the order the
        paths were given.  A later file with the same stage id replaces the
        earlier entry.
    """
    rdd_infos_by_stage = {}
    for path in paths:
        stage_id = stage_id_from_path(path)
        with open(path, encoding="utf-8") as f:
            rdd_infos_by_stage[stage_id] = {info["id"]: info for info in json.load(f)}
    return rdd_infos_by_stage


def dot_escape(text, *, record=False):
    """Escape ``text`` for use inside a double-quoted DOT label.

    Backslashes and double quotes are always escaped.  With ``record=True``
    the characters ``{ } | < >`` are escaped as well, because they delimit
    fields and ports in labels of ``shape=record`` nodes.
    """
    # Backslashes first, so the escapes added below are not doubled.
    escaped = str(text).replace("\\", "\\\\").replace('"', '\\"')
    if record:
        for ch in "{}|<>":
            escaped = escaped.replace(ch, "\\" + ch)
    return escaped


def render_dot(rdd_infos_by_stage):
    """Build the DOT source for the given stages.

    Args:
        rdd_infos_by_stage: dict mapping stage id to ``{rdd id: rdd info}``,
            as returned by ``load_stages``.

    Returns:
        The DOT source as a list of lines (without trailing newlines).  Every
        stage becomes a ``cluster_stage_<id>`` subgraph holding its RDD nodes;
        inside it, each scope with at least one RDD in that stage becomes a
        ``cluster_scope_<id>`` subgraph.  One edge is emitted per parent id,
        labelled with the parent's partition count when the parent is among
        the loaded RDDs.
    """
    # Merge all stages; when an RDD id occurs in several stages the info from
    # the last stage wins.
    rdd_infos = {}
    for rdd_infos_in_stage in rdd_infos_by_stage.values():
        rdd_infos.update(rdd_infos_in_stage)

    rdd_infos_by_scope = {}
    for rdd_id, rdd_info in rdd_infos.items():
        rdd_infos_by_scope.setdefault(rdd_info["scope"]["id"], {})[rdd_id] = rdd_info

    lines = ["digraph G {", "node [shape=record, style=rounded];"]

    for stage_id, rdd_infos_in_stage in rdd_infos_by_stage.items():
        lines.append(f"subgraph cluster_stage_{stage_id} {{")
        lines.append(f'label="Stage {stage_id}"')
        for rdd_id, rdd_info in rdd_infos_in_stage.items():
            rdd_name = dot_escape(rdd_info["name"], record=True)
            lines.append(f'rdd_{rdd_id} [label="{rdd_name} ({rdd_id})"];')

        for scope_id, rdd_infos_in_scope in rdd_infos_by_scope.items():
            members = [rdd_id for rdd_id in rdd_infos_in_scope if rdd_id in rdd_infos_in_stage]
            if not members:
                # Do not emit an empty scope cluster into stages that contain
                # none of the scope's RDDs.
                continue
            # The scope name is taken from the first RDD recorded for the scope.
            scope_name = next(iter(rdd_infos_in_scope.values()))["scope"]["name"]
            lines.append(f"subgraph cluster_scope_{scope_id} {{")
            lines.append(f'label="{dot_escape(scope_name)}";')
            lines.append('style="rounded,filled";')
            lines.append("color=gray;")
            lines.extend(f"rdd_{rdd_id};" for rdd_id in members)
            lines.append("}")
        lines.append("}")

    for rdd_id, rdd_info in rdd_infos.items():
        for parent_id in rdd_info["parentIds"]:
            parent_rdd_info = rdd_infos.get(parent_id)
            if parent_rdd_info is not None:
                num_partitions = parent_rdd_info["numPartitions"]
                lines.append(f"rdd_{parent_id} -> rdd_{rdd_id} [label={num_partitions}];")
            else:
                # Parent was not in any loaded file: keep the edge, unlabelled.
                lines.append(f"rdd_{parent_id} -> rdd_{rdd_id};")

    lines.append("}")
    return lines


def main(argv=None):
    """Parse the command line, load the stage files and print the DOT graph.

    Args:
        argv: argument list to parse; ``None`` makes argparse use ``sys.argv``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("files", nargs="+", help="JSON files containing RDD infos per stage")
    args = parser.parse_args(argv)
    print("\n".join(render_dot(load_stages(args.files))))


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment