Skip to content

Instantly share code, notes, and snippets.

@Seas0
Created July 7, 2025 06:45
Show Gist options
  • Save Seas0/26c50432c5cbab7e897aa5831968f152 to your computer and use it in GitHub Desktop.
Save Seas0/26c50432c5cbab7e897aa5831968f152 to your computer and use it in GitHub Desktop.
A simple wrapper that launches rpdb in post-mortem mode when a process crashes during parallel training
def main():
    """Program entry point (placeholder: put the real work here)."""
    ...


if __name__ == "__main__":
    # Run main() and, on an unhandled exception, open a post-mortem debugger
    # on the failing traceback before re-raising.
    #
    # Environment variables read when rpdb is available:
    #   PYTHON_RPDB_ADDR  address the remote debugger binds to (default "0.0.0.0")
    #   PYTHON_RPDB_PORT  base TCP port (default "4444")
    #   LOCAL_RANK        added to the base port so that each local process
    #                     listens on its own port (default "0")
    try:
        main()
    except Exception:
        # Imported lazily so the normal (non-failing) path pays nothing and
        # the snippet stays paste-able without touching the file's imports.
        import bdb
        import os
        import pdb
        import sys
        import traceback

        if sys.gettrace() is not None:
            # A tracer/debugger is already attached: let it handle the error.
            raise

        te, val, tb = sys.exc_info()
        if issubclass(te, bdb.BdbQuit):
            # The user quit an earlier debugger session; exit quietly.
            # sys.exit() is used because the bare exit() helper is injected
            # by the `site` module and is not guaranteed to exist.
            sys.exit()
        print(f"{te=}, {val=}")

        # EAFP: find_spec() can succeed for a package that still fails to import.
        try:
            import rpdb
        except ImportError:
            rpdb = None

        if rpdb is not None:
            # WARNING: the default "0.0.0.0" listens on every interface, and a
            # debugger session allows arbitrary code execution. Set
            # PYTHON_RPDB_ADDR=127.0.0.1 unless the network is trusted.
            addr = os.environ.get("PYTHON_RPDB_ADDR", "0.0.0.0")
            base_port = int(os.environ.get("PYTHON_RPDB_PORT", "4444"))
            local_rank = int(os.environ.get("LOCAL_RANK", "0"))
            rpdb.post_mortem(addr=addr, port=base_port + local_rank)
        else:
            # No rpdb installed: fall back to the local, stdin-based debugger.
            traceback.print_exc()
            pdb.post_mortem(tb)
        # Re-raise the original exception so the process still fails.
        raise
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment