Skip to content

Instantly share code, notes, and snippets.

@eranbetzalel
Created August 12, 2025 19:21
Show Gist options
  • Save eranbetzalel/3dd96013539ff2d386e5f12d70e0b1cd to your computer and use it in GitHub Desktop.
Save eranbetzalel/3dd96013539ff2d386e5f12d70e0b1cd to your computer and use it in GitHub Desktop.
Debug PySpark executors in PyCharm with auto port discovery (no hard-coded port). Works inside mapPartitions, foreachPartition, and UDFs.

PySpark ⇄ PyCharm: Debug Inside Executors

A tiny helper to attach PyCharm’s debugger from Spark executors (e.g., mapPartitions, foreachPartition, UDFs) without hard-coding a port.

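It walks up the parent process tree, finds the `--port` argument that PyCharm’s debug server was started with, and connects the executors to that port. You can also override the host and port with the `PYCHARM_DEBUG_HOST` and `PYCHARM_DEBUG_PORT` environment variables, or by passing `host` / `port` to `debug_here`.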


Install

pip install psutil pydevd-pycharm

Minimal Example

from pyspark.sql import SparkSession
from pyspark_pycharm_executor_debugger import debug_here

spark = SparkSession.builder.getOrCreate()
# Ten integers spread over two partitions.
rdd = spark.sparkContext.parallelize(range(10), 2)


def work(it):
    """Attach the debugger, then lazily double every element of the partition."""
    # suspend=False: attach without breaking immediately.
    debug_here(suspend=False)
    return (value * 2 for value in it)


doubled = rdd.mapPartitions(work).collect()
print(doubled)
# SPDX-License-Identifier: MIT
# Debug PySpark executors in PyCharm without hard-coding a port.
# Works by discovering PyCharm's debug server port from parent process cmdlines.
from __future__ import annotations
import os
import sys
from functools import lru_cache
# psutil is optional: when it cannot be imported the name is bound to None,
# and _discover_pycharm_port checks for that before using it.
try:
    import psutil  # type: ignore
except Exception:  # pragma: no cover
    psutil = None  # graceful degradation
def _log(msg: str) -> None:
print(f"[executor-debug] {msg}")
def debug_here(host: str | None = None, port: int | None = None, suspend: bool = False) -> None:
    """Connect the calling Python process to a PyCharm debug server.

    Intended to be called from code running inside Spark executors. A
    missing ``pydevd_pycharm`` package, an undiscoverable port, or an
    ``Exception`` raised while attaching is logged via ``_log`` and the
    function simply returns.

    Args:
        host: Debug server host. Falls back to the ``PYCHARM_DEBUG_HOST``
            environment variable, then to ``'127.0.0.1'``.
        port: Debug server port. Falls back to the ``PYCHARM_DEBUG_PORT``
            environment variable, then to the ``--port`` value discovered
            on a parent process command line.
        suspend: Forwarded to ``settrace``; if True, break immediately on
            attach.
    """
    # Imported lazily so this module can be imported without the debugger
    # package being installed.
    try:
        import pydevd_pycharm  # type: ignore
    except Exception as import_error:  # pragma: no cover
        _log(f"pydevd_pycharm not available: {import_error}")
        return

    host, port = _resolve_host_port(host, port)
    if port is None:
        _log("No debug port found. Start PyCharm in Debug (it adds --port), or set PYCHARM_DEBUG_PORT.")
        return

    settrace_options = {
        "host": host,
        "port": port,
        "stdoutToServer": True,
        "stderrToServer": True,
        "suspend": suspend,
    }
    try:
        pydevd_pycharm.settrace(**settrace_options)
        # Kept inside the try block so a failure while reporting success is
        # handled the same way as a failure to attach.
        _log(f"Attached to PyCharm debug server at {host}:{port} (suspend={suspend}).")
    except Exception as attach_error:
        _log(f"Failed to attach: {attach_error}")
def _resolve_host_port(host: str | None, port: int | None) -> tuple[str, int | None]:
# Priority: explicit args → env → discovery → defaults
h = host or os.getenv("PYCHARM_DEBUG_HOST") or "127.0.0.1"
p_env = os.getenv("PYCHARM_DEBUG_PORT")
if port is not None:
return h, int(port)
if p_env:
try:
return h, int(p_env)
except ValueError:
_log(f"Invalid PYCHARM_DEBUG_PORT: {p_env}")
return h, _discover_pycharm_port()
@lru_cache(maxsize=1)
def _discover_pycharm_port(max_depth: int = 15) -> int | None:
    """Look for ``--port <N>`` on the command lines of ancestor processes.

    Starting from the current process, each ancestor's command line is
    inspected in turn; the value following the first ``--port`` token of the
    first ancestor where it parses as an integer is returned. The result
    (including None) is memoized by ``lru_cache``.

    Args:
        max_depth: Bound on the walk; at most ``max_depth + 1`` ancestors
            are inspected.

    Returns:
        The port as an int, or None when psutil is unavailable, no ancestor
        matches, or an error occurs while walking the process tree.
    """
    if psutil is None:
        _log("psutil is not installed; cannot auto-discover port. Set PYCHARM_DEBUG_PORT.")
        return None

    try:
        current = psutil.Process()
        hops = 0
        while current and hops <= max_depth:
            ancestor = current.parent()
            if not ancestor:
                # Reached the top of the process tree without a match.
                return None

            # An unreadable command line is treated as an empty one.
            try:
                argv = ancestor.cmdline() or []
            except Exception:
                argv = []

            # Typical: .../pydevd.py --port 31294 --client 127.0.0.1 ...
            if "--port" in argv:
                try:
                    value_at = argv.index("--port") + 1
                    if value_at < len(argv):
                        return int(argv[value_at])
                except Exception:
                    # Unparsable value: keep climbing.
                    pass

            current = ancestor
            hops += 1
    except Exception as walk_error:
        _log(f"Error discovering port: {walk_error}")
    return None
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment