Skip to content

Instantly share code, notes, and snippets.

@gdlmx
Last active February 12, 2023 13:54
Show Gist options
  • Save gdlmx/92a355a0578b8d460d5ba4120437b039 to your computer and use it in GitHub Desktop.
Save gdlmx/92a355a0578b8d460d5ba4120437b039 to your computer and use it in GitHub Desktop.
Scripts for Spark on Windows
" check env variables "
# pylint: disable=invalid-name,line-too-long
import os
import pathlib
import subprocess
import logging
# Log bare messages (no level or logger-name prefix) at INFO and above.
logging.basicConfig(format='%(message)s', level=logging.INFO)

# Path templates that must resolve to existing files. Each ``{NAME}``
# placeholder is filled from the environment variable of the same name.
paths_should_exist: list[str] = [
    '{DOTNET_ROOT}/dotnet.exe',
    '{DOTNET_WORKER_DIR}/Microsoft.Spark.Worker.exe',
    '{HADOOP_HOME}/bin/winutils.exe',
    '{JAVA_HOME}/bin/java.exe',
    '{SPARK_HOME}/bin/spark-class.cmd',
    '{SPARK_CONF_DIR}/spark-defaults.conf',
    '{SPARK_CONF_DIR}/log4j2.properties',
]

# Argument prefix for running a single PowerShell command non-interactively;
# the command text itself is appended by the caller.
__ps_args_batch = (
    'powershell.exe',
    '-NoLogo',
    '-NoProfile',
    '-OutputFormat',
    'Text',
    '-Command',
)

# Short alias for the logging level used when reporting problems.
_ERROR: int = logging.ERROR
def logprint(*args, level: int = logging.INFO) -> None:
    """Log the positional arguments on one line, separated by single spaces.

    A ``%s`` placeholder is generated per argument and the arguments are
    handed to the logging module unformatted, so rendering is left to it.

    Args:
        *args: values to print, in order.
        level: logging level of the record (defaults to ``logging.INFO``).
    """
    template = ' '.join(['%s'] * len(args))
    logging.log(level, template, *args)
def get_win_env_setting(name, scope="User") -> str:
    """Read an environment variable as stored for a given Windows scope.

    Runs ``[Environment]::GetEnvironmentVariable`` in a new PowerShell
    process (started with the ``__ps_args_batch`` argument prefix) and
    returns what that process wrote to standard output.

    Args:
        name: environment variable name. It is embedded in a single-quoted
            PowerShell string literal, so any apostrophe in it is doubled to
            keep the literal well-formed.
        scope: member name of ``System.EnvironmentVariableTarget``
            (default ``"User"``); inserted into the command verbatim.

    Returns:
        The decoded standard output of the PowerShell command. It is not
        stripped here.

    Raises:
        subprocess.CalledProcessError: if PowerShell exits with a non-zero
            status.
    """
    # Inside a PowerShell single-quoted string the only escape is '' for '.
    quoted_name = str(name).replace("'", "''")
    command = (
        f"[Environment]::GetEnvironmentVariable('{quoted_name}',"
        f"[System.EnvironmentVariableTarget]::{scope})"
    )
    return subprocess.check_output([*__ps_args_batch, command]).decode()
def ls_user_env():
    """Yield process environment variables that also have a stored value.

    Every name in ``os.environ`` is looked up with ``get_win_env_setting``
    using its default scope. Names whose lookup fails, or whose stripped
    result is empty, are skipped.

    Yields:
        ``(name, process_value, stored_value)`` tuples, where
        ``process_value`` comes from ``os.environ`` and ``stored_value`` is
        the stripped result of the lookup.
    """
    for name, process_value in os.environ.items():
        try:
            stored_value: str = get_win_env_setting(name).strip()
            if not stored_value:
                continue
            yield name, process_value, stored_value
        except subprocess.CalledProcessError:
            # The PowerShell lookup failed for this name; move on to the next.
            continue
def check_user_env() -> None:
    """Print the variables reported by ``ls_user_env`` and check path-like ones.

    For each variable the process value is logged; when the stored value
    differs it is logged on a second line tagged ``[User]``. Variables whose
    name ends in ``DIR``, ``HOME`` or ``ROOT`` (last ``_``-separated part,
    case-insensitive) and whose value contains no ``;`` are then tested for
    existence on disk, and a missing path is logged at error level.
    """
    path_suffixes = ('DIR', 'HOME', 'ROOT')
    for name, value, stored_value in ls_user_env():
        logprint(name, '=', value)

        if stored_value != value:
            # '[User]' is 6 characters wide: pad it so it ends where the
            # variable name on the previous line ends.
            padding = ' ' * max(0, len(name) - 6)
            logprint(padding + '[User]', '=', stored_value)

        suffix = name.split('_')[-1].upper()
        if suffix not in path_suffixes or ";" in value:
            # Not a single-path variable (or a ';'-separated list): no check.
            continue

        if pathlib.Path(value).exists():
            logprint(f"Checked: {value}")
        else:
            logprint(f"{value} is missing (defined in {name})", level=_ERROR)
def check_spark_env() -> None:
    """Check that every template in ``paths_should_exist`` names a real path.

    Each template is filled in from ``os.environ`` and the resulting path is
    tested for existence. An existing path is logged as ``Checked: <path>``.
    Problems are logged at error level with an ``ERROR:`` prefix and the
    loop continues with the next template:

    * the path does not exist, or
    * an environment variable referenced by the template is not set.

    The existence test is a plain ``if`` rather than an ``assert`` so the
    check still runs when Python is started with ``-O``.
    """
    for template in paths_should_exist:
        try:
            resolved = pathlib.Path(template.format_map(os.environ))
        except KeyError as err:
            # A placeholder names a variable that is not defined; report it
            # instead of aborting the remaining checks with a traceback.
            logprint(
                "ERROR:",
                f"environment variable {err.args[0]} is not set (needed for {template})",
                level=_ERROR,
            )
            continue

        if resolved.exists():
            logprint('Checked:', resolved)
        else:
            logprint("ERROR:", f"{resolved} is missing", level=_ERROR)
if __name__ == "__main__":
    # Run as a script: check the required Spark paths first, then report on
    # the stored environment variables.
    check_spark_env()
    check_user_env()
# Launch spark-class.cmd in a new process, running the class
# org.apache.spark.deploy.master.Master with "--host localhost".
# NOTE(review): spark-class.cmd is given without a directory, so it
# presumably has to be resolvable from PATH - confirm.
Start-Process -FilePath 'spark-class.cmd' -ArgumentList ('org.apache.spark.deploy.master.Master','--host', "localhost")
# Launch a second spark-class.cmd process running
# org.apache.spark.deploy.worker.Worker, pointed at spark://localhost:7077.
# NOTE(review): presumably this is the master started above listening on
# its default port - confirm.
Start-Process -FilePath 'spark-class.cmd' -ArgumentList ('org.apache.spark.deploy.worker.Worker',"spark://localhost:7077" )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment