Verdi March verdimrc

https://developer.nvidia.com/deep-learning-performance-training-inference/training

# Additional container runtime args -- optional, on a case-by-case basis.
# The memlock/stack ulimits follow NVIDIA's recommended settings for running
# deep-learning framework containers.
declare -a CONTAINER_ARGS=(
  --gpus all
  --ipc=host
  # -1 means "unlimited"; a value of 1 would cap locked memory at almost nothing.
  --ulimit memlock=-1
  --ulimit stack=67108864
)

Slurm stuff

1. Quickrun

# Rapid test
srun --nodes 2 --ntasks-per-node 1 /usr/bin/hostname

# Move job to another partition
scontrol update job <jobid> Partition=<partition_name>

Was the enroot command run on the controller node?
What is in enroot.conf?
Instance type of the controller node
The output of df -h
The output of mount command
The output of sudo du -shc /tmp/
The output of sudo du -shc /var/lib/*
The output of docker images
Run docker system prune to clean up unused Docker caches

	/**
	 * One entry of the `checkov.skip` list that `silence_checkov` writes into a
	 * resource's CloudFormation metadata.
	 */
	interface CheckovRule {
		/** Identifier of the rule to skip (presumably a Checkov check ID -- confirm against Checkov docs). */
		id: string;
		/** Free-text justification recorded alongside the skipped rule. */
		comment: string;
	}

	/**
	 * Record Checkov skip rules in the CloudFormation metadata of a construct's
	 * default child resource.
	 *
	 * Existing metadata keys are preserved. If the resource already carries a
	 * `checkov.skip` list (e.g. from an earlier call), the new rules are appended
	 * to it instead of being silently discarded.
	 *
	 * @param construct Construct whose `node.defaultChild` is the target resource.
	 * @param rules     Rules to add under `metadata.checkov.skip`.
	 * @throws TypeError if the construct has no default child.
	 */
	function silence_checkov(construct: Construct, rules: CheckovRule[]) {
		const resource = construct.node.defaultChild as cdk.CfnResource | undefined;
		if (!resource) {
			// Fail with an actionable message rather than an opaque property-access error.
			throw new TypeError('silence_checkov: construct has no default child to attach metadata to');
		}

		// Split any pre-existing checkov entry from the remaining metadata keys.
		const { checkov: priorCheckov, ...otherMetadata } = resource.cfnOptions.metadata || {};
		const priorSkip = priorCheckov && Array.isArray(priorCheckov.skip) ? priorCheckov.skip : [];

		// Keep `checkov` as the first key so the output matches the previous layout
		// whenever there was no pre-existing checkov entry.
		resource.cfnOptions.metadata = {
			checkov: { ...priorCheckov, skip: [...priorSkip, ...rules] },
			...otherMetadata,
		};
	}

	#!/bin/bash

	# Shell options (auto-export, exit-on-error, trace) are intentionally left disabled.
	#set -aex

	# Show the directory the script is running from.
	printf 'PWD = %s\n' "$(pwd)"

	# Fall back to defaults for any setting that is unset or empty.
	[[ -n "${SM_NUM_GPUS:-}" ]] || SM_NUM_GPUS=4
	[[ -n "${MODEL_NAME:-}" ]] || MODEL_NAME=gpt2
	[[ -n "${OUTPUT_ROOT:-}" ]] || OUTPUT_ROOT=/mnt/scratch
	[[ -n "${TRAINING_JOB_NAME:-}" ]] || TRAINING_JOB_NAME=haha

	# List the contents of objects.inv as plain text to stdout.
	python -m sphinx.ext.intersphinx https://docs.python.org/3/objects.inv

	# Alternative
	pip install sphobjinv
	sphobjinv --help

	################################################################################
	# NVIDIA
	################################################################################
	# Refresh continuously at the default interval.
	nvidia-smi -l
	# Refresh every second.
	nvidia-smi -l 1
	# Every second, print per-GPU memory statistics as CSV.
	nvidia-smi -l 1 --format=csv --query-gpu=gpu_name,index,utilization.memory,memory.total,memory.reserved,memory.used,memory.free
	# Same, but also save the output to a file. The pipe must be unescaped:
	# "\|" would hand a literal "|" to nvidia-smi as an argument instead of piping.
	nvidia-smi -l 1 --format=csv --query-gpu=gpu_name,index,utilization.memory,memory.total,memory.reserved,memory.used,memory.free 2>&1 | tee /tmp/haha.txt

	# NOTE(review): this array literal is unterminated in these notes -- the
	# remaining arguments and the closing ")" are missing; complete it before use.
	declare -a ARGS=(
	-l 1

	# Miscellaneous references:
	# - https://docs.gitlab.com/ee/ci/unit_test_reports.html#python-example
	# - https://docs.gitlab.com/ee/user/project/merge_requests/code_quality.html
	# - https://stackoverflow.com/a/36358790

	# NOTES:
	# - apparently, gitlab is deprecating gitlab-runner exec
	# - alt OSS: https://github.com/firecow/gitlab-ci-local

	################################################################################

	# No isolation: don't build on a venv (because requirements*txt may be missing).
	pip install build setupext-janitor
	python3 -m build --wheel --no-isolation

	# Optional: remove build artifacts ONLY.
	VIRTUAL_ENV='' python setup.py clean --all

	# Optional: remove build artifacts PLUS the currently active virtual env.
	python setup.py clean --all

	import rich.pretty

	rich.pretty.install()

	import os

	from github3 import login

	gh = login(username="username", token=os.environ["token"])
	repo = gh.repository("username", "reponame")