Yulei Qin yuleichin

Multi-node training on Slurm with PyTorch

A simple note on how to start multi-node training on the Slurm scheduler with PyTorch.
Especially useful when the scheduler is so busy that you cannot get multiple GPUs allocated, or when you need more than 4 GPUs for a single job.
Requirement: You must use PyTorch DistributedDataParallel (DDP) for this purpose.
Warning: you might need to refactor your own code.
Warning: you might be secretly condemned by your colleagues for using too many GPUs.

	git init # initialize a local git repository (create a new repository)
	git config --global user.name "xxx" # set the user name
	git config --global user.email "[email protected]" # set the email address
	git config --global color.ui true # automatically colorize the output of commands such as git status
	git config --global color.status auto
	git config --global color.diff auto
	git config --global color.branch auto
	git config --global color.interactive auto
	git config --global --unset http.proxy # remove proxy configuration on git
	git clone git+ssh://[email protected]/VT.git # clone the remote repository

	import numpy
	from scipy.ndimage.interpolation import map_coordinates
	from scipy.ndimage.filters import gaussian_filter

	def elastic_transform(image, alpha, sigma, random_state=None):
	"""Elastic deformation of images as described in [Simard2003]_.


	.. [Simard2003] Simard, Steinkraus and Platt, "Best Practices for
	Convolutional Neural Networks applied to Visual Document Analysis", in

	import numpy as np
	from scipy.ndimage.interpolation import map_coordinates
	from scipy.ndimage.filters import gaussian_filter

	def elastic_transform(image, alpha, sigma, random_state=None):
	"""Elastic deformation of images as described in [Simard2003]_.


	.. [Simard2003] Simard, Steinkraus and Platt, "Best Practices for
	Convolutional Neural Networks applied to Visual Document Analysis", in

	import torch
	import torch.nn as nn


	def log_sum_exp(x):
	# See implementation detail in
	# http://timvieira.github.io/blog/post/2014/02/11/exp-normalize-trick/
	# b is a shift factor. see link.
	# x.size() = [N, C]:
	b, _ = torch.max(x, 1)

	"""
	Functions that implement some of the same functionality found in Matlab's bwmorph.

	`thin` - was taken and adapted from https://gist.github.com/joefutrelle/562f25bbcf20691217b8
	`spur` - Not perfect but pretty close to what matlab does via LUTs
	`endpoints` - lines up perfectly with matlab's output (in my limited testing)
	`branches` - this results in more clustered pixels than matlab's version but it is pretty close
	"""
	import numpy as np
	import scipy.ndimage as ndi

	def soft_dice_loss(y_true, y_pred, epsilon=1e-6):
	'''
	Soft dice loss calculation for arbitrary batch size, number of classes, and number of spatial dimensions.
	Assumes the `channels_last` format.

	# Arguments
	y_true: b x X x Y( x Z...) x c One hot encoding of ground truth
	y_pred: b x X x Y( x Z...) x c Network output, must sum to 1 over c channel (such as after softmax)
	epsilon: Used for numerical stability to avoid divide by zero errors

	import torch
	import torch.nn as nn

	class conv_block_nested(nn.Module):

	def __init__(self, in_ch, mid_ch, out_ch):
	super(conv_block_nested, self).__init__()
	self.activation = nn.ReLU(inplace=True)
	self.conv1 = nn.Conv2d(in_ch, mid_ch, kernel_size=3, padding=1, bias=True)
	self.bn1 = nn.BatchNorm2d(mid_ch)