Created April 15, 2021 18:01
An experiment in overcoming overflow when using a bf16-trained model with fp16 mixed precision (a bf16-trained model produces activations too large for fp16's range).
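To make the failure mode concrete, here is a tiny illustration (not part of the original gist, just a sketch): fp16 tops out around 65504, while bf16 shares fp32's exponent range, so an activation value that is perfectly representable in bf16 turns into inf the moment it is cast to fp16.

import torch

# A value comfortably inside bf16's dynamic range but above fp16's
# maximum representable value (~65504) overflows to inf on the cast.
x_bf16 = torch.tensor(1e5, dtype=torch.bfloat16)
print(x_bf16)                    # representable in bf16 (rounds to ~99840)
print(x_bf16.to(torch.float16))  # casts to inf in fp16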
# Samyam: I have three thoughts here:
# 1) would dropping large activations push the network towards producing smaller activations? I don't know the answer, but it feels unlikely since the network is not penalized in any way for producing large activations,
# 2) dropout is meant to be used as regularization, but by dropping out only large values it introduces a bias. It may have an unexpected impact on convergence,
# 3) if 1 does not happen, then at inference time, where there is no dropout, we get the inf again
import torch

def dropout_abs_max_values(x, p=0.2):
    """ Like Dropout, but instead of random sampling this zeroes out the p fraction of entries with the largest absolute values """
    topk = int(p * x.shape[-1])
    indices = torch.topk(x.abs(), topk, dim=-1, largest=True)[1]
    return x.scatter(-1, indices, 0)

def dropout_bf16_to_fp16(x, p):
    # zero the largest-magnitude values only when the activation range is already too hot for fp16
    if max(abs(x.min()), abs(x.max())) > 1e2:
        return dropout_abs_max_values(x, p)
    else:
        return torch.nn.functional.dropout(x, p)
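A minimal usage sketch, not part of the original gist, assuming the two functions above are in scope: when an activation tensor's range is already dangerously large, the drop-in replaces the usual random dropout with targeted zeroing of the largest-magnitude entries.

import torch

torch.manual_seed(0)
# simulated oversized activations, as a bf16-trained model might produce
hidden = torch.randn(2, 8) * 300
out = dropout_bf16_to_fp16(hidden, p=0.2)
# with these values the abs-max branch triggers, so the single
# largest-magnitude entry per row (int(0.2 * 8) = 1) is zeroed
print(out)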