A way to use high classifier-free guidance (CFG) scales with Stable Diffusion by applying an unsharp mask to the model output, while avoiding the artifacts and excessive contrast/saturation this usually produces
# This is an abbreviated demonstration of how to perform this technique. The code
# is a simplified version of that in my own custom codebase, and can't be plugged
# into other ways of using Stable Diffusion (e.g. Diffusers or A1111) without changes.
# In essence, the observation that the CFG formula:
#
# output_noise = uncond_noise + (cond_noise - uncond_noise) * scale
#
# looks a lot like the formula for the unsharp mask, a common way to sharpen or add local contrast to images:
#
# sharpened_image = original_image + (original_image - gaussian_blurred_image) * strength
#
# led me to try applying a "reverse" unsharp mask to the conditional output of the model to try to get rid of the
# easily noticeable artifacts and exaggerated contrast and saturation the output images tend to suffer from when
# generating images with Stable Diffusion at high classifier-free guidance (CFG) scales.
#
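# (To make the connection concrete, a small sketch of the algebra: the unsharp
# mask as implemented in kornia computes unsharp(x) = x + (x - blur(x)), so the
# blend used in apply_unsharp_mask below,
#
# mixed = x + (unsharp(x) - x) * mix_factor
#       = (1 + mix_factor) * x - mix_factor * blur(x)
#
# is just a linear combination of the prediction and its Gaussian blur; this is
# also why, as noted further down, blurring the noise directly works too.)
#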
# I've previously spent quite a bit of time trying to fix this issue, see e.g. my post here:
# https://www.reddit.com/r/StableDiffusion/comments/xalo78/fixing_excessive_contrastsaturation_resulting/
# ...but the results using those techniques are far from perfect: they tend to cause output images to suffer
# from other issues like desaturation, and are not powerful enough to handle extreme CFG scales like 60 or 80.
# In contrast, this method performs a lot better, and by careful tuning of the parameters you can use very
# high CFG scales indeed without substantially degrading output image quality.
#
# One question that's important to answer preemptively is whether effectively blurring the conditional model output
# is "cheating", in the sense that the images generated using this method at high CFG scales are "effectively" generated
# at a much lower CFG scale because of the blur. It's hard to say exactly what an image generated at a high CFG scale
# "should" look like if generated using a proper, theory-backed fix to the problem at hand (which this isn't), but my
# intuition says that at high CFG scales (higher than the usual range of roughly 0.0–15.0), generated images should:
#
# A) display better prompt adherence than one generated at a lower CFG scale
# B) display less diversity, i.e. two images generated with different initial noises should be more similar
#
# I can't definitively prove that the images generated at high CFG scales using this technique display these two properties,
# but I absolutely *think* they do – you be the judge. :)
#
# A few additional notes:
#
# - applying the unsharp mask not to the predicted noise but to the predicted final latent (x0) also works (you'll need
#   to experiment with parameters, though) – I apply it to the noise because of the way my own codebase is structured.
# - since the unsharp mask is simply a convoluted way of applying a Gaussian blur, you can also simply blur the output
#   noise instead of applying an unsharp mask (see the sketch after apply_unsharp_mask below). I do it this way because
#   of the similarity between CFG and the unsharp mask noted previously.
# - I've unsuccessfully experimented with various methods to autodetect a suitable mix_factor or unsharp mask sigma
#   (by adjusting one or both based on the estimated noise level, the global contrast level or the maximum edge contrast).
#   If you figure out a smart way to do this, please tell me about it! Right now I use a simple adjustment based on
#   CFG scale, but this is definitely not optimal.
# - you still can't turn the CFG scale arbitrarily high – the exact maximum that still gives good-looking images varies
#   depending on the prompt and the number of steps, but on occasion I've successfully used CFG scales just over 100.

import torch
import kornia as KR
import k_diffusion as K

DEFAULT_MIX_FACTOR = 0.003

# The most important parameter here is mix_factor, which controls how strongly
# the unsharp-masked noise is blended into the original prediction.
def apply_unsharp_mask(pred_noise, denoising_sigma, cfg_scale, mix_factor=DEFAULT_MIX_FACTOR, kernel_size=3):
    # Heuristic: scale the unsharp mask sigma with the CFG scale, capped at 0.65
    cond_scale_factor = min(0.02 * cfg_scale, 0.65)
    usm_sigma = torch.clamp(
        1 + denoising_sigma[[0]] * cond_scale_factor,
        min=1e-6)
    sharpened = KR.filters.unsharp_mask(
        pred_noise,
        (kernel_size, kernel_size),
        (usm_sigma, usm_sigma),
        border_type='reflect'
    )
    # Blend only a small amount of the sharpened noise into the prediction
    pred_noise = pred_noise + (sharpened - pred_noise) * mix_factor
    return pred_noise
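
# As noted above, the unsharp mask is a linear combination of the noise and its
# Gaussian blur, so you can also blur the noise directly. A minimal sketch of
# that variant (a hypothetical helper, not part of the original code; the
# parameters would need re-tuning):
def apply_direct_blur(pred_noise, denoising_sigma, cfg_scale, mix_factor=DEFAULT_MIX_FACTOR, kernel_size=3):
    cond_scale_factor = min(0.02 * cfg_scale, 0.65)
    blur_sigma = torch.clamp(1 + denoising_sigma[[0]] * cond_scale_factor, min=1e-6)
    blurred = KR.filters.gaussian_blur2d(
        pred_noise,
        (kernel_size, kernel_size),
        (blur_sigma, blur_sigma),
        border_type='reflect'
    )
    # Algebraically equivalent to the unsharp-mask blend: (1 + m) * x - m * blur(x)
    return (1 + mix_factor) * pred_noise - mix_factor * blurred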

# This class is based on Katherine Crowson's DiscreteEpsDDPMDenoiser:
# https://github.com/crowsonkb/k-diffusion/blob/master/k_diffusion/external.py
class DenoiserWrapper(K.external.DiscreteSchedule):
    def __init__(self, model, quantize, cond_emb, uncond_emb, cfg_scale, unsharp_mask_mix_factor=DEFAULT_MIX_FACTOR):
        super().__init__(((1 - model.alphas_cumprod) / model.alphas_cumprod) ** 0.5, quantize)
        self.inner_model = model
        self.sigma_data = 1.
        self.cond_emb = cond_emb  # Conditional embedding
        self.uncond_emb = uncond_emb  # "Unconditional" embedding, i.e. the embedding of the empty string
        self.cfg_scale = cfg_scale
        self.unsharp_mask_mix_factor = unsharp_mask_mix_factor

    def get_scalings(self, sigma):
        c_out = -sigma
        c_in = 1 / (sigma ** 2 + self.sigma_data ** 2) ** 0.5
        return c_out, c_in

    def get_eps(self, *args, **kwargs):
        return self.inner_model.apply_model(*args, **kwargs)

    def loss(self, input, noise, sigma, **kwargs):
        c_out, c_in = [K.utils.append_dims(x, input.ndim) for x in self.get_scalings(sigma)]
        noised_input = input + noise * K.utils.append_dims(sigma, input.ndim)
        eps = self.get_eps(noised_input * c_in, self.sigma_to_t(sigma), **kwargs)
        return (eps - noise).pow(2).flatten(1).mean(1)

    def forward(self, input, sigma, **kwargs):
        c_out, c_in = [K.utils.append_dims(x, input.ndim) for x in self.get_scalings(sigma)]
        uncond_eps = self.get_eps(input * c_in, self.sigma_to_t(sigma), cond=self.uncond_emb, **kwargs)
        cond_eps = self.get_eps(input * c_in, self.sigma_to_t(sigma), cond=self.cond_emb, **kwargs)
        # Apply the unsharp mask to the conditional eps before the usual CFG combination
        cond_eps = apply_unsharp_mask(cond_eps, sigma, self.cfg_scale, mix_factor=self.unsharp_mask_mix_factor)
        eps = uncond_eps + (cond_eps - uncond_eps) * self.cfg_scale
        return input + eps * c_out
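
# A minimal sketch of how this wrapper might be wired into a k-diffusion sampler
# (hypothetical setup: `model` is an LDM-style Stable Diffusion model exposing
# .alphas_cumprod and .apply_model, and `cond`/`uncond` are text embeddings;
# adapt to your own codebase):
#
#     denoiser = DenoiserWrapper(model, quantize=True, cond_emb=cond,
#                                uncond_emb=uncond, cfg_scale=45.0)
#     sigmas = denoiser.get_sigmas(50)
#     x = torch.randn(1, 4, 64, 64, device='cuda') * sigmas[0]
#     samples = K.sampling.sample_dpmpp_2m(denoiser, x, sigmas)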
Thanks for your code. I've tried it on my use case: SD 1.4, cfg=5.0, UniPC sampler. I set usm_sigma to 2.0, since your code doesn't have a default value for it. At least so far, cfg=5.0 + unsharp hasn't solved my problem; in my case, cfg=2.5 looks more realistic than cfg=5.0. I guess this is because I fine-tuned SD 1.4 without classifier-free guidance.
Example images generated at cfg_scale = 45.0 without using this technique:

Example image generated at the same CFG scale using this technique:

All images were generated with 50 steps and the k_dpmpp_2m sampler from k-diffusion.