# This is an abbreviated demonstration of how to perform this technique. The code
# is a simplified version of that in my own custom codebase, and can't be plugged
# into other ways of using Stable Diffusion (e.g. Diffusers or A1111) without changes.
#
# In essence, the technique rests on the observation that the CFG formula:
#
#     output_noise = uncond_noise + (cond_noise - uncond_noise) * scale
#
# looks a lot like the formula for the unsharp mask, a common way to sharpen or
# add local contrast to images:
#
#     sharpened = original + (original - blurred) * amount
#
# Substituting cond_noise for "original" and uncond_noise for "blurred", the two
# formulas are algebraically identical with amount = scale - 1.
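# A quick sketch to make the parallel concrete (the helper names below are
# illustrative only, not part of the original codebase):

def cfg_combine(uncond_noise, cond_noise, scale):
    # classifier-free guidance: push the conditional prediction away from
    # the unconditional one by `scale`
    return uncond_noise + (cond_noise - uncond_noise) * scale

def unsharp_mask(original, blurred, amount):
    # unsharp masking: add back `amount` times the detail lost to blurring
    return original + (original - blurred) * amount

# so that for any tensors u, c and guidance scale s:
#   cfg_combine(u, c, s) == unsharp_mask(c, u, s - 1.0)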
import torch
import numpy as np
import k_diffusion as K
from PIL import Image
from torch import autocast
from einops import rearrange, repeat


def pil_img_to_torch(pil_img, half=False):
    # scale 8-bit RGB to [0, 1], move channels first, add a batch dim, and map
    # to [-1, 1]; everything after the first line is reconstructed (the source
    # was truncated here)
    image = np.array(pil_img).astype(np.float32) / 255.0
    image = rearrange(torch.from_numpy(image), 'h w c -> c h w')
    image = image.unsqueeze(0) * 2.0 - 1.0
    return image.half() if half else image
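# Example usage (the file name is hypothetical, just for illustration):
#
#   init_image = pil_img_to_torch(Image.open("init.png"), half=True).to("cuda")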
def normalize_latent(x, max_val, quantile_val):
    # rein latents back in after a high CFG scale has exaggerated their magnitudes
    x = x.detach().clone()
    for i in range(x.shape[0]):
        # only touch samples whose values have spread beyond unit std
        if x[[i], :].std() > 1.0:
            x[[i], :] = x[[i], :] / x[[i], :].std()
            # if the quantile_val-th quantile of |x| still exceeds max_val,
            # rescale the sample so that quantile lands at max_val; the
            # maximum() clamp makes the divisor 1 (a no-op) otherwise
            s = torch.quantile(torch.abs(x[[i], :]), quantile_val)
            s = torch.maximum(s, torch.ones_like(s) * max_val)
            x[[i], :] = x[[i], :] / (s / max_val)
    return x
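# One plausible place to slot normalize_latent in (a sketch under assumptions:
# the wrapper class below and its default max_val / quantile_val are mine, not
# the original codebase's, and it presumes a model already wrapped with
# K.external.CompVisDenoiser): apply it to the combined prediction inside a
# k-diffusion-style CFG denoiser.

class NormalizedCFGDenoiser(torch.nn.Module):
    def __init__(self, model, max_val=3.0, quantile_val=0.975):
        super().__init__()
        self.inner_model = model
        self.max_val = max_val
        self.quantile_val = quantile_val

    def forward(self, x, sigma, uncond, cond, cond_scale):
        # run the unconditional and conditional branches as one batch
        x_in = torch.cat([x] * 2)
        sigma_in = torch.cat([sigma] * 2)
        cond_in = torch.cat([uncond, cond])
        uncond_out, cond_out = self.inner_model(x_in, sigma_in, cond=cond_in).chunk(2)
        # the standard CFG combine (the "unsharp mask"), then pull the
        # exaggerated magnitudes back into range
        denoised = uncond_out + (cond_out - uncond_out) * cond_scale
        return normalize_latent(denoised, self.max_val, self.quantile_val)

# Sampling with it might then look like (all names assumed, not from the gist):
#
#   model_wrap = K.external.CompVisDenoiser(sd_model)
#   denoiser = NormalizedCFGDenoiser(model_wrap)
#   samples = K.sampling.sample_lms(denoiser, x, sigmas,
#       extra_args={'uncond': uc, 'cond': c, 'cond_scale': 15.0})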