-
-
Save Birch-san/230ac46f99ec411ed5907b0a3d728efa to your computer and use it in GitHub Desktop.
from torch import FloatTensor, LongTensor, Tensor, Size, lerp, zeros_like | |
from torch.linalg import norm | |
# adapted to PyTorch from: | |
# https://gist.github.com/dvschultz/3af50c40df002da3b751efab1daddf2c | |
# most of the extra complexity is to support: | |
# - many-dimensional vectors | |
# - v0 or v1 with last dim all zeroes, or v0 ~colinear with v1 | |
# - falls back to lerp() | |
# - conditional logic implemented with parallelism rather than Python loops | |
# - many-dimensional tensor for t | |
# - you can ask for batches of slerp outputs by making t more-dimensional than the vectors | |
# - slerp( | |
# v0: torch.Size([2,3]), | |
# v1: torch.Size([2,3]), | |
# t: torch.Size([4,1,1]), | |
# ) | |
# - this makes it interface-compatible with lerp() | |
def slerp(v0: FloatTensor, v1: FloatTensor, t: float|FloatTensor, DOT_THRESHOLD=0.9995): | |
''' | |
Spherical linear interpolation | |
Args: | |
v0: Starting vector | |
v1: Final vector | |
t: Float value between 0.0 and 1.0 | |
DOT_THRESHOLD: Threshold for considering the two vectors as | |
colinear. Not recommended to alter this. | |
Returns: | |
Interpolation vector between v0 and v1 | |
''' | |
assert v0.shape == v1.shape, "shapes of v0 and v1 must match" | |
# Normalize the vectors to get the directions and angles | |
v0_norm: FloatTensor = norm(v0, dim=-1) | |
v1_norm: FloatTensor = norm(v1, dim=-1) | |
v0_normed: FloatTensor = v0 / v0_norm.unsqueeze(-1) | |
v1_normed: FloatTensor = v1 / v1_norm.unsqueeze(-1) | |
# Dot product with the normalized vectors | |
dot: FloatTensor = (v0_normed * v1_normed).sum(-1) | |
dot_mag: FloatTensor = dot.abs() | |
# if dp is NaN, it's because the v0 or v1 row was filled with 0s | |
# If absolute value of dot product is almost 1, vectors are ~colinear, so use lerp | |
gotta_lerp: LongTensor = dot_mag.isnan() | (dot_mag > DOT_THRESHOLD) | |
can_slerp: LongTensor = ~gotta_lerp | |
t_batch_dim_count: int = max(0, t.dim()-v0.dim()) if isinstance(t, Tensor) else 0 | |
t_batch_dims: Size = t.shape[:t_batch_dim_count] if isinstance(t, Tensor) else Size([]) | |
out: FloatTensor = zeros_like(v0.expand(*t_batch_dims, *[-1]*v0.dim())) | |
# if no elements are lerpable, our vectors become 0-dimensional, preventing broadcasting | |
if gotta_lerp.any(): | |
lerped: FloatTensor = lerp(v0, v1, t) | |
out: FloatTensor = lerped.where(gotta_lerp.unsqueeze(-1), out) | |
# if no elements are slerpable, our vectors become 0-dimensional, preventing broadcasting | |
if can_slerp.any(): | |
# Calculate initial angle between v0 and v1 | |
theta_0: FloatTensor = dot.arccos().unsqueeze(-1) | |
sin_theta_0: FloatTensor = theta_0.sin() | |
# Angle at timestep t | |
theta_t: FloatTensor = theta_0 * t | |
sin_theta_t: FloatTensor = theta_t.sin() | |
# Finish the slerp algorithm | |
s0: FloatTensor = (theta_0 - theta_t).sin() / sin_theta_0 | |
s1: FloatTensor = sin_theta_t / sin_theta_0 | |
slerped: FloatTensor = s0 * v0 + s1 * v1 | |
out: FloatTensor = slerped.where(can_slerp.unsqueeze(-1), out) | |
return out |
hi @Birch-san ,
thanks for the snippet.
I found it when searching for ways to make my idea work. Also kudos for your technical blog, I see you have quite some hands-on experience with SD topics.
I'd like to implement two ideas, using FLUX's VAE :
- encode an image, sample from the distributions vector, decode using the VAE and hopefully get a variation of the initial image (one that is different to a human eye while retaining the same quality)
- encode two distinct images, compute an affine combination of their latents (or slerp: that's where your snippet came in), decode and hopefully get an in-between image. (btw I found no visible difference in output image between using linear or spherical interpolation)
These would be cheap ways, resource-wise, to generate images, as they make it possible to skip the reverse diffusion process.
None of these two worked well enough for me using the FLUX (dev or schnell) VAE:
- sampling yielded a visually identical image, I guess because the samples are very close to each other in the latent space. Further perturbing the sample by more stds only introduced artefacts (the stds are very small relative to the means)
- While I did manage to get in-between images (see below), the result has artefacts, even when decoded with the VAE (thus to the full resolution)
Is this related to the fact that the KL loss weight of this VAE is relatively low, giving priority to reconstruction and therefore the interpolation might hit a no man's land in the latent space?
I'd like to understand more whether this is achievable or not and if so how.
A big thanks for your insight,
Clement
############################# results and snippet ######################################
below is my attempt at interpolating between a portrait without smile and one with smile: I do get a half smile, but when zooming in artefacts show up.
The artefacts on the interpolated image, looking like blur (here at the decoded resolution ie no upscale)
My code to get the interpolation above, using the diffusers API:
(the code for the first idea, sampling, essentially just calls latent_dist.sample())
(Note that in my actual use case, the latents will come from within the forward pass, i.e. from the call to the pipeline, as opposed to being artificially obtained by encoding an image)
import torch
from diffusers import AutoencoderKL
from diffusers.models.autoencoders.vae import DiagonalGaussianDistribution
from PIL import Image
from torchvision import transforms
from torchvision.transforms.functional import pil_to_tensor

# Placeholders: replace with the paths of the two portraits to blend.
smiling = "<path_img1>"
not_smiling = "<path_img2>"

vae: AutoencoderKL = AutoencoderKL.from_pretrained(
    "black-forest-labs/FLUX.1-dev", subfolder="vae"
)


def encode_to_latent(path):
    """Load the image at `path` and return the mode of its VAE latent distribution.

    The image is resized to 1024x1024, converted to RGB and rescaled from
    [0, 255] to [-1, 1] before encoding.
    NOTE(review): [-1, 1] is the input range used by diffusers' own image
    pre-processing; the previous version fed [0, 1] -- confirm against the
    pipeline this VAE is used with.
    """
    with Image.open(path) as src:
        img = src.resize((1024, 1024)).convert("RGB")
    # uint8 [0, 255] -> float [0, 1] -> float [-1, 1], with a batch dimension
    img_tensor = pil_to_tensor(img).unsqueeze(0) / 255.0
    img_tensor = img_tensor * 2.0 - 1.0
    img_tensor = img_tensor.to(vae.device)
    img_tensor = img_tensor.to(vae.dtype)
    # get the inferred latent distribution
    latent_dist: DiagonalGaussianDistribution = vae.encode(
        img_tensor, return_dict=False
    )[0]
    print(
        f"{latent_dist.mean.shape=} {latent_dist.std.shape=} {latent_dist.mean.mean()=} {latent_dist.std.mean()=}"
    )
    assert not latent_dist.mean.isnan().any()
    assert not latent_dist.std.isnan().any()
    assert latent_dist.deterministic is False
    # mode() rather than sample(): deterministic centre of the distribution
    return latent_dist.mode()


with torch.no_grad():
    # vae:AutoencoderKL = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix")
    vae.to(dtype=torch.float32)  # otherwise it produces NaNs, even madebyollin's VAE
    vae.to(device="cuda")
    assert vae.device == torch.device("cuda:0")
    assert vae.dtype == torch.float32

    ## 1) smile image
    latent_smiling = encode_to_latent(smiling)

    ## 2) no smile image
    latent_not_smiling = encode_to_latent(not_smiling)

    assert (latent_not_smiling != latent_smiling).any()

    # avg then decode
    t = 0.5
    avg_latent = (
        t * latent_smiling + (1 - t) * latent_not_smiling
    )  # linear interpolation
    # NOTE(review): the slerp alternative below weights its arguments the
    # other way round (t applies to the second argument); identical at t=0.5.
    # avg_latent = slerp(latent_smiling, latent_not_smiling, t)
    print(
        f"{avg_latent.shape=} {avg_latent.mean()=} {avg_latent.std()=} {avg_latent.min()=} {avg_latent.max()=}"
    )

    image = vae.decode(avg_latent).sample.squeeze(0).cpu().detach()
    # Map the decoder output from [-1, 1] back to [0, 1] and clamp: ToPILImage
    # casts float tensors to uint8, so out-of-range values would wrap around
    # and show up as artefacts.
    image = (image / 2 + 0.5).clamp(0, 1)
    image = transforms.ToPILImage()(image)
    image.save("interpolated_.png")
Example invocation: