Last active
December 27, 2022 17:06
-
-
Save IzumiSatoshi/7392af83c078fa088fdc0ea12bdcaa29 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Copyright 2022 The HuggingFace Team. All rights reserved. | |
# | |
# Licensed under the Apache License, Version 2.0 (the "License"); | |
# you may not use this file except in compliance with the License. | |
# You may obtain a copy of the License at | |
# | |
# http://www.apache.org/licenses/LICENSE-2.0 | |
# | |
# Unless required by applicable law or agreed to in writing, software | |
# distributed under the License is distributed on an "AS IS" BASIS, | |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
# See the License for the specific language governing permissions and | |
# limitations under the License. | |
import inspect | |
from typing import Callable, List, Optional, Union | |
import numpy as np | |
import torch | |
import PIL | |
from diffusers.utils import is_accelerate_available | |
from packaging import version | |
from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer | |
from diffusers.configuration_utils import FrozenDict | |
from diffusers.models import AutoencoderKL, UNet2DConditionModel | |
from diffusers.pipeline_utils import DiffusionPipeline | |
from diffusers.schedulers import ( | |
DDIMScheduler, | |
DPMSolverMultistepScheduler, | |
EulerAncestralDiscreteScheduler, | |
EulerDiscreteScheduler, | |
LMSDiscreteScheduler, | |
PNDMScheduler, | |
) | |
from diffusers.utils import PIL_INTERPOLATION, deprecate, logging | |
from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput | |
from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker | |
from torchvision import transforms as tfms | |
# Module-level logger from the diffusers logging helper; the pipeline below uses it for
# its safety-checker and prompt-truncation warnings.
logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
def preprocess(image):
    """Convert the user-supplied init image(s) into a batched tensor.

    * A ``torch.Tensor`` is returned untouched.
    * A single PIL image is treated as a one-element list.
    * A list of PIL images is resized to the size of the FIRST image rounded
      down to a multiple of 32 (Lanczos resampling), stacked, converted to
      float32, rescaled from [0, 255] to [-1, 1] and moved to channels-first
      layout.  (Assumes every image converts to an H x W x C array - TODO
      confirm callers never pass single-channel images.)
    * A list of tensors is concatenated along dim 0.

    Anything else falls through and is returned unchanged.
    """
    if isinstance(image, torch.Tensor):
        return image
    if isinstance(image, PIL.Image.Image):
        image = [image]

    first = image[0]
    if isinstance(first, PIL.Image.Image):
        width, height = first.size
        # round both sides down to an integer multiple of 32
        width -= width % 32
        height -= height % 32
        frames = [
            np.array(
                picture.resize((width, height), resample=PIL_INTERPOLATION["lanczos"])
            )[None, :]
            for picture in image
        ]
        batch = np.concatenate(frames, axis=0)
        batch = np.array(batch).astype(np.float32) / 255.0
        batch = batch.transpose(0, 3, 1, 2)  # NHWC -> NCHW
        batch = 2.0 * batch - 1.0  # [0, 1] -> [-1, 1]
        return torch.from_numpy(batch)

    if isinstance(first, torch.Tensor):
        return torch.cat(image, dim=0)
    return image
class StableDiffusionImg2ImgPipeline(DiffusionPipeline):
    r"""
    Pipeline for text-guided image to image generation using Stable Diffusion.

    On top of the regular img2img flow, `__call__` accepts an optional `latent_target_image`: when it is given,
    every fifth denoising step nudges the latents along the gradient of an L1 loss between the scheduler's
    predicted clean latents and the VAE latents of that target image (scaled by `latent_loss_scale`).

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)

    Args:
        vae ([`AutoencoderKL`]):
            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
        text_encoder ([`CLIPTextModel`]):
            Frozen text-encoder. Stable Diffusion uses the text portion of
            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
        tokenizer (`CLIPTokenizer`):
            Tokenizer of class
            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
        scheduler ([`SchedulerMixin`]):
            A scheduler to be used in combination with `unet` to denoise the encoded image latents. The constructor
            is annotated for [`DDIMScheduler`], [`PNDMScheduler`], [`LMSDiscreteScheduler`],
            [`EulerDiscreteScheduler`], [`EulerAncestralDiscreteScheduler`] and [`DPMSolverMultistepScheduler`].
            The `latent_target_image` guidance additionally reads `pred_original_sample` from the scheduler's
            `step` output.
        safety_checker ([`StableDiffusionSafetyChecker`]):
            Classification module that estimates whether generated images could be considered offensive or harmful.
            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
            NOTE: `__call__` in this file never invokes `run_safety_checker`; it always reports
            `nsfw_content_detected=None`.
        feature_extractor ([`CLIPFeatureExtractor`]):
            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
    """

    # Components the constructor tolerates as `None`; presumably consumed by `DiffusionPipeline` when
    # loading/saving - confirm against the base class.
    _optional_components = ["safety_checker", "feature_extractor"]
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.__init__ | |
def __init__(
    self,
    vae: AutoencoderKL,
    text_encoder: CLIPTextModel,
    tokenizer: CLIPTokenizer,
    unet: UNet2DConditionModel,
    scheduler: Union[
        DDIMScheduler,
        PNDMScheduler,
        LMSDiscreteScheduler,
        EulerDiscreteScheduler,
        EulerAncestralDiscreteScheduler,
        DPMSolverMultistepScheduler,
    ],
    safety_checker: StableDiffusionSafetyChecker,
    feature_extractor: CLIPFeatureExtractor,
    requires_safety_checker: bool = True,
):
    """Validate the components, patch outdated configs and register everything on the pipeline.

    The constructor
      * rewrites the scheduler config in place when `steps_offset != 1` or `clip_sample is True`
        (emitting a deprecation notice each time),
      * warns when the safety checker is disabled although `requires_safety_checker` is set,
      * rewrites the UNet config to `sample_size=64` for checkpoints saved before diffusers 0.9.0
        that declare a smaller sample size,
      * registers all modules and stores `vae_scale_factor` and `requires_safety_checker`.

    Raises:
        ValueError: if a `safety_checker` is supplied without a `feature_extractor`.
    """
    super().__init__()

    if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
        deprecation_message = (
            f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
            f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
            "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
            " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
            " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
            " file"
        )
        deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
        new_config = dict(scheduler.config)
        new_config["steps_offset"] = 1
        scheduler._internal_dict = FrozenDict(new_config)

    if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
        deprecation_message = (
            f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
            " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
            " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
            " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
            " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
        )
        deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
        new_config = dict(scheduler.config)
        new_config["clip_sample"] = False
        scheduler._internal_dict = FrozenDict(new_config)

    if safety_checker is None and requires_safety_checker:
        logger.warning(
            f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
            " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
            " results in services or applications open to the public. Both the diffusers team and Hugging Face"
            " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
            " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
            " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
        )

    if safety_checker is not None and feature_extractor is None:
        # Fixed: the first fragment must be an f-string, otherwise `{self.__class__}` is printed literally.
        raise ValueError(
            f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
            " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
        )

    # Compare on the base version so dev/rc suffixes of the saved version do not affect the check.
    is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
        version.parse(unet.config._diffusers_version).base_version
    ) < version.parse("0.9.0.dev0")
    is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
    if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
        deprecation_message = (
            "The configuration file of the unet has set the default `sample_size` to smaller than"
            " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the"
            " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
            " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
            " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
            " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
            " in the config might lead to incorrect results in future versions. If you have downloaded this"
            " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
            " the `unet/config.json` file"
        )
        deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
        new_config = dict(unet.config)
        new_config["sample_size"] = 64
        unet._internal_dict = FrozenDict(new_config)

    self.register_modules(
        vae=vae,
        text_encoder=text_encoder,
        tokenizer=tokenizer,
        unet=unet,
        scheduler=scheduler,
        safety_checker=safety_checker,
        feature_extractor=feature_extractor,
    )
    # One factor of two per VAE resolution level after the first.
    self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
    self.register_to_config(requires_safety_checker=requires_safety_checker)
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_sequential_cpu_offload | |
def enable_sequential_cpu_offload(self, gpu_id=0):
    r"""
    Hand the pipeline's models to accelerate's `cpu_offload`, targeting `cuda:{gpu_id}`.

    `unet`, `text_encoder` and `vae` are offloaded whenever they are not `None`; for the safety checker only its
    `vision_model` sub-module is offloaded.

    Raises:
        ImportError: if accelerate is not installed.
    """
    if not is_accelerate_available():
        raise ImportError("Please install accelerate via `pip install accelerate`")
    from accelerate import cpu_offload

    target = torch.device(f"cuda:{gpu_id}")

    for submodel in (self.unet, self.text_encoder, self.vae):
        if submodel is None:
            continue
        cpu_offload(submodel, target)

    checker = self.safety_checker
    if checker is not None:
        # TODO(Patrick) - there is currently a bug with cpu offload of nn.Parameter in accelerate
        # work around it by only offloading the checker's vision model for now
        cpu_offload(checker.vision_model, target)
@property
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
def _execution_device(self):
    r"""
    Device on which the pipeline's models actually run.

    Normally this is just `self.device`. Only when the pipeline reports the `meta` device AND the UNet carries an
    accelerate hook (`_hf_hook`) is the device taken from the first UNet sub-module whose hook exposes a
    non-`None` `execution_device`; if none does, `self.device` is returned after all.
    """
    hooked_on_meta = self.device == torch.device("meta") and hasattr(self.unet, "_hf_hook")
    if not hooked_on_meta:
        return self.device

    for submodule in self.unet.modules():
        hook = getattr(submodule, "_hf_hook", None)
        target = getattr(hook, "execution_device", None)
        if target is not None:
            return torch.device(target)
    return self.device
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt | |
def _encode_prompt(
    self,
    prompt,
    device,
    num_images_per_prompt,
    do_classifier_free_guidance,
    negative_prompt,
):
    r"""
    Turn the prompt (and, for classifier-free guidance, the negative prompt) into text-encoder hidden states.

    Args:
        prompt (`str` or `List[str]`):
            prompt(s) to encode; a list sets the batch size, anything else counts as one prompt
        device (`torch.device`):
            device the token ids (and attention mask, when used) are moved to
        num_images_per_prompt (`int`):
            every prompt embedding is duplicated this many times
        do_classifier_free_guidance (`bool`):
            when true, unconditional embeddings are computed and placed IN FRONT of the text embeddings
        negative_prompt (`str` or `List[str]`):
            prompt(s) used for the unconditional branch; `None` means empty strings. Must have the same type
            and batch size as `prompt`. Ignored without guidance.

    Returns:
        The text embeddings, or `cat([unconditional, text])` along the batch axis when guidance is enabled.

    Raises:
        TypeError: `negative_prompt` and `prompt` have different types.
        ValueError: `negative_prompt` is a list whose length differs from the prompt batch size.
    """

    def embed(tokenized):
        # Forward an attention mask only when the text encoder's config asks for one.
        if getattr(self.text_encoder.config, "use_attention_mask", False):
            mask = tokenized.attention_mask.to(device)
        else:
            mask = None
        encoded = self.text_encoder(tokenized.input_ids.to(device), attention_mask=mask)
        return encoded[0]

    def tile(embeddings, rows):
        # mps friendly duplication: repeat along the sequence axis, then fold the copies into the batch axis
        seq_len = embeddings.shape[1]
        embeddings = embeddings.repeat(1, num_images_per_prompt, 1)
        return embeddings.view(rows * num_images_per_prompt, seq_len, -1)

    batch_size = len(prompt) if isinstance(prompt, list) else 1

    text_inputs = self.tokenizer(
        prompt,
        padding="max_length",
        max_length=self.tokenizer.model_max_length,
        truncation=True,
        return_tensors="pt",
    )
    text_input_ids = text_inputs.input_ids

    # Tokenize once more without truncation purely to tell the user what got cut off.
    untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
    was_truncated = untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
        text_input_ids, untruncated_ids
    )
    if was_truncated:
        removed_text = self.tokenizer.batch_decode(
            untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
        )
        logger.warning(
            "The following part of your input was truncated because CLIP can only handle sequences up to"
            f" {self.tokenizer.model_max_length} tokens: {removed_text}"
        )

    text_embeddings = embed(text_inputs)
    bs_embed, _, _ = text_embeddings.shape
    text_embeddings = tile(text_embeddings, bs_embed)

    if not do_classifier_free_guidance:
        return text_embeddings

    # Work out the unconditional prompts for classifier free guidance.
    if negative_prompt is None:
        uncond_tokens = [""] * batch_size
    elif type(prompt) is not type(negative_prompt):
        raise TypeError(
            f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
            f" {type(prompt)}."
        )
    elif isinstance(negative_prompt, str):
        uncond_tokens = [negative_prompt]
    elif batch_size != len(negative_prompt):
        raise ValueError(
            f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
            f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
            " the batch size of `prompt`."
        )
    else:
        uncond_tokens = negative_prompt

    uncond_input = self.tokenizer(
        uncond_tokens,
        padding="max_length",
        max_length=text_input_ids.shape[-1],
        truncation=True,
        return_tensors="pt",
    )
    uncond_embeddings = tile(embed(uncond_input), batch_size)

    # Classifier free guidance needs a conditional and an unconditional prediction; stacking both
    # embeddings into one batch lets the UNet produce them in a single forward pass.
    return torch.cat([uncond_embeddings, text_embeddings])
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker | |
def run_safety_checker(self, image, device, dtype):
    """Run the optional safety checker over `image`.

    Without a safety checker the image is handed back untouched together with `None`. Otherwise the images are
    converted with `numpy_to_pil`, fed through the feature extractor (moved to `device`, pixel values cast to
    `dtype`) and the checker's `(image, has_nsfw_concept)` result is returned.
    """
    if self.safety_checker is None:
        return image, None

    pil_batch = self.numpy_to_pil(image)
    checker_features = self.feature_extractor(pil_batch, return_tensors="pt").to(device)
    clip_input = checker_features.pixel_values.to(dtype)
    image, has_nsfw_concept = self.safety_checker(images=image, clip_input=clip_input)
    return image, has_nsfw_concept
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents | |
def decode_latents(self, latents):
    """Decode latents with the VAE into a float32 NumPy batch in [0, 1], channels last.

    The latents are divided by the 0.18215 scaling constant that `prepare_latents` multiplies in, decoded,
    mapped from [-1, 1] to [0, 1] and clamped.
    """
    unscaled = 1 / 0.18215 * latents
    decoded = self.vae.decode(unscaled).sample
    pixels = (decoded / 2 + 0.5).clamp(0, 1)
    channels_last = pixels.cpu().permute(0, 2, 3, 1)
    # always cast to float32: negligible overhead, and it keeps bfloat16 inputs convertible to NumPy
    return channels_last.float().numpy()
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs | |
def prepare_extra_step_kwargs(self, generator, eta):
    """Collect the optional keyword arguments that this scheduler's `step` understands.

    Schedulers differ in their `step` signature, so `eta` and `generator` are only forwarded when `step`
    declares a parameter of that name. `eta` corresponds to η in the DDIM paper
    (https://arxiv.org/abs/2010.02502) and should be between [0, 1].
    """
    step_parameters = inspect.signature(self.scheduler.step).parameters
    candidates = (("eta", eta), ("generator", generator))
    return {name: value for name, value in candidates if name in step_parameters}
def check_inputs(self, prompt, strength, callback_steps):
    """Validate the user-facing arguments of `__call__`.

    Args:
        prompt: must be a `str` or a `list`.
        strength: must lie in the closed interval [0, 1] (NaN is rejected as well).
        callback_steps: must be a positive `int`; `None` is rejected too.

    Raises:
        ValueError: if any of the above does not hold.
    """
    if not isinstance(prompt, (str, list)):
        raise ValueError(
            f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
        )

    # Written as a chained comparison so that NaN (which compares false both ways) is rejected here
    # instead of blowing up later in `int(num_inference_steps * strength)`.
    if not 0 <= strength <= 1:
        # Fixed: the message used to advertise the impossible range "[1.0, 1.0]".
        raise ValueError(
            f"The value of strength should in [0.0, 1.0] but is {strength}"
        )

    # `None` is not an int, so this single test also covers the explicit `is None` case.
    if not isinstance(callback_steps, int) or callback_steps <= 0:
        raise ValueError(
            f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
            f" {type(callback_steps)}."
        )
def get_timesteps(self, num_inference_steps, strength, device):
    """Select the tail of the scheduler's timesteps that img2img actually runs.

    `int(num_inference_steps * strength)` steps (capped at `num_inference_steps`) are kept from the END of
    `self.scheduler.timesteps`. `device` is accepted but not used.

    Returns:
        `(timesteps, number_of_steps_kept)`.
    """
    steps_to_run = min(int(num_inference_steps * strength), num_inference_steps)
    first_index = max(num_inference_steps - steps_to_run, 0)
    return self.scheduler.timesteps[first_index:], num_inference_steps - first_index
def prepare_latents(
    self,
    image,
    timestep,
    batch_size,
    num_images_per_prompt,
    dtype,
    device,
    generator=None,
):
    """Encode the init image with the VAE and noise the result up to `timestep`.

    Steps: move `image` to `device`/`dtype`; sample VAE latents (one generator per sample when `generator` is a
    list); scale by 0.18215; tile the latents when more samples are requested than images were given
    (deprecated behaviour); draw Gaussian noise (on CPU for `mps` devices) and let the scheduler's `add_noise`
    mix it in.

    Raises:
        ValueError: the generator list length differs from the effective batch size
            (`batch_size * num_images_per_prompt`), or the effective batch size is larger than, but not a
            multiple of, the number of encoded images.
    """
    image = image.to(device=device, dtype=dtype)
    batch_size = batch_size * num_images_per_prompt

    one_generator_per_sample = isinstance(generator, list)
    if one_generator_per_sample and len(generator) != batch_size:
        raise ValueError(
            f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
            f" size of {batch_size}. Make sure the batch size matches the length of the generators."
        )

    if one_generator_per_sample:
        per_sample = [
            self.vae.encode(image[idx : idx + 1]).latent_dist.sample(generator[idx])
            for idx in range(batch_size)
        ]
        init_latents = torch.cat(per_sample, dim=0)
    else:
        init_latents = self.vae.encode(image).latent_dist.sample(generator)
    init_latents = 0.18215 * init_latents

    if batch_size > init_latents.shape[0]:
        if batch_size % init_latents.shape[0] != 0:
            raise ValueError(
                f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
            )
        # expand init_latents for batch_size
        deprecation_message = (
            f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
            " images (`image`). Initial images are now duplicating to match the number of text prompts. Note"
            " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
            " your script to pass as many initial images as text prompts to suppress this warning."
        )
        deprecate(
            "len(prompt) != len(image)",
            "1.0.0",
            deprecation_message,
            standard_warn=False,
        )
        copies = batch_size // init_latents.shape[0]
        init_latents = torch.cat([init_latents] * copies, dim=0)
    else:
        init_latents = torch.cat([init_latents], dim=0)

    # Noise is drawn on the CPU for mps devices and moved over afterwards.
    rand_device = "cpu" if device.type == "mps" else device
    shape = init_latents.shape
    if one_generator_per_sample:
        shape = (1,) + shape[1:]
        noise_parts = [
            torch.randn(shape, generator=generator[idx], device=rand_device, dtype=dtype)
            for idx in range(batch_size)
        ]
        noise = torch.cat(noise_parts, dim=0).to(device)
    else:
        noise = torch.randn(
            shape, generator=generator, device=rand_device, dtype=dtype
        ).to(device)

    # get latents
    return self.scheduler.add_noise(init_latents, noise, timestep)
def latent_loss(self, latents, target_latents):
    """Return the mean absolute difference (L1 distance) between `latents` and `target_latents`."""
    return (latents - target_latents).abs().mean()
def latents_to_pil(self, latents):
    """Decode a batch of latents into a list of PIL images.

    The latents are divided by the 0.18215 scaling constant, decoded by the VAE without gradient tracking,
    mapped from [-1, 1] to [0, 1], clamped and quantised to uint8.

    Args:
        latents: batch of latents accepted by `self.vae.decode`.

    Returns:
        One `PIL.Image.Image` per batch entry.
    """
    # batch of latents -> list of images
    latents = (1 / 0.18215) * latents
    with torch.no_grad():
        image = self.vae.decode(latents).sample
    image = (image / 2 + 0.5).clamp(0, 1)
    # Cast to float32 before leaving torch: NumPy has no bfloat16, so `.numpy()` would otherwise raise for
    # bfloat16 pipelines. `decode_latents` applies the same cast.
    image = image.detach().cpu().permute(0, 2, 3, 1).float().numpy()
    images = (image * 255).round().astype("uint8")
    return [PIL.Image.fromarray(frame) for frame in images]
def pil_to_latent(self, input_im, device=None, dtype=None):
    """Encode a single image into scaled VAE latents.

    Previously the tensor was forced onto `"cuda"` in half precision, which failed on CPU-only machines and
    for pipelines loaded in float32, and the full tensor was printed on every call. The target device and
    dtype are now derived from the pipeline itself.

    Args:
        input_im: a PIL image (anything `torchvision.transforms.ToTensor` accepts), which is scaled to
            [-1, 1] and given a batch axis; or a `torch.Tensor`, which is used as-is. For tensors this
            mirrors how `preprocess` treats tensor inputs, i.e. the caller is assumed to have already
            produced a batched tensor in the range the VAE expects - TODO confirm for your inputs.
        device: device to run the encoder on; defaults to `self._execution_device`.
        dtype: dtype fed to the encoder; defaults to the dtype of the VAE's first parameter.

    Returns:
        `0.18215 *` a sample from the VAE's latent distribution.
    """
    if device is None:
        device = self._execution_device
    if dtype is None:
        dtype = next(self.vae.parameters()).dtype

    if isinstance(input_im, torch.Tensor):
        tensor = input_im
    else:
        # ToTensor yields values in [0, 1]; rescale to [-1, 1] and add the batch axis.
        tensor = tfms.ToTensor()(input_im).unsqueeze(0) * 2 - 1
    tensor = tensor.to(device=device, dtype=dtype)

    with torch.no_grad():
        latent = self.vae.encode(tensor)
    return 0.18215 * latent.latent_dist.sample()
def __call__(
    self,
    prompt: Union[str, List[str]],
    image: Union[torch.FloatTensor, PIL.Image.Image] = None,
    latent_target_image: Union[torch.FloatTensor, PIL.Image.Image] = None,
    strength: float = 0.8,
    latent_loss_scale: int = 100,
    num_inference_steps: Optional[int] = 50,
    guidance_scale: Optional[float] = 7.5,
    negative_prompt: Optional[Union[str, List[str]]] = None,
    num_images_per_prompt: Optional[int] = 1,
    eta: Optional[float] = 0.0,
    generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
    output_type: Optional[str] = "pil",
    return_dict: bool = True,
    callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
    callback_steps: Optional[int] = 1,
    **kwargs,
):
    r"""
    Run text-guided img2img, optionally pulling the result towards the latents of a second image.

    Args:
        prompt: prompt or list of prompts guiding the generation.
        image: init image(s); converted by the module-level `preprocess` and encoded by `prepare_latents`.
        latent_target_image: optional image encoded with `pil_to_latent`. When given, every fifth loop
            iteration (`i % 5 == 0`) takes the scheduler's predicted clean latents, computes
            `latent_loss(...) * latent_loss_scale` against the target latents and subtracts the gradient of
            that loss w.r.t. the current latents from the latents. The noise prediction is computed under
            `torch.no_grad()`, so the gradient only flows through the scheduler's direct use of the latents.
        strength: fraction of `num_inference_steps` that is actually run, in [0, 1] (see `get_timesteps`).
        latent_loss_scale: multiplier applied to the latent loss before differentiation.
        num_inference_steps: number of scheduler steps before `strength` is applied.
        guidance_scale: classifier-free guidance weight; guidance is enabled when it is `> 1.0`
            (`w` of equation (2) of the Imagen paper, https://arxiv.org/pdf/2205.11487.pdf).
        negative_prompt: prompt(s) for the unconditional branch of classifier-free guidance.
        num_images_per_prompt: number of images generated per prompt.
        eta: forwarded to the scheduler's `step` only if it accepts an `eta` argument.
        generator: generator or list of generators used for latent sampling and noise.
        output_type: accepted for API compatibility; NOT read - the result is always a list of PIL images.
        return_dict: accepted for API compatibility; NOT read - an output object is always returned.
        callback: called as `callback(i, t, latents)` on progress-bar updates where `i % callback_steps == 0`.
        callback_steps: see `callback`; must be a positive integer.
        **kwargs: accepted and ignored.

    Returns:
        `StableDiffusionPipelineOutput` whose `images` come from `latents_to_pil` and whose
        `nsfw_content_detected` is always `None` (the safety checker is not run here).

    Raises:
        ValueError: on invalid inputs (see `check_inputs`), or when `latent_target_image` is used with a
            scheduler whose `step` output carries no `pred_original_sample`.
    """
    # Local import: only needed for the side-effect-free scheduler probe in the guidance branch.
    import copy

    # 1. Check inputs
    self.check_inputs(prompt, strength, callback_steps)

    # 2. Define call parameters
    batch_size = 1 if isinstance(prompt, str) else len(prompt)
    device = self._execution_device
    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
    # corresponds to doing no classifier free guidance.
    do_classifier_free_guidance = guidance_scale > 1.0

    # 3. Encode input prompt
    with torch.no_grad():
        text_embeddings = self._encode_prompt(
            prompt,
            device,
            num_images_per_prompt,
            do_classifier_free_guidance,
            negative_prompt,
        )

    # 4. Preprocess image
    image = preprocess(image)

    # 5. set timesteps
    self.scheduler.set_timesteps(num_inference_steps, device=device)
    timesteps, num_inference_steps = self.get_timesteps(
        num_inference_steps, strength, device
    )
    latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)

    # 6. Prepare latent variables
    latents = self.prepare_latents(
        image,
        latent_timestep,
        batch_size,
        num_images_per_prompt,
        text_embeddings.dtype,
        device,
        generator,
    )

    # TODO: don't use original functions
    if latent_target_image is not None:
        target_latents = self.pil_to_latent(latent_target_image)

    # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
    extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

    # 8. Denoising loop
    num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
    with self.progress_bar(total=num_inference_steps) as progress_bar:
        for i, t in enumerate(timesteps):
            # expand the latents if we are doing classifier free guidance
            latent_model_input = (
                torch.cat([latents] * 2) if do_classifier_free_guidance else latents
            )
            latent_model_input = self.scheduler.scale_model_input(
                latent_model_input, t
            )

            # predict the noise residual
            with torch.no_grad():
                noise_pred = self.unet(
                    latent_model_input, t, encoder_hidden_states=text_embeddings
                ).sample

            # perform guidance
            if do_classifier_free_guidance:
                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                noise_pred = noise_pred_uncond + guidance_scale * (
                    noise_pred_text - noise_pred_uncond
                )

            #### ADDITIONAL GUIDANCE ###
            if i % 5 == 0 and latent_target_image is not None:
                # Requires grad on the latents
                latents = latents.detach().requires_grad_()

                # Get the predicted x0 from a throw-away copy of the scheduler. `step` can update internal
                # scheduler state (multistep schedulers keep a history of earlier outputs); probing the live
                # scheduler would register this timestep twice and skew the real update below.
                probe = copy.deepcopy(self.scheduler).step(noise_pred, t, latents)
                latents_x0 = getattr(probe, "pred_original_sample", None)
                if latents_x0 is None:
                    raise ValueError(
                        f"{self.scheduler.__class__.__name__} does not return `pred_original_sample` from"
                        " `step`, which `latent_target_image` guidance needs. Use a scheduler that does,"
                        " e.g. `DDIMScheduler`."
                    )

                # Calculate loss
                loss = self.latent_loss(latents_x0, target_latents) * latent_loss_scale

                # Report the loss of every guided step
                print(i, "loss:", loss.item())

                # Get gradient
                cond_grad = torch.autograd.grad(loss, latents)[0]

                # Modify the latents based on this gradient
                latents = latents.detach() - cond_grad  # * self.scheduler.sigmas[i] ** 2

            # compute the previous noisy sample x_t -> x_t-1
            latents = self.scheduler.step(
                noise_pred, t, latents, **extra_step_kwargs
            ).prev_sample

            # call the callback, if provided
            if i == len(timesteps) - 1 or (
                (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0
            ):
                progress_bar.update()
                if callback is not None and i % callback_steps == 0:
                    callback(i, t, latents)

    # TODO: don't use original functions
    image = self.latents_to_pil(latents)
    return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=None)
if __name__ == "__main__":
    # Demo: re-draw frame 0095 with img2img while pulling the result towards the latents of frame 0093.
    # `resize` returns a fully loaded copy, so the source files can be closed right away.
    with PIL.Image.open("/content/drive/MyDrive/data/Miku_dance/0095.png") as source_file:
        img = source_file.resize((512, 512))
    with PIL.Image.open("/content/drive/MyDrive/data/Miku_dance/0093.png") as target_file:
        latent_target_img = target_file.resize((512, 512))

    prompt = "Hatsune Miku, masterpiece, best quality, beautiful detailed, kawaii moe sexy bishoujo, 1girl, bare shoulders, upper body, dynamic pose, black outline, blush, sketch written by pencil"
    negative_prompt = "bad anatomy, mutated hands, mutated, poorly, long arm, part of head, mutation, poorly drawn, malformed, text, 3d"
    strength = 0.3
    latent_loss_scale = 3000

    # Load in half precision: the target-image encoder in this file was written for fp16 inputs
    # (see the TODO in `pil_to_latent`), and a default float32 VAE rejects half-precision tensors.
    pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
        "/content/drive/MyDrive/models/diffusers/waifu_diffusion_1_3",
        torch_dtype=torch.float16,
    ).to("cuda")

    out = pipe(
        prompt=prompt,
        negative_prompt=negative_prompt,
        image=img,
        latent_target_image=latent_target_img,
        strength=strength,
        latent_loss_scale=latent_loss_scale,
    ).images[0]
    out.save("result.png")
    print("done")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment