@sayakpaul
Last active December 9, 2024 07:35
"""
Code to demonstrate if async device transfers in `DiffusionPipeline` could be nice.
Some numbers:
A100 (80GB):
* Flux.1 Dev: 1.40x
* Flux.1 Dev ControlNet: 1.44x
* Stable Diffusion 3: 1.04x
* SDXL: 1.05x
Colab:
* SDXL: 1.22x (T4)
* SDXL ControlNet: 1.2x (L4)
Thanks to o1-mini for pairing 🍓!
"""
import torch
import torch.nn as nn
from importlib import import_module
import json
from huggingface_hub import hf_hub_download
from typing import List
from diffusers import (
    DiffusionPipeline,
    ControlNetModel,
    StableDiffusionXLControlNetPipeline,
    FluxControlNetPipeline,
    FluxControlNetModel,
)
import fire

CONTROLNET_MAPPING = {
    ControlNetModel: StableDiffusionXLControlNetPipeline,
    FluxControlNetModel: FluxControlNetPipeline,
}


class DiffusionPipelineAsync:
    def __init__(self, model_id="black-forest-labs/FLUX.1-dev", controlnet_id=None):
        if controlnet_id is None:
            self.pipeline = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.bfloat16)
        else:
            config_path = hf_hub_download(controlnet_id, filename="config.json")
            with open(config_path, "r") as f:
                controlnet_cls_name = json.load(f)["_class_name"]
            controlnet_cls = getattr(import_module("diffusers"), controlnet_cls_name)
            controlnet = controlnet_cls.from_pretrained(controlnet_id, torch_dtype=torch.bfloat16)
            self.pipeline = CONTROLNET_MAPPING[controlnet_cls].from_pretrained(
                model_id, controlnet=controlnet, torch_dtype=torch.bfloat16
            )
        self.models = [
            module for _, module in self.pipeline.components.items() if isinstance(module, torch.nn.Module)
        ]
    def _move_module_to_device(self, module: nn.Module, device: torch.device, stream: torch.cuda.Stream):
        """
        Moves a single module to the specified device using the given CUDA stream.
        """
        with torch.cuda.stream(stream):
            module.to(device, non_blocking=True)
        # Note: Synchronization is handled outside to allow overlap

    def to_cuda_async(self, device: torch.device):
        """
        Moves all child modules to the specified CUDA device using asynchronous transfers
        and multiple CUDA streams to accelerate the process.
        """
        if device.type != "cuda":
            raise ValueError("to_cuda_async should be used with a CUDA device.")
        # Ensure CUDA is available
        if not torch.cuda.is_available():
            raise RuntimeError("CUDA is not available.")

        # List of modules to move
        modules: List[nn.Module] = self.models
        # Create a CUDA stream for each module
        streams = [torch.cuda.Stream(device=device) for _ in modules]

        # Schedule each module transfer on its respective stream
        for module, stream in zip(modules, streams):
            self._move_module_to_device(module, device, stream)

        # Synchronize all streams to ensure all transfers are complete
        for stream in streams:
            stream.synchronize()
        return self

    def to_sequential(self, device: torch.device):
        """
        Moves all child modules to the specified device sequentially.
        """
        for model in self.models:
            model.to(device)
        return self

    def to_device(self, device: torch.device, use_fast_transfer: bool = False):
        """
        Configurable method to move all child modules to the specified device.
        Users can choose to use accelerated transfer with CUDA streams or standard sequential transfer.

        Parameters:
        - device (torch.device): The target device.
        - use_fast_transfer (bool): If True and device is CUDA, use accelerated transfer. Otherwise, use sequential.
        """
        if use_fast_transfer and device.type == "cuda":
            return self.to_cuda_async(device)
        else:
            return self.to_sequential(device)


def time_transfer(pipeline: DiffusionPipeline, device: torch.device, use_fast_transfer: bool):
    torch.cuda.synchronize()  # Ensure all previous CUDA operations are complete
    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)

    start_event.record()
    pipeline.to_device(device, use_fast_transfer=use_fast_transfer)
    end_event.record()

    # Wait for the events to be recorded
    torch.cuda.synchronize()
    elapsed_time_ms = start_event.elapsed_time(end_event)  # Time in milliseconds
    return elapsed_time_ms / 1000  # Convert to seconds


def main(model_id: str = "black-forest-labs/FLUX.1-dev", controlnet_id: str = None):
    # Check if CUDA is available
    if not torch.cuda.is_available():
        print("CUDA is not available on this system. Exiting the demonstration.")
        return

    cuda_device = torch.device("cuda")
    cpu_device = torch.device("cpu")

    # Clear all CUDA cache.
    torch.cuda.empty_cache()

    # Initialize the pipeline
    pipeline = DiffusionPipelineAsync(model_id=model_id, controlnet_id=controlnet_id)

    # Warm-up transfers (optional but recommended for more consistent timing)
    print("Performing warm-up transfers...")
    pipeline.to_device(cuda_device, use_fast_transfer=True)
    pipeline.to_device(cuda_device, use_fast_transfer=False)

    # Ensure models are initially on CPU
    print("Ensuring models are on CPU...")
    pipeline.to_sequential(cpu_device)

    # Measure time for accelerated transfer
    print("\nStarting accelerated CUDA transfer...")
    accelerated_time = time_transfer(pipeline, cuda_device, use_fast_transfer=True)
    print(f"Accelerated CUDA transfer completed in {accelerated_time:.4f} seconds.")

    # Verify that models are on CUDA
    print("Verifying that all models are on CUDA after accelerated transfer...")
    for module in pipeline.models:
        for param in module.parameters():
            assert param.is_cuda, f"{module.__class__.__name__} is not on CUDA."
    print("All models are successfully on CUDA after accelerated transfer.")

    # Move models back to CPU for fair comparison
    print("\nMoving models back to CPU for standard transfer...")
    pipeline.to_sequential(cpu_device)

    # Measure time for standard sequential transfer
    print("Starting standard sequential CUDA transfer...")
    sequential_time = time_transfer(pipeline, cuda_device, use_fast_transfer=False)
    print(f"Standard sequential CUDA transfer completed in {sequential_time:.4f} seconds.")

    # Verify that models are on CUDA
    print("Verifying that all models are on CUDA after standard transfer...")
    for module in pipeline.models:
        for param in module.parameters():
            assert param.is_cuda, f"{module.__class__.__name__} is not on CUDA."
    print("All models are successfully on CUDA after standard transfer.")

    # Compare the two methods
    print("\nTransfer Time Comparison:")
    print(f"Accelerated CUDA Transfer Time: {accelerated_time:.4f} seconds")
    print(f"Standard Sequential Transfer Time: {sequential_time:.4f} seconds")
    if accelerated_time > 0:
        speedup = sequential_time / accelerated_time
        print(f"Speedup for ({model_id}): {speedup:.2f}x faster using accelerated transfer.")
    else:
        print(f"Accelerated transfer time is too small to calculate speedup for {model_id}.")


if __name__ == "__main__":
    fire.Fire(main)
@gau-nernst

gau-nernst commented Nov 21, 2024

Hi @sayakpaul! Commenting here so we can see it directly. Some observations after a brief look:

  1. .to(cuda, non_blocking=True) only works when the CPU model is in pinned memory. See https://pytorch.org/docs/stable/generated/torch.Tensor.cuda.html. So the speedup for some models is a bit strange. Maybe there is some problem with the timing (or I misunderstood how non_blocking memory transfers work 🤔). Another good article is https://pytorch.org/tutorials/intermediate/pinmem_nonblock.html (and also https://docs.nvidia.com/cuda/cuda-runtime-api/api-sync-behavior.html). So perhaps the speedup is simply due to fewer explicit CUDA synchronization calls.
  2. I think you don't need 1 CUDA stream for each model. Data transfer is serialized anyway. Just 1 CUDA stream is enough. And actually, if you don't overlap compute with data transfer, I think using a separate CUDA stream in this case doesn't do anything?
  3. Now I see a potential issue with the timing. The Event.record() call might not be capturing the data transfer in your custom data transfer stream. By default it uses torch.cuda.current_stream(). Perhaps you can double check this?

To reiterate some generic pointers about CUDA data transfers and CUDA streams:

  • CPU->CUDA transfer is fast (and non-blocking with respect to the host) only if the data resides in pinned CPU memory (see the sketch right after this list). For diffusers, I think when you initialize a pipeline, the weights are memory-mapped from .safetensors files. So when doing .cuda(), whether blocking or not, you will need to copy the data to CPU memory first anyway (though I saw that you did some warmup, so after warmup the weights should be residing in CPU memory, though not pinned).
  • A CUDA stream is only useful when you want to overlap data transfer with compute (and possibly run two different computations at the same time). CPU->CUDA transfers can only move one piece of data at a time (regardless of non_blocking, pinned memory, or whatnot), so there is no point in using a CUDA stream if you don't overlap data transfer and compute.
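
To make the pinned-memory point concrete, here is a minimal, self-contained sketch (an editorial addition, not code from either gist; the tensor size and helper name are arbitrary) comparing a pageable host tensor with a pinned one when copying to the GPU with non_blocking=True:

import time
import torch

def host_to_device_seconds(host_tensor: torch.Tensor) -> float:
    """Time a host->device copy from the host's perspective."""
    torch.cuda.synchronize()
    start = time.perf_counter()
    host_tensor.to("cuda", non_blocking=True)
    torch.cuda.synchronize()  # wait until the (possibly async) copy has actually finished
    return time.perf_counter() - start

if __name__ == "__main__":
    torch.empty(1, device="cuda")  # warm up the CUDA context before timing
    pageable = torch.randn(1024, 1024, 256)             # ~1 GiB in regular pageable memory
    pinned = torch.randn(1024, 1024, 256).pin_memory()  # same size in page-locked memory
    print(f"pageable -> cuda: {host_to_device_seconds(pageable):.3f} s")
    print(f"pinned   -> cuda: {host_to_device_seconds(pinned):.3f} s")

On most machines the pinned copy should be measurably faster, and it is the only case where the copy can genuinely proceed without blocking the host.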

In my original gist, I didn't show my "pinned_memory only" implementation, but that implementation didn't use a CUDA stream. It simply uses pinned memory to speed up data transfer.

Also, I'm no expert in CUDA and this kind of stuff. So double-check everything I say 😄

@sayakpaul

Thanks so much for your feedback, I appreciate it!

From what I see, using pinned memory without any streams could potentially be more beneficial in this case? I will do some more benchmarking on this.

Now I see a potential issue with the timing. The Event.record() call might not be capturing the data transfer in your custom data transfer stream. By default it uses torch.cuda.current_stream(). Perhaps you can double check this?

Could you expand a bit more on this? What would be the right way to use record in this setup or more generally time this?

@gau-nernst

See https://pytorch.org/docs/stable/generated/torch.cuda.Event.html#torch.cuda.Event.record

I personally don't use torch.cuda.Event for timing, so I'm not sure about its interactions with CUDA streams and other stuff. Won't simply using time.perf_counter() suffice in this case (with appropriate torch.cuda.synchronize() calls)? Precise measurement is not that important here, and it may also make sense to time it from the host/CPU's perspective.
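
For reference, a host-side timing pattern along those lines might look like this (a sketch that assumes the pipeline object exposes the to_device() method from the script above; the function name is made up):

import time
import torch

def time_transfer_host_side(pipeline, device, use_fast_transfer=False):
    """Time the transfer from the host's perspective, bracketed by explicit syncs."""
    torch.cuda.synchronize()            # make sure nothing from earlier is still in flight
    start = time.perf_counter()
    pipeline.to_device(device, use_fast_transfer=use_fast_transfer)
    torch.cuda.synchronize()            # wait for all (possibly async) copies to complete
    return time.perf_counter() - start  # seconds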

@sayakpaul

Thanks.

Since precise measurement is not that important, and it may also make sense to time it from the host/CPU's perspective.

I think time_transfer() does it from the perspective of how pipeline.to() would be called by the user. So, I think the positioning of the timing code is okay? I will do it with time.perf_counter() after making the code work with pinned memory.

@gau-nernst

The current position of the timing code is fine, but the (potential) problem is with the CUDA event timing. I'm not sure it is timing what you expect: you are doing the data transfer in your own custom CUDA stream, but the CUDA event is recorded in the default stream, which isn't doing anything.
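
One way to address that (a sketch, not the original code) is to record the events on the same stream that performs the copies, e.g. by passing the stream to Event.record():

import torch

stream = torch.cuda.Stream()
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)

x = torch.randn(1024, 1024).pin_memory()  # placeholder data; the point is where record() happens

start_event.record(stream)                # record on the transfer stream, not the default one
with torch.cuda.stream(stream):
    y = x.to("cuda", non_blocking=True)
end_event.record(stream)

end_event.synchronize()                   # block until the copy and the end event are done
print(f"transfer took {start_event.elapsed_time(end_event):.3f} ms")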

@sayakpaul

You're right. I will change it.

@sayakpaul

CUDA stream is only useful when you want to overlap data transfer with compute (and possibly do 2 different computes at the same time). CPU->CUDA transfer can only be done for 1 data at a time (regardless of non_blocking, pinned_memory or whatnot), so there is no point in using CUDA stream if you don't overlap data transfer and compute.

Hey @gau-nernst! Sorry for the ping here but I wanted to know more about what you said here. In your CPU offload gist, we see benefits of CUDA streams because there's a data transfer happening for the next module/layer while the current layer is running a computation, IIUC?

@gau-nernst

@sayakpaul

In your CPU offload gist, we see benefits of CUDA streams because there's a data transfer happening for the next module/layer while the current layer is running a computation, IIUC?

Yes, that is correct
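
For readers skimming the thread, a rough sketch of that per-layer prefetch pattern (assumed names and a simplified loop, not the code from either gist) looks like this: while layer i runs on the default stream, layer i+1's weights are copied from pinned CPU memory on a side stream.

import torch
import torch.nn as nn

def run_with_layer_prefetch(layers: nn.ModuleList, x: torch.Tensor) -> torch.Tensor:
    """Run layers sequentially on the GPU while prefetching the next layer's weights.
    Assumes x is already on the GPU and each layer takes a single tensor input."""
    copy_stream = torch.cuda.Stream()
    # Keep weights on the CPU in pinned memory so the copies can be truly asynchronous
    # (parameters only, for brevity).
    for layer in layers:
        for p in layer.parameters():
            p.data = p.data.cpu().pin_memory()

    layers[0].to("cuda", non_blocking=True)
    for i, layer in enumerate(layers):
        # Start copying the next layer on the side stream while this one computes.
        if i + 1 < len(layers):
            with torch.cuda.stream(copy_stream):
                layers[i + 1].to("cuda", non_blocking=True)
        x = layer(x)
        # The default stream must see the finished copy before using the next layer.
        torch.cuda.current_stream().wait_stream(copy_stream)
        layer.to("cpu")  # offload the layer we just used to free VRAM
    return x

As noted later in the thread, this only pays off when the weights don't fit in VRAM and the transfers can actually hide behind compute.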

@sayakpaul

@gau-nernst apologies for the delay, but I managed to get some numbers from my experiments on using pin_memory().

The first experiment I did was to simply call pin_memory() on the parameters of the FluxPipeline models before calling .to("cuda") on the pipeline.

Code
from diffusers import DiffusionPipeline 
from torch.utils.benchmark import Timer
import gc
import torch 
import matplotlib.pyplot as plt

def timer(cmd):
    median = (
        Timer(cmd, globals=globals())
        .adaptive_autorange(min_run_time=1.0, max_run_time=20.0)
        .median
        * 1000
    )
    print(f"{cmd}: {median: 4.4f} ms")
    return median


def load_pipeline(pin_memory, enable_model_cpu_offload=False):
    pipeline = DiffusionPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16)

    if pin_memory:
        for name, component in pipeline.components.items():
            if isinstance(component, torch.nn.Module):
                for p in component.parameters():
                    p.data = p.data.cpu().pin_memory()
    if enable_model_cpu_offload:
        pipeline.enable_model_cpu_offload()
    else:
        pipeline = pipeline.to("cuda")
    
    return pipeline


def main():
    # Runtimes:
    pageable_to_device = timer("load_pipeline(pin_memory=False)")
    pin_mem_to_device = timer("load_pipeline(pin_memory=True)")
    
    # Ratios:
    r1 = pin_mem_to_device / pageable_to_device

    # Create a figure with the results
    fig, ax = plt.subplots()

    xlabels = [0, 1]
    bar_labels = [
        "pageable_tensor.to(device) (1x)",
        f"pageable_tensor.pin_memory().to(device) ({r1:4.2f}x)"
    ]
    values = [pageable_to_device, pin_mem_to_device]
    colors = ["tab:blue", "tab:red"]
    ax.bar(xlabels, values, label=bar_labels, color=colors)

    ax.set_ylabel("Runtime (ms)")
    ax.set_title("Device casting runtime (pin-memory)")
    ax.set_xticks([])
    ax.legend()

    # plt.show()
    plt.savefig("pin_memory_comparison.png", bbox_inches="tight", dpi=200)

    # Clear stuff
    _ = gc.collect()

if __name__ == "__main__":
    main()

And I have these numbers on an 80GB A100:

load_pipeline(pin_memory=False):  6990.8780 ms
load_pipeline(pin_memory=True):  9491.9515 ms

And then I did the same, but this time with enable_model_cpu_offload(). enable_model_cpu_offload() basically loads a model onto the GPU only when it's needed to perform computation, while the rest of the models remain on the CPU.

Code
from diffusers import DiffusionPipeline 
from torch.utils.benchmark import Timer
import gc
import torch 
import matplotlib.pyplot as plt

def timer(cmd):
    median = (
        Timer(cmd, globals=globals())
        .adaptive_autorange(min_run_time=1.0, max_run_time=20.0)
        .median
        * 1000
    )
    print(f"{cmd}: {median: 4.4f} ms")
    return median


def load_and_run_pipeline(pin_memory, enable_model_cpu_offload=False):
    pipeline = DiffusionPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16)

    if pin_memory:
        for name, component in pipeline.components.items():
            if isinstance(component, torch.nn.Module):
                for p in component.parameters():
                    p.data = p.data.cpu().pin_memory()
    
        for name, component in pipeline.components.items():
            if isinstance(component, torch.nn.Module):
                for p in component.parameters(): 
                    assert p.data.is_pinned()
    
    if enable_model_cpu_offload:
        pipeline.enable_model_cpu_offload()
    else:
        pipeline = pipeline.to("cuda")
    
    prompt = "A cat holding a sign that says hello world"
    image = pipeline(
        prompt,
        height=1024,
        width=1024,
        guidance_scale=3.5,
        num_inference_steps=50,
        max_sequence_length=512,
        generator=torch.Generator("cpu").manual_seed(0)
    ).images[0]


def main():
    # Runtimes:
    pageable_to_device = timer("load_and_run_pipeline(pin_memory=False, enable_model_cpu_offload=True)")
    pin_mem_to_device = timer("load_and_run_pipeline(pin_memory=True, enable_model_cpu_offload=True)")
    
    # Ratios:
    r1 = pin_mem_to_device / pageable_to_device

    # Create a figure with the results
    fig, ax = plt.subplots()

    xlabels = [0, 1]
    bar_labels = [
        "pageable_tensor.to(device) (1x)",
        f"pageable_tensor.pin_memory().to(device) ({r1:4.2f}x)"
    ]
    values = [pageable_to_device, pin_mem_to_device]
    colors = ["tab:blue", "tab:red"]
    ax.bar(xlabels, values, label=bar_labels, color=colors)

    ax.set_ylabel("Runtime (ms)")
    ax.set_title("Device casting runtime (pin-memory)")
    ax.set_xticks([])
    ax.legend()

    # plt.show()
    plt.savefig("mco_comparison.png", bbox_inches="tight", dpi=200)

    # Clear stuff
    _ = gc.collect()

if __name__ == "__main__":
    main()

Numbers from a 24GB 4090:

load_and_run_pipeline(pin_memory=False, enable_model_cpu_offload=True):  76443.4974 ms
load_and_run_pipeline(pin_memory=True, enable_model_cpu_offload=True):  85482.2263 ms

Are these expected results? Is the slowdown when using pin_memory() because we’re actually asking Python to execute an operation that CUDA will perform anyway before copying the data from host to device?

(A 4090 because that is where enable_model_cpu_offload()'s use is the most relevant, IMO.)

My references for the timing code come from https://pytorch.org/tutorials/intermediate/pinmem_nonblock.html.

@gau-nernst

gau-nernst commented Dec 9, 2024

  1. I think you should exclude .pin_memory() from the timing. Otherwise it will measure the extra time for the memory copy from pageable to page-locked CPU memory, which is a fixed cost.
  2. Do pipeline.enable_model_cpu_offload() and pipeline.to("cuda") call non_blocking=True internally?
  3. In my original snippet, I use per-layer offload. Thus, the CPU->GPU transfer for the next layer can start while the GPU is busy computing the current layer. This is specifically for GPUs that cannot fit the FLUX transformer (MM-DiT, 23.8 GB). For your case, since you use per-model offload and your GPU can fit the MM-DiT model, overlapping compute with data transfer won't help much: inference time is dominated by the repeated denoising steps, and the full MM-DiT model is already in memory.

@sayakpaul

I think you should exclude .pin_memory() from the timing. Otherwise it will measure the extra time for the memory copy from pageable to page-locked CPU memory, which is a fixed cost.

Makes sense. I took it out.

Do pipeline.enable_model_cpu_offload() and pipeline.to("cuda") call non_blocking=True internally?

We don't. However, I experimented with non_blocking=True while doing the direct to("cuda") placement, and here are the results:

pipeline.to('cuda'):  22.5006 ms
pinned_pipeline.to('cuda', non_blocking=False):  22.7757 ms
pinned_pipeline_non_blocking.to('cuda', non_blocking=True):  22.3797 ms

In my original snippet, I use per-layer offload. Thus, the CPU->GPU transfer for the next layer can start while the GPU is busy computing the current layer. This is specifically for GPUs that cannot fit the FLUX transformer (MM-DiT, 23.8 GB). For your case, since you use per-model offload and your GPU can fit the MM-DiT model, overlapping compute with data transfer won't help much: inference time is dominated by the repeated denoising steps, and the full MM-DiT model is already in memory.

Yes, this makes sense. However, my reasoning was that if the model is memory-pinned, there would still be an advantage in transfer speed when casting it to the GPU, but I am likely wrong here.

My updated code:

Code
from diffusers import DiffusionPipeline 
from torch.utils.benchmark import Timer
import gc
import torch 
import matplotlib.pyplot as plt

def timer(cmd, local_dict):
    median = (
        Timer(cmd, globals=local_dict)
        .adaptive_autorange(min_run_time=1.0, max_run_time=20.0)
        .median
        * 1000
    )
    print(f"{cmd}: {median: 4.4f} ms")
    return median



def main():
    pipeline = DiffusionPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16)

    # Runtimes:
    pageable_to_device = timer("pipeline.to('cuda')", locals())
    del pipeline 
    torch.cuda.empty_cache()
    gc.collect()

    # pinning memory is a fixed cost.
    pinned_pipeline = DiffusionPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16)
    for name, component in pinned_pipeline.components.items():
        if isinstance(component, torch.nn.Module):
            for p in component.parameters():
                p.data = p.data.cpu().pin_memory()
    pin_mem_to_device = timer("pinned_pipeline.to('cuda', non_blocking=False)", locals())
    del pinned_pipeline 
    torch.cuda.empty_cache()
    gc.collect()

    # pinning memory is a fixed cost.
    pinned_pipeline_non_blocking = DiffusionPipeline.from_pretrained(
        "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16
    )
    for name, component in pinned_pipeline_non_blocking.components.items():
        if isinstance(component, torch.nn.Module):
            for p in component.parameters():
                p.data = p.data.cpu().pin_memory()
    pin_mem_to_device_non_blocking = timer("pinned_pipeline_non_blocking.to('cuda', non_blocking=True)", locals())

    # Clear stuff
    _ = gc.collect()

if __name__ == "__main__":
    main()
