@IzumiSatoshi
Last active December 27, 2022 09:02

code

from diffusers import UNet2DConditionModel

# Load only the UNet from the Stable Diffusion v1.4 checkpoint.
unet = UNet2DConditionModel.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="unet")

print("unet down blocks num = ", len(unet.down_blocks))
for idx, down_block in enumerate(unet.down_blocks, 1):
    print("------", idx)
    print(down_block)

output

unet down blocks num =  4
------ 1
CrossAttnDownBlock2D(
  (attentions): ModuleList(
    (0): Transformer2DModel(
      (norm): GroupNorm(32, 320, eps=1e-06, affine=True)
      (proj_in): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1))
      (transformer_blocks): ModuleList(
        (0): BasicTransformerBlock(
          (attn1): CrossAttention(
            (to_q): Linear(in_features=320, out_features=320, bias=False)
            (to_k): Linear(in_features=320, out_features=320, bias=False)
            (to_v): Linear(in_features=320, out_features=320, bias=False)
            (to_out): ModuleList(
              (0): Linear(in_features=320, out_features=320, bias=True)
              (1): Dropout(p=0.0, inplace=False)
            )
          )
          (ff): FeedForward(
            (net): ModuleList(
              (0): GEGLU(
                (proj): Linear(in_features=320, out_features=2560, bias=True)
              )
              (1): Dropout(p=0.0, inplace=False)
              (2): Linear(in_features=1280, out_features=320, bias=True)
            )
          )
          (attn2): CrossAttention(
            (to_q): Linear(in_features=320, out_features=320, bias=False)
            (to_k): Linear(in_features=768, out_features=320, bias=False)
            (to_v): Linear(in_features=768, out_features=320, bias=False)
            (to_out): ModuleList(
              (0): Linear(in_features=320, out_features=320, bias=True)
              (1): Dropout(p=0.0, inplace=False)
            )
          )
          (norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
          (norm3): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
        )
      )
      (proj_out): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1))
    )
    (1): Transformer2DModel(
      (norm): GroupNorm(32, 320, eps=1e-06, affine=True)
      (proj_in): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1))
      (transformer_blocks): ModuleList(
        (0): BasicTransformerBlock(
          (attn1): CrossAttention(
            (to_q): Linear(in_features=320, out_features=320, bias=False)
            (to_k): Linear(in_features=320, out_features=320, bias=False)
            (to_v): Linear(in_features=320, out_features=320, bias=False)
            (to_out): ModuleList(
              (0): Linear(in_features=320, out_features=320, bias=True)
              (1): Dropout(p=0.0, inplace=False)
            )
          )
          (ff): FeedForward(
            (net): ModuleList(
              (0): GEGLU(
                (proj): Linear(in_features=320, out_features=2560, bias=True)
              )
              (1): Dropout(p=0.0, inplace=False)
              (2): Linear(in_features=1280, out_features=320, bias=True)
            )
          )
          (attn2): CrossAttention(
            (to_q): Linear(in_features=320, out_features=320, bias=False)
            (to_k): Linear(in_features=768, out_features=320, bias=False)
            (to_v): Linear(in_features=768, out_features=320, bias=False)
            (to_out): ModuleList(
              (0): Linear(in_features=320, out_features=320, bias=True)
              (1): Dropout(p=0.0, inplace=False)
            )
          )
          (norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
          (norm3): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
        )
      )
      (proj_out): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1))
    )
  )
  (resnets): ModuleList(
    (0): ResnetBlock2D(
      (norm1): GroupNorm(32, 320, eps=1e-05, affine=True)
      (conv1): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (time_emb_proj): Linear(in_features=1280, out_features=320, bias=True)
      (norm2): GroupNorm(32, 320, eps=1e-05, affine=True)
      (dropout): Dropout(p=0.0, inplace=False)
      (conv2): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (nonlinearity): SiLU()
    )
    (1): ResnetBlock2D(
      (norm1): GroupNorm(32, 320, eps=1e-05, affine=True)
      (conv1): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (time_emb_proj): Linear(in_features=1280, out_features=320, bias=True)
      (norm2): GroupNorm(32, 320, eps=1e-05, affine=True)
      (dropout): Dropout(p=0.0, inplace=False)
      (conv2): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (nonlinearity): SiLU()
    )
  )
  (downsamplers): ModuleList(
    (0): Downsample2D(
      (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    )
  )
)
------ 2
CrossAttnDownBlock2D(
  (attentions): ModuleList(
    (0): Transformer2DModel(
      (norm): GroupNorm(32, 640, eps=1e-06, affine=True)
      (proj_in): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1))
      (transformer_blocks): ModuleList(
        (0): BasicTransformerBlock(
          (attn1): CrossAttention(
            (to_q): Linear(in_features=640, out_features=640, bias=False)
            (to_k): Linear(in_features=640, out_features=640, bias=False)
            (to_v): Linear(in_features=640, out_features=640, bias=False)
            (to_out): ModuleList(
              (0): Linear(in_features=640, out_features=640, bias=True)
              (1): Dropout(p=0.0, inplace=False)
            )
          )
          (ff): FeedForward(
            (net): ModuleList(
              (0): GEGLU(
                (proj): Linear(in_features=640, out_features=5120, bias=True)
              )
              (1): Dropout(p=0.0, inplace=False)
              (2): Linear(in_features=2560, out_features=640, bias=True)
            )
          )
          (attn2): CrossAttention(
            (to_q): Linear(in_features=640, out_features=640, bias=False)
            (to_k): Linear(in_features=768, out_features=640, bias=False)
            (to_v): Linear(in_features=768, out_features=640, bias=False)
            (to_out): ModuleList(
              (0): Linear(in_features=640, out_features=640, bias=True)
              (1): Dropout(p=0.0, inplace=False)
            )
          )
          (norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
          (norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
        )
      )
      (proj_out): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1))
    )
    (1): Transformer2DModel(
      (norm): GroupNorm(32, 640, eps=1e-06, affine=True)
      (proj_in): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1))
      (transformer_blocks): ModuleList(
        (0): BasicTransformerBlock(
          (attn1): CrossAttention(
            (to_q): Linear(in_features=640, out_features=640, bias=False)
            (to_k): Linear(in_features=640, out_features=640, bias=False)
            (to_v): Linear(in_features=640, out_features=640, bias=False)
            (to_out): ModuleList(
              (0): Linear(in_features=640, out_features=640, bias=True)
              (1): Dropout(p=0.0, inplace=False)
            )
          )
          (ff): FeedForward(
            (net): ModuleList(
              (0): GEGLU(
                (proj): Linear(in_features=640, out_features=5120, bias=True)
              )
              (1): Dropout(p=0.0, inplace=False)
              (2): Linear(in_features=2560, out_features=640, bias=True)
            )
          )
          (attn2): CrossAttention(
            (to_q): Linear(in_features=640, out_features=640, bias=False)
            (to_k): Linear(in_features=768, out_features=640, bias=False)
            (to_v): Linear(in_features=768, out_features=640, bias=False)
            (to_out): ModuleList(
              (0): Linear(in_features=640, out_features=640, bias=True)
              (1): Dropout(p=0.0, inplace=False)
            )
          )
          (norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
          (norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
        )
      )
      (proj_out): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1))
    )
  )
  (resnets): ModuleList(
    (0): ResnetBlock2D(
      (norm1): GroupNorm(32, 320, eps=1e-05, affine=True)
      (conv1): Conv2d(320, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (time_emb_proj): Linear(in_features=1280, out_features=640, bias=True)
      (norm2): GroupNorm(32, 640, eps=1e-05, affine=True)
      (dropout): Dropout(p=0.0, inplace=False)
      (conv2): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (nonlinearity): SiLU()
      (conv_shortcut): Conv2d(320, 640, kernel_size=(1, 1), stride=(1, 1))
    )
    (1): ResnetBlock2D(
      (norm1): GroupNorm(32, 640, eps=1e-05, affine=True)
      (conv1): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (time_emb_proj): Linear(in_features=1280, out_features=640, bias=True)
      (norm2): GroupNorm(32, 640, eps=1e-05, affine=True)
      (dropout): Dropout(p=0.0, inplace=False)
      (conv2): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (nonlinearity): SiLU()
    )
  )
  (downsamplers): ModuleList(
    (0): Downsample2D(
      (conv): Conv2d(640, 640, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    )
  )
)
------ 3
CrossAttnDownBlock2D(
  (attentions): ModuleList(
    (0): Transformer2DModel(
      (norm): GroupNorm(32, 1280, eps=1e-06, affine=True)
      (proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1))
      (transformer_blocks): ModuleList(
        (0): BasicTransformerBlock(
          (attn1): CrossAttention(
            (to_q): Linear(in_features=1280, out_features=1280, bias=False)
            (to_k): Linear(in_features=1280, out_features=1280, bias=False)
            (to_v): Linear(in_features=1280, out_features=1280, bias=False)
            (to_out): ModuleList(
              (0): Linear(in_features=1280, out_features=1280, bias=True)
              (1): Dropout(p=0.0, inplace=False)
            )
          )
          (ff): FeedForward(
            (net): ModuleList(
              (0): GEGLU(
                (proj): Linear(in_features=1280, out_features=10240, bias=True)
              )
              (1): Dropout(p=0.0, inplace=False)
              (2): Linear(in_features=5120, out_features=1280, bias=True)
            )
          )
          (attn2): CrossAttention(
            (to_q): Linear(in_features=1280, out_features=1280, bias=False)
            (to_k): Linear(in_features=768, out_features=1280, bias=False)
            (to_v): Linear(in_features=768, out_features=1280, bias=False)
            (to_out): ModuleList(
              (0): Linear(in_features=1280, out_features=1280, bias=True)
              (1): Dropout(p=0.0, inplace=False)
            )
          )
          (norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
          (norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        )
      )
      (proj_out): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1))
    )
    (1): Transformer2DModel(
      (norm): GroupNorm(32, 1280, eps=1e-06, affine=True)
      (proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1))
      (transformer_blocks): ModuleList(
        (0): BasicTransformerBlock(
          (attn1): CrossAttention(
            (to_q): Linear(in_features=1280, out_features=1280, bias=False)
            (to_k): Linear(in_features=1280, out_features=1280, bias=False)
            (to_v): Linear(in_features=1280, out_features=1280, bias=False)
            (to_out): ModuleList(
              (0): Linear(in_features=1280, out_features=1280, bias=True)
              (1): Dropout(p=0.0, inplace=False)
            )
          )
          (ff): FeedForward(
            (net): ModuleList(
              (0): GEGLU(
                (proj): Linear(in_features=1280, out_features=10240, bias=True)
              )
              (1): Dropout(p=0.0, inplace=False)
              (2): Linear(in_features=5120, out_features=1280, bias=True)
            )
          )
          (attn2): CrossAttention(
            (to_q): Linear(in_features=1280, out_features=1280, bias=False)
            (to_k): Linear(in_features=768, out_features=1280, bias=False)
            (to_v): Linear(in_features=768, out_features=1280, bias=False)
            (to_out): ModuleList(
              (0): Linear(in_features=1280, out_features=1280, bias=True)
              (1): Dropout(p=0.0, inplace=False)
            )
          )
          (norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
          (norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        )
      )
      (proj_out): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1))
    )
  )
  (resnets): ModuleList(
    (0): ResnetBlock2D(
      (norm1): GroupNorm(32, 640, eps=1e-05, affine=True)
      (conv1): Conv2d(640, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (time_emb_proj): Linear(in_features=1280, out_features=1280, bias=True)
      (norm2): GroupNorm(32, 1280, eps=1e-05, affine=True)
      (dropout): Dropout(p=0.0, inplace=False)
      (conv2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (nonlinearity): SiLU()
      (conv_shortcut): Conv2d(640, 1280, kernel_size=(1, 1), stride=(1, 1))
    )
    (1): ResnetBlock2D(
      (norm1): GroupNorm(32, 1280, eps=1e-05, affine=True)
      (conv1): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (time_emb_proj): Linear(in_features=1280, out_features=1280, bias=True)
      (norm2): GroupNorm(32, 1280, eps=1e-05, affine=True)
      (dropout): Dropout(p=0.0, inplace=False)
      (conv2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (nonlinearity): SiLU()
    )
  )
  (downsamplers): ModuleList(
    (0): Downsample2D(
      (conv): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    )
  )
)
------ 4
DownBlock2D(
  (resnets): ModuleList(
    (0): ResnetBlock2D(
      (norm1): GroupNorm(32, 1280, eps=1e-05, affine=True)
      (conv1): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (time_emb_proj): Linear(in_features=1280, out_features=1280, bias=True)
      (norm2): GroupNorm(32, 1280, eps=1e-05, affine=True)
      (dropout): Dropout(p=0.0, inplace=False)
      (conv2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (nonlinearity): SiLU()
    )
    (1): ResnetBlock2D(
      (norm1): GroupNorm(32, 1280, eps=1e-05, affine=True)
      (conv1): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (time_emb_proj): Linear(in_features=1280, out_features=1280, bias=True)
      (norm2): GroupNorm(32, 1280, eps=1e-05, affine=True)
      (dropout): Dropout(p=0.0, inplace=False)
      (conv2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (nonlinearity): SiLU()
    )
  )
)
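
The dump shows that the first three down blocks are CrossAttnDownBlock2D at channel widths 320, 640, and 1280, each pairing two ResnetBlock2D modules with two Transformer2DModel modules whose cross-attention keys and values project from the 768-dimensional text embedding, while the fourth block is a plain DownBlock2D. As a minimal follow-up sketch (assuming the same diffusers version as above), the rest of the UNet can be walked the same way, and the standard nn.Module API gives a rough per-block parameter count:

from diffusers import UNet2DConditionModel

unet = UNet2DConditionModel.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="unet")

# The mid block and the up blocks can be printed exactly like the down blocks above.
print("unet up blocks num = ", len(unet.up_blocks))
print("mid block = ", unet.mid_block.__class__.__name__)

# Rough size of each down block via the plain PyTorch parameter count.
for idx, down_block in enumerate(unet.down_blocks, 1):
    n_params = sum(p.numel() for p in down_block.parameters())
    print(f"down block {idx}: {down_block.__class__.__name__}, {n_params / 1e6:.1f}M params")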