Created
December 20, 2023 10:30
-
-
Save HDCharles/5da8093137ea7b05dd59a4a3bb1f67a5 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/home/cdhernandez/.conda/envs/pytorch-3.10/lib/python3.10/site-packages/transformers/utils/generic.py:441: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead. | |
_torch_pytree._register_pytree_node( | |
/home/cdhernandez/.conda/envs/pytorch-3.10/lib/python3.10/site-packages/transformers/utils/generic.py:309: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead. | |
_torch_pytree._register_pytree_node( | |
/home/cdhernandez/local/diffusers/src/diffusers/utils/outputs.py:63: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead. | |
torch.utils._pytree._register_pytree_node( | |
Namespace(no_bf16=False, no_sdpa=False, batch_size=1, num_inference_steps=30, enable_fused_projections=True, upcast_vae=False, compile_unet=True, compile_vae=True, compile_mode='max-autotune', change_comp_config=True, do_quant='int8dynamic', tag='branch12-1-all') | |
Using dtype: torch.bfloat16 | |
Loading pipeline components...: 0%| | 0/7 [00:00<?, ?it/s] | |
Loading pipeline components...: 14%|█▍ | 1/7 [00:01<00:07, 1.17s/it] | |
Loading pipeline components...: 29%|██▊ | 2/7 [00:01<00:03, 1.33it/s] | |
Loading pipeline components...: 57%|█████▋ | 4/7 [00:01<00:01, 2.76it/s]/home/cdhernandez/.conda/envs/pytorch-3.10/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.17.3 and <1.25.0 is required for this version of SciPy (detected version 1.26.0 | |
warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}" | |
/home/cdhernandez/local/diffusers/src/diffusers/utils/outputs.py:63: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead. | |
torch.utils._pytree._register_pytree_node( | |
Loading pipeline components...: 71%|███████▏ | 5/7 [00:02<00:00, 3.03it/s] | |
Loading pipeline components...: 100%|██████████| 7/7 [00:02<00:00, 4.63it/s] | |
Loading pipeline components...: 100%|██████████| 7/7 [00:02<00:00, 3.01it/s] | |
Enabling fused QKV projections for both UNet and VAE. | |
Compile UNet | |
Apply quantization to UNet | |
Compile VAE | |
Apply quantization to VAE | |
AUTOTUNE convolution(2x4x128x128, 320x4x3x3) | |
triton_convolution_1 0.0418 ms 100.0% | |
triton_convolution_6 0.0446 ms 93.7% | |
triton_convolution_3 0.0511 ms 81.8% | |
triton_convolution_4 0.0520 ms 80.4% | |
triton_convolution_0 0.0576 ms 72.6% | |
triton_convolution_5 0.0636 ms 65.7% | |
triton_convolution_2 0.0969 ms 43.1% | |
convolution 0.1146 ms 36.5% | |
SingleProcess AUTOTUNE takes 3.1765 seconds | |
AUTOTUNE convolution(2x320x128x128, 320x320x3x3) | |
convolution 0.3249 ms 100.0% | |
triton_convolution_11 0.5396 ms 60.2% | |
triton_convolution_8 0.5785 ms 56.2% | |
triton_convolution_13 0.6549 ms 49.6% | |
triton_convolution_10 0.7101 ms 45.8% | |
triton_convolution_7 0.8444 ms 38.5% | |
triton_convolution_12 0.8534 ms 38.1% | |
triton_convolution_9 3.4460 ms 9.4% | |
SingleProcess AUTOTUNE takes 1.1167 seconds | |
AUTOTUNE int_mm(2x320, 320x1280, 2x1280) | |
triton_mm_19 0.0089 ms 100.0% | |
triton_mm_20 0.0091 ms 97.5% | |
triton_mm_24 0.0091 ms 97.5% | |
triton_mm_22 0.0093 ms 95.4% | |
triton_mm_18 0.0093 ms 95.2% | |
triton_mm_23 0.0103 ms 86.0% | |
triton_mm_16 0.0105 ms 84.2% | |
triton_mm_15 0.0117 ms 75.7% | |
triton_mm_17 0.0129 ms 68.7% | |
triton_mm_14 0.0134 ms 66.1% | |
SingleProcess AUTOTUNE takes 1.3878 seconds | |
AUTOTUNE int_mm(2x1280, 1280x1280, 2x1280) | |
triton_mm_35 0.0136 ms 100.0% | |
triton_mm_30 0.0158 ms 86.4% | |
triton_mm_34 0.0158 ms 86.4% | |
triton_mm_31 0.0163 ms 83.9% | |
triton_mm_33 0.0164 ms 83.2% | |
triton_mm_29 0.0170 ms 80.1% | |
triton_mm_27 0.0232 ms 58.7% | |
triton_mm_26 0.0259 ms 52.7% | |
triton_mm_25 0.0359 ms 38.0% | |
triton_mm_28 0.0363 ms 37.6% | |
SingleProcess AUTOTUNE takes 1.3811 seconds | |
AUTOTUNE int_mm(2x2816, 2816x1280, 2x1280) | |
triton_mm_46 0.0208 ms 100.0% | |
triton_mm_45 0.0235 ms 88.6% | |
triton_mm_41 0.0247 ms 84.2% | |
triton_mm_42 0.0263 ms 79.0% | |
triton_mm_44 0.0265 ms 78.4% | |
triton_mm_40 0.0288 ms 72.1% | |
triton_mm_38 0.0423 ms 49.2% | |
triton_mm_37 0.0470 ms 44.3% | |
triton_mm_36 0.0718 ms 29.0% | |
triton_mm_39 0.0729 ms 28.5% | |
SingleProcess AUTOTUNE takes 1.3752 seconds | |
AUTOTUNE int_mm(2x1280, 1280x320, 2x320) | |
triton_mm_68 0.0129 ms 100.0% | |
triton_mm_63 0.0141 ms 91.6% | |
triton_mm_64 0.0148 ms 87.3% | |
triton_mm_66 0.0151 ms 85.6% | |
triton_mm_67 0.0153 ms 84.3% | |
triton_mm_62 0.0160 ms 80.6% | |
triton_mm_60 0.0222 ms 58.2% | |
triton_mm_59 0.0242 ms 53.5% | |
triton_mm_58 0.0321 ms 40.3% | |
triton_mm_61 0.0325 ms 39.7% | |
SingleProcess AUTOTUNE takes 1.9067 seconds | |
AUTOTUNE convolution(2x320x128x128, 320x320x3x3) | |
convolution 0.0941 ms 100.0% | |
triton_convolution_105 0.1793 ms 52.5% | |
triton_convolution_102 0.1987 ms 47.4% | |
triton_convolution_104 0.2207 ms 42.7% | |
triton_convolution_107 0.2303 ms 40.9% | |
triton_convolution_106 0.3008 ms 31.3% | |
triton_convolution_101 0.3378 ms 27.9% | |
triton_convolution_103 1.2140 ms 7.8% | |
SingleProcess AUTOTUNE takes 1.0416 seconds | |
AUTOTUNE convolution(2x320x64x64, 640x320x3x3) | |
convolution 0.1487 ms 100.0% | |
triton_convolution_112 0.2756 ms 54.0% | |
triton_convolution_114 0.3167 ms 47.0% | |
triton_convolution_111 0.3210 ms 46.3% | |
triton_convolution_109 0.3650 ms 40.7% | |
triton_convolution_108 0.3677 ms 40.5% | |
triton_convolution_113 0.4048 ms 36.7% | |
triton_convolution_110 1.4703 ms 10.1% | |
SingleProcess AUTOTUNE takes 1.0584 seconds | |
AUTOTUNE int_mm(2x1280, 1280x640, 2x640) | |
triton_mm_125 0.0136 ms 100.0% | |
triton_mm_120 0.0146 ms 93.0% | |
triton_mm_123 0.0151 ms 89.8% | |
triton_mm_124 0.0155 ms 87.5% | |
triton_mm_121 0.0157 ms 86.2% | |
triton_mm_119 0.0163 ms 83.1% | |
triton_mm_117 0.0227 ms 59.8% | |
triton_mm_116 0.0249 ms 54.5% | |
triton_mm_115 0.0347 ms 39.2% | |
triton_mm_118 0.0347 ms 39.1% | |
SingleProcess AUTOTUNE takes 1.3933 seconds | |
AUTOTUNE int_mm(8192x320, 320x640, 8192x640) | |
triton_mm_134 0.0377 ms 100.0% | |
triton_mm_127 0.0423 ms 89.2% | |
triton_mm_129 0.0448 ms 84.1% | |
triton_mm_126 0.0473 ms 79.8% | |
triton_mm_135 0.0494 ms 76.4% | |
triton_mm_130 0.0518 ms 72.8% | |
triton_mm_128 0.0527 ms 71.6% | |
triton_mm_133 0.0536 ms 70.3% | |
triton_mm_132 0.0688 ms 54.9% | |
triton_mm_131 0.0753 ms 50.1% | |
SingleProcess AUTOTUNE takes 1.5121 seconds | |
AUTOTUNE convolution(2x640x64x64, 640x640x3x3) | |
convolution 0.2730 ms 100.0% | |
triton_convolution_141 0.5305 ms 51.4% | |
triton_convolution_140 0.6249 ms 43.7% | |
triton_convolution_143 0.6435 ms 42.4% | |
triton_convolution_138 0.6932 ms 39.4% | |
triton_convolution_137 0.7724 ms 35.3% | |
triton_convolution_142 0.7960 ms 34.3% | |
triton_convolution_139 2.9231 ms 9.3% | |
SingleProcess AUTOTUNE takes 1.1135 seconds | |
AUTOTUNE int_mm(8192x640, 640x640, 8192x640) | |
triton_mm_152 0.0526 ms 100.0% | |
triton_mm_153 0.0550 ms 95.8% | |
triton_mm_145 0.0570 ms 92.4% | |
triton_mm_147 0.0629 ms 83.7% | |
triton_mm_146 0.0643 ms 81.9% | |
triton_mm_144 0.0677 ms 77.8% | |
triton_mm_148 0.0693 ms 75.9% | |
triton_mm_151 0.0723 ms 72.9% | |
triton_mm_154 0.0942 ms 55.9% | |
triton_mm_150 0.1254 ms 42.0% | |
SingleProcess AUTOTUNE takes 1.3675 seconds | |
AUTOTUNE int_mm(8192x640, 640x1920, 8192x1920) | |
triton_mm_164 0.1288 ms 100.0% | |
triton_mm_156 0.1369 ms 94.1% | |
triton_mm_163 0.1411 ms 91.2% | |
triton_mm_158 0.1625 ms 79.2% | |
triton_mm_157 0.1669 ms 77.1% | |
triton_mm_155 0.1746 ms 73.8% | |
triton_mm_162 0.1783 ms 72.2% | |
triton_mm_159 0.1910 ms 67.4% | |
triton_mm_165 0.2252 ms 57.2% | |
triton_mm_161 0.3608 ms 35.7% | |
SingleProcess AUTOTUNE takes 1.4146 seconds | |
AUTOTUNE int_mm(154x2048, 2048x1280, 154x1280) | |
triton_mm_183 0.0231 ms 100.0% | |
triton_mm_185 0.0244 ms 94.6% | |
triton_mm_180 0.0345 ms 67.1% | |
triton_mm_182 0.0349 ms 66.3% | |
triton_mm_181 0.0356 ms 65.0% | |
triton_mm_178 0.0427 ms 54.2% | |
triton_mm_179 0.0436 ms 53.1% | |
triton_mm_186 0.0467 ms 49.6% | |
triton_mm_177 0.0506 ms 45.8% | |
triton_mm_187 0.0670 ms 34.5% | |
SingleProcess AUTOTUNE takes 1.5401 seconds | |
AUTOTUNE int_mm(8192x640, 640x5120, 8192x5120) | |
triton_mm_219 0.2933 ms 100.0% | |
triton_mm_211 0.3344 ms 87.7% | |
triton_mm_218 0.3532 ms 83.0% | |
triton_mm_212 0.4108 ms 71.4% | |
triton_mm_213 0.4136 ms 70.9% | |
triton_mm_210 0.4190 ms 70.0% | |
triton_mm_217 0.4348 ms 67.5% | |
triton_mm_214 0.4873 ms 60.2% | |
triton_mm_220 0.5299 ms 55.3% | |
triton_mm_216 0.9388 ms 31.2% | |
SingleProcess AUTOTUNE takes 1.4768 seconds | |
AUTOTUNE int_mm(8192x2560, 2560x640, 8192x640) | |
triton_mm_230 0.1039 ms 100.0% | |
triton_mm_229 0.1402 ms 74.1% | |
triton_mm_231 0.1450 ms 71.6% | |
triton_mm_222 0.1513 ms 68.6% | |
triton_mm_223 0.1589 ms 65.4% | |
triton_mm_224 0.1629 ms 63.8% | |
triton_mm_225 0.1655 ms 62.8% | |
triton_mm_228 0.1875 ms 55.4% | |
triton_mm_221 0.1988 ms 52.2% | |
triton_mm_227 0.4100 ms 25.3% | |
SingleProcess AUTOTUNE takes 1.4169 seconds | |
AUTOTUNE convolution(2x640x64x64, 640x640x3x3) | |
convolution 0.0981 ms 100.0% | |
triton_convolution_525 0.1550 ms 63.3% | |
triton_convolution_524 0.2222 ms 44.1% | |
triton_convolution_526 0.2342 ms 41.9% | |
triton_convolution_527 0.2379 ms 41.2% | |
triton_convolution_522 0.3626 ms 27.0% | |
triton_convolution_521 0.3796 ms 25.8% | |
triton_convolution_523 1.2408 ms 7.9% | |
SingleProcess AUTOTUNE takes 1.0429 seconds | |
AUTOTUNE convolution(2x640x32x32, 1280x640x3x3) | |
convolution 0.1744 ms 100.0% | |
triton_convolution_532 0.3172 ms 55.0% | |
triton_convolution_529 0.3567 ms 48.9% | |
triton_convolution_528 0.3946 ms 44.2% | |
triton_convolution_531 0.4126 ms 42.3% | |
triton_convolution_534 0.4216 ms 41.4% | |
triton_convolution_533 0.4371 ms 39.9% | |
triton_convolution_530 1.9595 ms 8.9% | |
SingleProcess AUTOTUNE takes 1.0733 seconds | |
AUTOTUNE int_mm(2048x640, 640x1280, 2048x1280) | |
triton_mm_554 0.0301 ms 100.0% | |
triton_mm_555 0.0307 ms 98.1% | |
triton_mm_547 0.0348 ms 86.6% | |
triton_mm_549 0.0373 ms 80.8% | |
triton_mm_548 0.0389 ms 77.4% | |
triton_mm_550 0.0398 ms 75.6% | |
triton_mm_546 0.0400 ms 75.3% | |
triton_mm_553 0.0437 ms 68.9% | |
triton_mm_556 0.0500 ms 60.2% | |
triton_mm_552 0.0671 ms 44.9% | |
SingleProcess AUTOTUNE takes 1.3871 seconds | |
AUTOTUNE convolution(2x1280x32x32, 1280x1280x3x3) | |
convolution 0.3079 ms 100.0% | |
triton_convolution_561 0.6073 ms 50.7% | |
triton_convolution_558 0.6965 ms 44.2% | |
triton_convolution_557 0.7700 ms 40.0% | |
triton_convolution_560 0.7927 ms 38.8% | |
triton_convolution_562 0.8364 ms 36.8% | |
triton_convolution_563 0.8472 ms 36.3% | |
triton_convolution_559 3.9133 ms 7.9% | |
SingleProcess AUTOTUNE takes 1.1280 seconds | |
AUTOTUNE int_mm(2048x1280, 1280x1280, 2048x1280) | |
triton_mm_573 0.0384 ms 100.0% | |
triton_mm_572 0.0451 ms 85.0% | |
triton_mm_565 0.0516 ms 74.3% | |
triton_mm_567 0.0530 ms 72.4% | |
triton_mm_566 0.0538 ms 71.4% | |
triton_mm_568 0.0546 ms 70.3% | |
triton_mm_574 0.0580 ms 66.1% | |
triton_mm_564 0.0613 ms 62.5% | |
triton_mm_571 0.0622 ms 61.6% | |
triton_mm_570 0.1146 ms 33.5% | |
SingleProcess AUTOTUNE takes 1.3678 seconds | |
AUTOTUNE int_mm(2048x1280, 1280x3840, 2048x3840) | |
triton_mm_584 0.1022 ms 100.0% | |
triton_mm_583 0.1186 ms 86.2% | |
triton_mm_576 0.1204 ms 84.9% | |
triton_mm_578 0.1333 ms 76.7% | |
triton_mm_577 0.1367 ms 74.7% | |
triton_mm_579 0.1486 ms 68.7% | |
triton_mm_575 0.1607 ms 63.6% | |
triton_mm_585 0.1610 ms 63.5% | |
triton_mm_582 0.1614 ms 63.3% | |
triton_mm_581 0.3260 ms 31.3% | |
SingleProcess AUTOTUNE takes 1.3954 seconds | |
AUTOTUNE int_mm(154x2048, 2048x2560, 154x2560) | |
triton_mm_605 0.0276 ms 100.0% | |
triton_mm_603 0.0343 ms 80.5% | |
triton_mm_600 0.0351 ms 78.6% | |
triton_mm_601 0.0365 ms 75.6% | |
triton_mm_598 0.0435 ms 63.5% | |
triton_mm_599 0.0452 ms 61.1% | |
triton_mm_606 0.0470 ms 58.7% | |
triton_mm_602 0.0481 ms 57.4% | |
triton_mm_597 0.0530 ms 52.1% | |
triton_mm_607 0.0674 ms 41.0% | |
SingleProcess AUTOTUNE takes 1.3794 seconds | |
AUTOTUNE int_mm(2048x1280, 1280x10240, 2048x10240) | |
triton_mm_639 0.2024 ms 100.0% | |
triton_mm_631 0.2907 ms 69.6% | |
triton_mm_638 0.2976 ms 68.0% | |
triton_mm_640 0.3229 ms 62.7% | |
triton_mm_632 0.3306 ms 61.2% | |
triton_mm_633 0.3386 ms 59.8% | |
triton_mm_637 0.3525 ms 57.4% | |
triton_mm_634 0.3800 ms 53.3% | |
triton_mm_630 0.3859 ms 52.5% | |
triton_mm_636 0.8457 ms 23.9% | |
SingleProcess AUTOTUNE takes 1.4646 seconds | |
AUTOTUNE int_mm(2048x5120, 5120x1280, 2048x1280) | |
triton_mm_650 0.0870 ms 100.0% | |
triton_mm_651 0.1096 ms 79.4% | |
triton_mm_649 0.1324 ms 65.7% | |
triton_mm_645 0.1573 ms 55.3% | |
triton_mm_642 0.1580 ms 55.1% | |
triton_mm_643 0.1592 ms 54.7% | |
triton_mm_644 0.1604 ms 54.3% | |
triton_mm_648 0.1761 ms 49.4% | |
triton_mm_641 0.1991 ms 43.7% | |
triton_mm_647 0.3942 ms 22.1% | |
SingleProcess AUTOTUNE takes 1.3989 seconds | |
AUTOTUNE convolution(2x2560x32x32, 1280x2560x3x3) | |
convolution 0.6057 ms 100.0% | |
triton_convolution_3019 1.2098 ms 50.1% | |
triton_convolution_3016 1.3744 ms 44.1% | |
triton_convolution_3015 1.5249 ms 39.7% | |
triton_convolution_3018 1.5631 ms 38.7% | |
triton_convolution_3020 1.6504 ms 36.7% | |
triton_convolution_3021 1.6729 ms 36.2% | |
triton_convolution_3017 7.9157 ms 7.7% | |
SingleProcess AUTOTUNE takes 1.1892 seconds | |
AUTOTUNE int_mm(2048x2560, 2560x1280, 2048x1280) | |
triton_mm_3042 0.0543 ms 100.0% | |
triton_mm_3041 0.0741 ms 73.3% | |
triton_mm_3043 0.0756 ms 71.9% | |
triton_mm_3034 0.0830 ms 65.4% | |
triton_mm_3036 0.0853 ms 63.7% | |
triton_mm_3035 0.0860 ms 63.2% | |
triton_mm_3037 0.0867 ms 62.6% | |
triton_mm_3040 0.1003 ms 54.2% | |
triton_mm_3033 0.1055 ms 51.5% | |
triton_mm_3039 0.2075 ms 26.2% | |
SingleProcess AUTOTUNE takes 1.3698 seconds | |
AUTOTUNE convolution(2x1920x32x32, 1280x1920x3x3) | |
convolution 0.4552 ms 100.0% | |
triton_convolution_4675 0.9072 ms 50.2% | |
triton_convolution_4672 1.0384 ms 43.8% | |
triton_convolution_4671 1.1436 ms 39.8% | |
triton_convolution_4674 1.1725 ms 38.8% | |
triton_convolution_4676 1.2320 ms 37.0% | |
triton_convolution_4677 1.2503 ms 36.4% | |
triton_convolution_4673 5.9112 ms 7.7% | |
SingleProcess AUTOTUNE takes 1.1602 seconds | |
AUTOTUNE int_mm(2048x1920, 1920x1280, 2048x1280) | |
triton_mm_4698 0.0464 ms 100.0% | |
triton_mm_4697 0.0592 ms 78.5% | |
triton_mm_4690 0.0655 ms 70.9% | |
triton_mm_4699 0.0674 ms 68.9% | |
triton_mm_4692 0.0678 ms 68.5% | |
triton_mm_4691 0.0702 ms 66.1% | |
triton_mm_4693 0.0706 ms 65.8% | |
triton_mm_4689 0.0797 ms 58.3% | |
triton_mm_4696 0.0813 ms 57.1% | |
triton_mm_4695 0.1601 ms 29.0% | |
SingleProcess AUTOTUNE takes 1.5055 seconds | |
AUTOTUNE convolution(2x1280x64x64, 1280x1280x3x3) | |
convolution 1.0999 ms 100.0% | |
triton_convolution_5503 1.9182 ms 57.3% | |
triton_convolution_5500 2.1292 ms 51.7% | |
triton_convolution_5502 2.3252 ms 47.3% | |
triton_convolution_5499 2.3532 ms 46.7% | |
triton_convolution_5504 2.4464 ms 45.0% | |
triton_convolution_5505 2.4526 ms 44.8% | |
triton_convolution_5501 15.6338 ms 7.0% | |
SingleProcess AUTOTUNE takes 1.2599 seconds | |
AUTOTUNE convolution(2x1920x64x64, 640x1920x3x3) | |
convolution 0.8206 ms 100.0% | |
triton_convolution_5510 1.7079 ms 48.0% | |
triton_convolution_5509 1.8124 ms 45.3% | |
triton_convolution_5512 1.9461 ms 42.2% | |
triton_convolution_5507 2.1385 ms 38.4% | |
triton_convolution_5506 2.3009 ms 35.7% | |
triton_convolution_5511 2.3164 ms 35.4% | |
triton_convolution_5508 12.5700 ms 6.5% | |
SingleProcess AUTOTUNE takes 1.2300 seconds | |
AUTOTUNE int_mm(8192x1920, 1920x640, 8192x640) | |
triton_mm_5533 0.0874 ms 100.0% | |
triton_mm_5532 0.1096 ms 79.7% | |
triton_mm_5525 0.1211 ms 72.2% | |
triton_mm_5534 0.1279 ms 68.3% | |
triton_mm_5526 0.1280 ms 68.3% | |
triton_mm_5527 0.1299 ms 67.3% | |
triton_mm_5528 0.1332 ms 65.6% | |
triton_mm_5531 0.1506 ms 58.0% | |
triton_mm_5524 0.1608 ms 54.3% | |
triton_mm_5530 0.3124 ms 28.0% | |
SingleProcess AUTOTUNE takes 1.5257 seconds | |
AUTOTUNE convolution(2x1280x64x64, 640x1280x3x3) | |
convolution 0.5439 ms 100.0% | |
triton_convolution_5722 1.0597 ms 51.3% | |
triton_convolution_5721 1.1823 ms 46.0% | |
triton_convolution_5724 1.2569 ms 43.3% | |
triton_convolution_5719 1.3822 ms 39.3% | |
triton_convolution_5718 1.5203 ms 35.8% | |
triton_convolution_5723 1.5870 ms 34.3% | |
triton_convolution_5720 7.8675 ms 6.9% | |
SingleProcess AUTOTUNE takes 1.1911 seconds | |
AUTOTUNE int_mm(8192x1280, 1280x640, 8192x640) | |
triton_mm_5745 0.0711 ms 100.0% | |
triton_mm_5744 0.0818 ms 86.9% | |
triton_mm_5737 0.0895 ms 79.5% | |
triton_mm_5739 0.0961 ms 74.0% | |
triton_mm_5738 0.0980 ms 72.6% | |
triton_mm_5740 0.1028 ms 69.2% | |
triton_mm_5746 0.1113 ms 63.9% | |
triton_mm_5743 0.1123 ms 63.3% | |
triton_mm_5736 0.1128 ms 63.0% | |
triton_mm_5742 0.2212 ms 32.2% | |
SingleProcess AUTOTUNE takes 1.3862 seconds | |
AUTOTUNE convolution(2x960x64x64, 640x960x3x3) | |
convolution 0.4033 ms 100.0% | |
triton_convolution_5934 0.7724 ms 52.2% | |
triton_convolution_5933 0.9023 ms 44.7% | |
triton_convolution_5936 0.9333 ms 43.2% | |
triton_convolution_5931 1.0343 ms 39.0% | |
triton_convolution_5930 1.1591 ms 34.8% | |
triton_convolution_5935 1.1836 ms 34.1% | |
triton_convolution_5932 4.3624 ms 9.2% | |
SingleProcess AUTOTUNE takes 1.1393 seconds | |
AUTOTUNE int_mm(8192x960, 960x640, 8192x640) | |
triton_mm_5957 0.0662 ms 100.0% | |
triton_mm_5956 0.0667 ms 99.3% | |
triton_mm_5949 0.0726 ms 91.2% | |
triton_mm_5951 0.0775 ms 85.4% | |
triton_mm_5950 0.0796 ms 83.2% | |
triton_mm_5952 0.0830 ms 79.8% | |
triton_mm_5948 0.0886 ms 74.7% | |
triton_mm_5955 0.0927 ms 71.4% | |
triton_mm_5958 0.1063 ms 62.3% | |
triton_mm_5954 0.1500 ms 44.1% | |
SingleProcess AUTOTUNE takes 1.3660 seconds | |
AUTOTUNE convolution(2x640x128x128, 640x640x3x3) | |
convolution 0.9948 ms 100.0% | |
triton_convolution_6145 2.4293 ms 40.9% | |
triton_convolution_6147 2.6987 ms 36.9% | |
triton_convolution_6146 2.7702 ms 35.9% | |
triton_convolution_6143 2.8436 ms 35.0% | |
triton_convolution_6148 2.8917 ms 34.4% | |
triton_convolution_6142 2.9650 ms 33.6% | |
triton_convolution_6144 17.2755 ms 5.8% | |
SingleProcess AUTOTUNE takes 1.2828 seconds | |
AUTOTUNE convolution(2x960x128x128, 320x960x3x3) | |
convolution 0.9254 ms 100.0% | |
triton_convolution_6153 2.1550 ms 42.9% | |
triton_convolution_6155 2.2798 ms 40.6% | |
triton_convolution_6152 2.2890 ms 40.4% | |
triton_convolution_6150 2.3986 ms 38.6% | |
triton_convolution_6149 2.5845 ms 35.8% | |
triton_convolution_6154 2.7016 ms 34.3% | |
triton_convolution_6151 14.1421 ms 6.5% | |
SingleProcess AUTOTUNE takes 1.2404 seconds | |
AUTOTUNE int_mm(32768x960, 960x320, 32768x320) | |
triton_mm_6175 0.1251 ms 100.0% | |
triton_mm_6169 0.1484 ms 84.3% | |
triton_mm_6176 0.1520 ms 82.3% | |
triton_mm_6168 0.1595 ms 78.5% | |
triton_mm_6171 0.1604 ms 78.0% | |
triton_mm_6170 0.1723 ms 72.6% | |
triton_mm_6167 0.1727 ms 72.4% | |
triton_mm_6174 0.2010 ms 62.3% | |
triton_mm_6177 0.2043 ms 61.2% | |
triton_mm_6173 0.2931 ms 42.7% | |
SingleProcess AUTOTUNE takes 1.4125 seconds | |
AUTOTUNE convolution(2x640x128x128, 320x640x3x3) | |
convolution 0.6210 ms 100.0% | |
triton_convolution_6189 1.3599 ms 45.7% | |
triton_convolution_6186 1.4438 ms 43.0% | |
triton_convolution_6191 1.4582 ms 42.6% | |
triton_convolution_6188 1.5591 ms 39.8% | |
triton_convolution_6185 1.7416 ms 35.7% | |
triton_convolution_6190 1.7610 ms 35.3% | |
triton_convolution_6187 8.7698 ms 7.1% | |
SingleProcess AUTOTUNE takes 1.1876 seconds | |
AUTOTUNE int_mm(32768x640, 640x320, 32768x320) | |
triton_mm_6211 0.0971 ms 100.0% | |
triton_mm_6205 0.1210 ms 80.3% | |
triton_mm_6204 0.1225 ms 79.3% | |
triton_mm_6212 0.1234 ms 78.7% | |
triton_mm_6207 0.1321 ms 73.5% | |
triton_mm_6203 0.1331 ms 73.0% | |
triton_mm_6206 0.1379 ms 70.4% | |
triton_mm_6210 0.1593 ms 61.0% | |
triton_mm_6213 0.1802 ms 53.9% | |
triton_mm_6209 0.2430 ms 40.0% | |
SingleProcess AUTOTUNE takes 1.3959 seconds | |
AUTOTUNE convolution(2x320x128x128, 4x320x3x3) | |
triton_convolution_6261 0.0917 ms 100.0% | |
triton_convolution_6260 0.1050 ms 87.4% | |
convolution 0.1056 ms 86.8% | |
triton_convolution_6262 0.1134 ms 80.9% | |
triton_convolution_6257 0.1451 ms 63.2% | |
triton_convolution_6258 0.1810 ms 50.7% | |
triton_convolution_6259 0.3368 ms 27.2% | |
SingleProcess AUTOTUNE takes 0.8769 seconds | |
AUTOTUNE mm(16384x4, 4x4) | |
triton_mm_6268 0.0076 ms 100.0% | |
triton_mm_6265 0.0076 ms 98.7% | |
triton_mm_6267 0.0076 ms 98.7% | |
triton_mm_6269 0.0076 ms 98.7% | |
triton_mm_6270 0.0076 ms 98.7% | |
triton_mm_6271 0.0076 ms 98.7% | |
triton_mm_6264 0.0080 ms 94.4% | |
triton_mm_6266 0.0080 ms 94.4% | |
triton_mm_6263 0.0081 ms 92.9% | |
triton_mm_6272 0.0081 ms 92.9% | |
SingleProcess AUTOTUNE takes 2.7350 seconds | |
AUTOTUNE convolution(1x4x128x128, 512x4x3x3) | |
triton_convolution_6276 0.0365 ms 100.0% | |
triton_convolution_6279 0.0373 ms 97.9% | |
triton_convolution_6278 0.0377 ms 96.9% | |
triton_convolution_6274 0.0377 ms 96.9% | |
triton_convolution_6277 0.0441 ms 82.8% | |
triton_convolution_6275 0.0615 ms 59.4% | |
triton_convolution_6273 0.0687 ms 53.2% | |
convolution 0.1028 ms 35.6% | |
SingleProcess AUTOTUNE takes 3.6971 seconds | |
AUTOTUNE convolution(1x512x128x128, 512x512x3x3) | |
convolution 0.3211 ms 100.0% | |
triton_convolution_6284 0.6264 ms 51.3% | |
triton_convolution_6285 0.7804 ms 41.1% | |
triton_convolution_6283 0.7852 ms 40.9% | |
triton_convolution_6286 0.7939 ms 40.4% | |
triton_convolution_6281 0.8661 ms 37.1% | |
triton_convolution_6280 0.9005 ms 35.7% | |
triton_convolution_6282 3.7293 ms 8.6% | |
SingleProcess AUTOTUNE takes 3.7248 seconds | |
AUTOTUNE int_mm(16384x512, 512x1536, 16384x1536) | |
triton_mm_6303 0.1797 ms 100.0% | |
triton_mm_6295 0.1835 ms 97.9% | |
triton_mm_6302 0.1851 ms 97.1% | |
triton_mm_6294 0.2210 ms 81.3% | |
triton_mm_6297 0.2269 ms 79.2% | |
triton_mm_6296 0.2301 ms 78.1% | |
triton_mm_6301 0.2555 ms 70.3% | |
triton_mm_6298 0.2709 ms 66.3% | |
triton_mm_6304 0.3348 ms 53.7% | |
triton_mm_6300 0.4639 ms 38.7% | |
SingleProcess AUTOTUNE takes 7.3036 seconds | |
AUTOTUNE int_mm(16384x512, 512x512, 16384x512) | |
triton_mm_6313 0.0678 ms 100.0% | |
triton_mm_6306 0.0723 ms 93.7% | |
triton_mm_6314 0.0745 ms 90.9% | |
triton_mm_6308 0.0852 ms 79.6% | |
triton_mm_6307 0.0875 ms 77.4% | |
triton_mm_6305 0.0886 ms 76.5% | |
triton_mm_6309 0.0998 ms 67.9% | |
triton_mm_6312 0.1007 ms 67.3% | |
triton_mm_6315 0.1316 ms 51.5% | |
triton_mm_6311 0.1608 ms 42.2% | |
SingleProcess AUTOTUNE takes 6.9904 seconds | |
AUTOTUNE convolution(1x512x256x256, 512x512x3x3) | |
convolution 1.2814 ms 100.0% | |
triton_convolution_6377 2.7214 ms 47.1% | |
triton_convolution_6375 2.9714 ms 43.1% | |
triton_convolution_6372 2.9845 ms 42.9% | |
triton_convolution_6376 2.9957 ms 42.8% | |
triton_convolution_6378 3.1786 ms 40.3% | |
triton_convolution_6373 5.3083 ms 24.1% | |
triton_convolution_6374 24.6030 ms 5.2% | |
SingleProcess AUTOTUNE takes 4.1870 seconds | |
AUTOTUNE convolution(1x512x512x512, 512x512x3x3) | |
convolution 5.2513 ms 100.0% | |
triton_convolution_6426 10.4372 ms 50.3% | |
triton_convolution_6421 11.0161 ms 47.7% | |
triton_convolution_6425 11.8240 ms 44.4% | |
triton_convolution_6424 11.8986 ms 44.1% | |
triton_convolution_6427 12.6996 ms 41.4% | |
triton_convolution_6422 21.0352 ms 25.0% | |
triton_convolution_6423 110.2885 ms 4.8% | |
SingleProcess AUTOTUNE takes 5.0062 seconds | |
AUTOTUNE convolution(1x512x512x512, 256x512x3x3) | |
convolution 2.6453 ms 100.0% | |
triton_convolution_6433 5.3004 ms 49.9% | |
triton_convolution_6428 5.5739 ms 47.5% | |
triton_convolution_6432 5.9066 ms 44.8% | |
triton_convolution_6431 5.9353 ms 44.6% | |
triton_convolution_6434 6.3991 ms 41.3% | |
triton_convolution_6429 10.5307 ms 25.1% | |
triton_convolution_6430 55.2580 ms 4.8% | |
SingleProcess AUTOTUNE takes 4.3602 seconds | |
AUTOTUNE int_mm(262144x512, 512x256, 262144x256) | |
triton_mm_6444 0.4486 ms 100.0% | |
triton_mm_6443 0.4817 ms 93.1% | |
triton_mm_6436 0.5208 ms 86.1% | |
triton_mm_6438 0.5990 ms 74.9% | |
triton_mm_6437 0.6070 ms 73.9% | |
triton_mm_6435 0.6330 ms 70.9% | |
triton_mm_6442 0.6744 ms 66.5% | |
triton_mm_6439 0.7054 ms 63.6% | |
triton_mm_6445 0.8072 ms 55.6% | |
triton_mm_6441 1.2277 ms 36.5% | |
SingleProcess AUTOTUNE takes 7.0666 seconds | |
AUTOTUNE convolution(1x256x512x512, 256x256x3x3) | |
convolution 1.3387 ms 100.0% | |
triton_convolution_6450 2.3412 ms 57.2% | |
triton_convolution_6449 2.5590 ms 52.3% | |
triton_convolution_6446 2.6732 ms 50.1% | |
triton_convolution_6451 2.6914 ms 49.7% | |
triton_convolution_6447 2.9103 ms 46.0% | |
triton_convolution_6452 3.1539 ms 42.4% | |
triton_convolution_6448 25.0641 ms 5.3% | |
SingleProcess AUTOTUNE takes 4.0103 seconds | |
AUTOTUNE convolution(1x256x1024x1024, 256x256x3x3) | |
convolution 5.5140 ms 100.0% | |
triton_convolution_6485 9.4846 ms 58.1% | |
triton_convolution_6484 9.9842 ms 55.2% | |
triton_convolution_6481 10.1978 ms 54.1% | |
triton_convolution_6486 10.5290 ms 52.4% | |
triton_convolution_6482 11.3657 ms 48.5% | |
triton_convolution_6487 12.5918 ms 43.8% | |
triton_convolution_6483 99.2370 ms 5.6% | |
SingleProcess AUTOTUNE takes 4.8525 seconds | |
AUTOTUNE convolution(1x256x1024x1024, 128x256x3x3) | |
convolution 2.7501 ms 100.0% | |
triton_convolution_6493 4.6402 ms 59.3% | |
triton_convolution_6492 4.7209 ms 58.3% | |
triton_convolution_6491 5.0188 ms 54.8% | |
triton_convolution_6488 5.6468 ms 48.7% | |
triton_convolution_6489 5.7152 ms 48.1% | |
triton_convolution_6494 6.3367 ms 43.4% | |
triton_convolution_6490 49.6019 ms 5.5% | |
SingleProcess AUTOTUNE takes 4.2204 seconds | |
AUTOTUNE int_mm(1048576x256, 256x128, 1048576x128) | |
triton_mm_6503 0.6694 ms 100.0% | |
triton_mm_6496 0.7059 ms 94.8% | |
triton_mm_6504 0.7864 ms 85.1% | |
triton_mm_6495 0.8326 ms 80.4% | |
triton_mm_6498 0.8461 ms 79.1% | |
triton_mm_6497 0.9266 ms 72.2% | |
triton_mm_6502 1.0401 ms 64.4% | |
triton_mm_6499 1.0616 ms 63.1% | |
triton_mm_6505 1.4784 ms 45.3% | |
triton_mm_6501 1.5521 ms 43.1% | |
SingleProcess AUTOTUNE takes 6.5384 seconds | |
AUTOTUNE convolution(1x128x1024x1024, 128x128x3x3) | |
convolution 1.4754 ms 100.0% | |
triton_convolution_6511 2.2884 ms 64.5% | |
triton_convolution_6510 2.5110 ms 58.8% | |
triton_convolution_6509 2.5944 ms 56.9% | |
triton_convolution_6512 2.8438 ms 51.9% | |
triton_convolution_6507 2.8696 ms 51.4% | |
triton_convolution_6506 3.1059 ms 47.5% | |
triton_convolution_6508 17.1922 ms 8.6% | |
SingleProcess AUTOTUNE takes 3.7681 seconds | |
AUTOTUNE convolution(1x128x1024x1024, 3x128x3x3) | |
triton_convolution_6546 0.6457 ms 100.0% | |
triton_convolution_6545 0.7136 ms 90.5% | |
convolution 0.7313 ms 88.3% | |
triton_convolution_6544 0.7508 ms 86.0% | |
triton_convolution_6541 1.3129 ms 49.2% | |
triton_convolution_6543 1.3212 ms 48.9% | |
triton_convolution_6542 1.6742 ms 38.6% | |
SingleProcess AUTOTUNE takes 2.9924 seconds | |
/home/cdhernandez/.conda/envs/pytorch-3.10/lib/python3.10/site-packages/transformers/utils/generic.py:441: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead. | |
_torch_pytree._register_pytree_node( | |
/home/cdhernandez/.conda/envs/pytorch-3.10/lib/python3.10/site-packages/transformers/utils/generic.py:309: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead. | |
_torch_pytree._register_pytree_node( | |
/home/cdhernandez/local/diffusers/src/diffusers/utils/outputs.py:63: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead. | |
torch.utils._pytree._register_pytree_node( | |
Namespace(no_bf16=False, no_sdpa=False, batch_size=1, num_inference_steps=30, enable_fused_projections=True, upcast_vae=False, compile_unet=True, compile_vae=True, compile_mode='max-autotune', change_comp_config=True, do_quant=None, tag='branch12-1-all') | |
Using dtype: torch.bfloat16 | |
Loading pipeline components...: 0%| | 0/7 [00:00<?, ?it/s] | |
Loading pipeline components...: 14%|█▍ | 1/7 [00:01<00:07, 1.20s/it] | |
Loading pipeline components...: 43%|████▎ | 3/7 [00:01<00:01, 2.75it/s] | |
Loading pipeline components...: 57%|█████▋ | 4/7 [00:01<00:01, 2.48it/s]/home/cdhernandez/.conda/envs/pytorch-3.10/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.17.3 and <1.25.0 is required for this version of SciPy (detected version 1.26.0 | |
warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}" | |
/home/cdhernandez/local/diffusers/src/diffusers/utils/outputs.py:63: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead. | |
torch.utils._pytree._register_pytree_node( | |
Loading pipeline components...: 71%|███████▏ | 5/7 [00:02<00:00, 2.86it/s] | |
Loading pipeline components...: 86%|████████▌ | 6/7 [00:02<00:00, 3.29it/s] | |
Loading pipeline components...: 100%|██████████| 7/7 [00:02<00:00, 4.16it/s] | |
Loading pipeline components...: 100%|██████████| 7/7 [00:02<00:00, 2.95it/s] | |
Enabling fused QKV projections for both UNet and VAE. | |
Compile UNet | |
Compile VAE | |
AUTOTUNE mm(2x320, 320x1280) | |
triton_mm_22 0.0099 ms 100.0% | |
triton_mm_19 0.0100 ms 98.1% | |
triton_mm_20 0.0101 ms 97.5% | |
triton_mm_23 0.0101 ms 97.5% | |
triton_mm_18 0.0104 ms 94.5% | |
mm 0.0109 ms 90.3% | |
triton_mm_16 0.0112 ms 88.0% | |
triton_mm_17 0.0112 ms 88.0% | |
triton_mm_15 0.0124 ms 79.8% | |
triton_mm_14 0.0139 ms 70.8% | |
SingleProcess AUTOTUNE takes 2.3171 seconds | |
AUTOTUNE mm(2x1280, 1280x1280) | |
mm 0.0170 ms 100.0% | |
triton_mm_31 0.0175 ms 97.1% | |
triton_mm_32 0.0176 ms 96.6% | |
triton_mm_34 0.0177 ms 96.4% | |
triton_mm_35 0.0183 ms 93.2% | |
triton_mm_30 0.0201 ms 84.6% | |
triton_mm_29 0.0220 ms 77.6% | |
triton_mm_28 0.0250 ms 68.2% | |
triton_mm_27 0.0279 ms 61.0% | |
triton_mm_26 0.0361 ms 47.1% | |
SingleProcess AUTOTUNE takes 1.5956 seconds | |
AUTOTUNE mm(2x2816, 2816x1280) | |
mm 0.0200 ms 100.0% | |
triton_mm_43 0.0288 ms 69.5% | |
triton_mm_47 0.0294 ms 68.1% | |
triton_mm_46 0.0294 ms 68.0% | |
triton_mm_44 0.0297 ms 67.5% | |
triton_mm_42 0.0348 ms 57.5% | |
triton_mm_41 0.0386 ms 51.9% | |
triton_mm_40 0.0447 ms 44.8% | |
triton_mm_39 0.0486 ms 41.2% | |
triton_mm_38 0.0723 ms 27.7% | |
SingleProcess AUTOTUNE takes 1.7031 seconds | |
AUTOTUNE mm(2x1280, 1280x320) | |
mm 0.0135 ms 100.0% | |
triton_mm_67 0.0150 ms 90.4% | |
triton_mm_68 0.0161 ms 84.3% | |
triton_mm_70 0.0165 ms 82.1% | |
triton_mm_71 0.0169 ms 80.1% | |
triton_mm_66 0.0185 ms 73.3% | |
triton_mm_65 0.0204 ms 66.3% | |
triton_mm_64 0.0228 ms 59.2% | |
triton_mm_63 0.0257 ms 52.6% | |
triton_mm_62 0.0324 ms 41.8% | |
SingleProcess AUTOTUNE takes 1.9522 seconds | |
AUTOTUNE mm(2x1280, 1280x640) | |
mm 0.0146 ms 100.0% | |
triton_mm_127 0.0164 ms 89.4% | |
triton_mm_126 0.0165 ms 88.7% | |
triton_mm_129 0.0174 ms 83.9% | |
triton_mm_130 0.0176 ms 83.2% | |
triton_mm_125 0.0193 ms 75.7% | |
triton_mm_124 0.0207 ms 70.6% | |
triton_mm_123 0.0245 ms 59.7% | |
triton_mm_122 0.0264 ms 55.3% | |
triton_mm_121 0.0355 ms 41.2% | |
SingleProcess AUTOTUNE takes 1.9335 seconds | |
AUTOTUNE addmm(8192x640, 8192x320, 320x640) | |
triton_mm_135 0.0348 ms 100.0% | |
triton_mm_134 0.0353 ms 98.6% | |
triton_mm_137 0.0390 ms 89.2% | |
triton_mm_136 0.0393 ms 88.7% | |
bias_addmm 0.0396 ms 88.0% | |
triton_mm_141 0.0444 ms 78.3% | |
triton_mm_133 0.0452 ms 77.0% | |
triton_mm_140 0.0493 ms 70.6% | |
addmm 0.0575 ms 60.6% | |
triton_mm_143 0.0703 ms 49.5% | |
SingleProcess AUTOTUNE takes 1.7301 seconds | |
AUTOTUNE mm(8192x640, 640x640) | |
triton_mm_154 0.0501 ms 100.0% | |
triton_mm_153 0.0504 ms 99.5% | |
mm 0.0531 ms 94.4% | |
triton_mm_156 0.0556 ms 90.2% | |
triton_mm_155 0.0561 ms 89.4% | |
triton_mm_160 0.0645 ms 77.7% | |
triton_mm_152 0.0686 ms 73.1% | |
triton_mm_159 0.0803 ms 62.5% | |
triton_mm_161 0.1101 ms 45.5% | |
triton_mm_162 0.1127 ms 44.5% | |
SingleProcess AUTOTUNE takes 1.6039 seconds | |
AUTOTUNE mm(8192x640, 640x1920) | |
mm 0.1100 ms 100.0% | |
triton_mm_166 0.1239 ms 88.8% | |
triton_mm_165 0.1246 ms 88.2% | |
triton_mm_168 0.1418 ms 77.5% | |
triton_mm_167 0.1428 ms 77.0% | |
triton_mm_164 0.1657 ms 66.4% | |
triton_mm_172 0.1706 ms 64.5% | |
triton_mm_171 0.2090 ms 52.6% | |
triton_mm_174 0.2565 ms 42.9% | |
triton_mm_173 0.3039 ms 36.2% | |
SingleProcess AUTOTUNE takes 1.8195 seconds | |
AUTOTUNE mm(154x2048, 2048x1280) | |
mm 0.0187 ms 100.0% | |
triton_mm_194 0.0245 ms 76.3% | |
triton_mm_197 0.0275 ms 68.2% | |
triton_mm_196 0.0283 ms 66.3% | |
triton_mm_191 0.0318 ms 58.8% | |
triton_mm_192 0.0320 ms 58.5% | |
triton_mm_193 0.0356 ms 52.6% | |
triton_mm_189 0.0412 ms 45.5% | |
triton_mm_190 0.0416 ms 45.0% | |
triton_mm_188 0.0517 ms 36.2% | |
SingleProcess AUTOTUNE takes 1.5919 seconds | |
AUTOTUNE mm(8192x640, 640x5120) | |
mm 0.2638 ms 100.0% | |
triton_mm_225 0.3056 ms 86.3% | |
triton_mm_226 0.3061 ms 86.2% | |
triton_mm_227 0.3472 ms 76.0% | |
triton_mm_228 0.3480 ms 75.8% | |
triton_mm_224 0.4080 ms 64.7% | |
triton_mm_232 0.4302 ms 61.3% | |
triton_mm_231 0.5111 ms 51.6% | |
triton_mm_234 0.6502 ms 40.6% | |
triton_mm_233 0.7953 ms 33.2% | |
SingleProcess AUTOTUNE takes 1.7405 seconds | |
AUTOTUNE mm(8192x2560, 2560x640) | |
mm 0.1526 ms 100.0% | |
triton_mm_238 0.1588 ms 96.1% | |
triton_mm_237 0.1623 ms 94.0% | |
triton_mm_239 0.1788 ms 85.3% | |
triton_mm_240 0.1802 ms 84.7% | |
triton_mm_244 0.2058 ms 74.2% | |
triton_mm_236 0.2258 ms 67.6% | |
triton_mm_243 0.2656 ms 57.5% | |
triton_mm_245 0.3832 ms 39.8% | |
triton_mm_241 0.4031 ms 37.9% | |
SingleProcess AUTOTUNE takes 1.6869 seconds | |
AUTOTUNE addmm(2048x1280, 2048x640, 640x1280) | |
triton_mm_589 0.0299 ms 100.0% | |
triton_mm_590 0.0301 ms 99.3% | |
bias_addmm 0.0315 ms 94.9% | |
triton_mm_592 0.0331 ms 90.4% | |
triton_mm_591 0.0335 ms 89.2% | |
triton_mm_596 0.0380 ms 78.7% | |
addmm 0.0414 ms 72.2% | |
triton_mm_588 0.0425 ms 70.5% | |
triton_mm_595 0.0507 ms 59.0% | |
triton_mm_598 0.0563 ms 53.2% | |
SingleProcess AUTOTUNE takes 1.7261 seconds | |
AUTOTUNE mm(2048x1280, 1280x1280) | |
mm 0.0462 ms 100.0% | |
triton_mm_609 0.0469 ms 98.6% | |
triton_mm_608 0.0473 ms 97.8% | |
triton_mm_610 0.0513 ms 90.2% | |
triton_mm_611 0.0514 ms 90.0% | |
triton_mm_615 0.0609 ms 76.0% | |
triton_mm_607 0.0708 ms 65.3% | |
triton_mm_614 0.0908 ms 50.9% | |
triton_mm_617 0.0996 ms 46.4% | |
triton_mm_616 0.1107 ms 41.8% | |
SingleProcess AUTOTUNE takes 1.5984 seconds | |
AUTOTUNE mm(2048x1280, 1280x3840) | |
triton_mm_621 0.1209 ms 100.0% | |
triton_mm_620 0.1212 ms 99.7% | |
mm 0.1238 ms 97.6% | |
triton_mm_623 0.1330 ms 90.9% | |
triton_mm_622 0.1334 ms 90.6% | |
triton_mm_627 0.1611 ms 75.0% | |
triton_mm_619 0.1611 ms 75.0% | |
triton_mm_626 0.2197 ms 55.0% | |
triton_mm_629 0.2588 ms 46.7% | |
triton_mm_628 0.2884 ms 41.9% | |
SingleProcess AUTOTUNE takes 1.6521 seconds | |
AUTOTUNE mm(154x2048, 2048x2560) | |
mm 0.0274 ms 100.0% | |
triton_mm_646 0.0339 ms 80.9% | |
triton_mm_647 0.0344 ms 79.8% | |
triton_mm_649 0.0356 ms 77.1% | |
triton_mm_651 0.0356 ms 77.0% | |
triton_mm_652 0.0376 ms 73.0% | |
triton_mm_645 0.0445 ms 61.6% | |
triton_mm_644 0.0448 ms 61.2% | |
triton_mm_648 0.0488 ms 56.2% | |
triton_mm_643 0.0551 ms 49.8% | |
SingleProcess AUTOTUNE takes 1.5864 seconds | |
AUTOTUNE mm(2048x1280, 1280x10240) | |
mm 0.2451 ms 100.0% | |
triton_mm_681 0.3051 ms 80.3% | |
triton_mm_680 0.3140 ms 78.1% | |
triton_mm_683 0.3345 ms 73.3% | |
triton_mm_682 0.3448 ms 71.1% | |
triton_mm_687 0.4073 ms 60.2% | |
triton_mm_679 0.4267 ms 57.4% | |
triton_mm_686 0.5201 ms 47.1% | |
triton_mm_689 0.6718 ms 36.5% | |
triton_mm_688 0.7431 ms 33.0% | |
SingleProcess AUTOTUNE takes 1.7342 seconds | |
AUTOTUNE mm(2048x5120, 5120x1280) | |
mm 0.1448 ms 100.0% | |
triton_mm_693 0.1533 ms 94.4% | |
triton_mm_692 0.1548 ms 93.5% | |
triton_mm_695 0.1787 ms 81.0% | |
triton_mm_694 0.1792 ms 80.8% | |
triton_mm_699 0.2082 ms 69.5% | |
triton_mm_691 0.2635 ms 54.9% | |
triton_mm_698 0.3486 ms 41.5% | |
triton_mm_701 0.3604 ms 40.2% | |
triton_mm_696 0.4027 ms 36.0% | |
SingleProcess AUTOTUNE takes 1.6905 seconds | |
AUTOTUNE addmm(2048x1280, 2048x2560, 2560x1280) | |
bias_addmm 0.0805 ms 100.0% | |
triton_mm_3297 0.0860 ms 93.7% | |
triton_mm_3298 0.0866 ms 93.0% | |
addmm 0.0937 ms 86.0% | |
triton_mm_3299 0.0974 ms 82.7% | |
triton_mm_3300 0.0981 ms 82.1% | |
triton_mm_3304 0.1111 ms 72.5% | |
triton_mm_3296 0.1369 ms 58.8% | |
triton_mm_3303 0.1657 ms 48.6% | |
triton_mm_3306 0.1950 ms 41.3% | |
SingleProcess AUTOTUNE takes 1.8599 seconds | |
AUTOTUNE addmm(2048x1280, 2048x1920, 1920x1280) | |
bias_addmm 0.0640 ms 100.0% | |
triton_mm_5102 0.0663 ms 96.6% | |
triton_mm_5101 0.0668 ms 95.8% | |
triton_mm_5103 0.0752 ms 85.1% | |
addmm 0.0755 ms 84.8% | |
triton_mm_5104 0.0757 ms 84.6% | |
triton_mm_5108 0.0851 ms 75.2% | |
triton_mm_5100 0.1038 ms 61.7% | |
triton_mm_5107 0.1256 ms 51.0% | |
triton_mm_5110 0.1460 ms 43.9% | |
SingleProcess AUTOTUNE takes 1.7353 seconds | |
AUTOTUNE addmm(8192x640, 8192x1920, 1920x640) | |
bias_addmm 0.1219 ms 100.0% | |
triton_mm_6011 0.1244 ms 97.9% | |
triton_mm_6010 0.1270 ms 96.0% | |
triton_mm_6012 0.1404 ms 86.8% | |
triton_mm_6013 0.1416 ms 86.1% | |
addmm 0.1439 ms 84.7% | |
triton_mm_6017 0.1606 ms 75.9% | |
triton_mm_6009 0.1814 ms 67.2% | |
triton_mm_6016 0.2005 ms 60.8% | |
triton_mm_6018 0.2916 ms 41.8% | |
SingleProcess AUTOTUNE takes 1.7940 seconds | |
AUTOTUNE addmm(8192x640, 8192x1280, 1280x640) | |
bias_addmm 0.0890 ms 100.0% | |
triton_mm_6241 0.0891 ms 99.9% | |
triton_mm_6240 0.0892 ms 99.7% | |
triton_mm_6243 0.0986 ms 90.3% | |
triton_mm_6242 0.0989 ms 89.9% | |
addmm 0.1106 ms 80.4% | |
triton_mm_6247 0.1136 ms 78.3% | |
triton_mm_6239 0.1235 ms 72.1% | |
triton_mm_6246 0.1423 ms 62.5% | |
triton_mm_6248 0.1979 ms 45.0% | |
SingleProcess AUTOTUNE takes 1.7492 seconds | |
AUTOTUNE addmm(8192x640, 8192x960, 960x640) | |
triton_mm_6471 0.0698 ms 100.0% | |
triton_mm_6470 0.0703 ms 99.2% | |
bias_addmm 0.0728 ms 95.9% | |
triton_mm_6472 0.0768 ms 90.9% | |
triton_mm_6473 0.0776 ms 89.9% | |
triton_mm_6477 0.0905 ms 77.1% | |
addmm 0.0922 ms 75.7% | |
triton_mm_6469 0.0973 ms 71.7% | |
triton_mm_6476 0.1114 ms 62.6% | |
triton_mm_6478 0.1566 ms 44.5% | |
SingleProcess AUTOTUNE takes 1.7469 seconds | |
AUTOTUNE addmm(32768x320, 32768x960, 960x320) | |
bias_addmm 0.1316 ms 100.0% | |
triton_mm_6708 0.1378 ms 95.5% | |
triton_mm_6710 0.1519 ms 86.6% | |
triton_mm_6707 0.1589 ms 82.8% | |
triton_mm_6714 0.1726 ms 76.3% | |
triton_mm_6709 0.1778 ms 74.0% | |
triton_mm_6706 0.1794 ms 73.3% | |
addmm 0.1834 ms 71.7% | |
triton_mm_6713 0.2497 ms 52.7% | |
triton_mm_6716 0.2946 ms 44.7% | |
SingleProcess AUTOTUNE takes 1.7837 seconds | |
AUTOTUNE addmm(32768x320, 32768x640, 640x320) | |
bias_addmm 0.0980 ms 100.0% | |
triton_mm_6746 0.1000 ms 98.0% | |
triton_mm_6748 0.1111 ms 88.2% | |
triton_mm_6745 0.1191 ms 82.3% | |
triton_mm_6752 0.1269 ms 77.2% | |
triton_mm_6747 0.1308 ms 74.9% | |
triton_mm_6744 0.1320 ms 74.2% | |
addmm 0.1484 ms 66.0% | |
triton_mm_6751 0.1791 ms 54.7% | |
triton_mm_6753 0.2162 ms 45.3% | |
SingleProcess AUTOTUNE takes 1.7716 seconds | |
AUTOTUNE addmm(16384x4, 16384x4, 4x4) | |
triton_mm_6808 0.0078 ms 100.0% | |
triton_mm_6810 0.0078 ms 100.0% | |
triton_mm_6813 0.0078 ms 99.6% | |
triton_mm_6814 0.0080 ms 97.2% | |
triton_mm_6809 0.0083 ms 93.5% | |
triton_mm_6812 0.0084 ms 92.7% | |
triton_mm_6816 0.0084 ms 92.7% | |
triton_mm_6811 0.0085 ms 91.7% | |
triton_mm_6807 0.0085 ms 91.7% | |
triton_mm_6815 0.0085 ms 91.4% | |
SingleProcess AUTOTUNE takes 2.9463 seconds | |
AUTOTUNE convolution(1x4x128x128, 512x4x3x3) | |
triton_convolution_6817 0.0357 ms 100.0% | |
triton_convolution_6818 0.0385 ms 92.8% | |
triton_convolution_6822 0.0389 ms 91.9% | |
triton_convolution_6823 0.0395 ms 90.4% | |
triton_convolution_6820 0.0396 ms 90.2% | |
triton_convolution_6821 0.0439 ms 81.4% | |
triton_convolution_6819 0.0788 ms 45.4% | |
convolution 0.1057 ms 33.8% | |
SingleProcess AUTOTUNE takes 3.5332 seconds | |
AUTOTUNE mm(16384x512, 512x1536) | |
mm 0.1338 ms 100.0% | |
triton_mm_6839 0.1546 ms 86.6% | |
triton_mm_6840 0.1561 ms 85.7% | |
triton_mm_6842 0.1712 ms 78.1% | |
triton_mm_6841 0.1757 ms 76.1% | |
triton_mm_6846 0.2186 ms 61.2% | |
triton_mm_6838 0.2230 ms 60.0% | |
triton_mm_6845 0.2416 ms 55.4% | |
triton_mm_6848 0.2796 ms 47.9% | |
triton_mm_6847 0.3956 ms 33.8% | |
SingleProcess AUTOTUNE takes 4.7783 seconds | |
AUTOTUNE mm(16384x512, 512x512) | |
triton_mm_6852 0.0624 ms 100.0% | |
triton_mm_6851 0.0631 ms 98.8% | |
mm 0.0644 ms 96.8% | |
triton_mm_6854 0.0692 ms 90.1% | |
triton_mm_6853 0.0695 ms 89.7% | |
triton_mm_6858 0.0800 ms 77.9% | |
triton_mm_6850 0.0818 ms 76.3% | |
triton_mm_6857 0.0995 ms 62.7% | |
triton_mm_6860 0.1326 ms 47.0% | |
triton_mm_6859 0.1441 ms 43.3% | |
SingleProcess AUTOTUNE takes 4.5120 seconds | |
AUTOTUNE addmm(262144x256, 262144x512, 512x256) | |
bias_addmm 0.3736 ms 100.0% | |
triton_mm_6983 0.4613 ms 81.0% | |
triton_mm_6982 0.4628 ms 80.7% | |
triton_mm_6985 0.5117 ms 73.0% | |
triton_mm_6984 0.5124 ms 72.9% | |
triton_mm_6989 0.6008 ms 62.2% | |
triton_mm_6981 0.6127 ms 61.0% | |
addmm 0.6218 ms 60.1% | |
triton_mm_6988 0.7352 ms 50.8% | |
triton_mm_6991 0.9670 ms 38.6% | |
SingleProcess AUTOTUNE takes 5.1245 seconds | |
AUTOTUNE addmm(1048576x128, 1048576x256, 256x128) | |
triton_mm_7044 0.6243 ms 100.0% | |
bias_addmm 0.6268 ms 99.6% | |
triton_mm_7046 0.6617 ms 94.3% | |
triton_mm_7043 0.6851 ms 91.1% | |
triton_mm_7045 0.7136 ms 87.5% | |
triton_mm_7050 0.7885 ms 79.2% | |
triton_mm_7042 0.8139 ms 76.7% | |
triton_mm_7049 0.9098 ms 68.6% | |
addmm 1.1696 ms 53.4% | |
triton_mm_7047 1.3652 ms 45.7% | |
SingleProcess AUTOTUNE takes 4.9210 seconds |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment