Last active
July 9, 2025 23:42
-
-
Save davidberard98/c316464b8c193cf45b8c779a443bafef to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// | |
// Generated by LLVM NVPTX Back-End | |
// | |
// Module header: the three directives below select the PTX ISA version (8.7),
// the target architecture (sm_90a) and 64-bit addressing for this file.
// NOTE(review): the trailing `| |` on every line is not PTX syntax; it looks
// like table residue from the web page this listing was copied from and
// presumably has to be stripped before the file can be assembled - confirm.
.version 8.7 | |
.target sm_90a | |
.address_size 64 | |
// .globl _layer_norm_backward_kernel // -- Begin function _layer_norm_backward_kernel | |
// Shared-memory byte array, 16-byte aligned, declared .extern with an empty
// [] so no size is given in this file (presumably supplied at kernel launch -
// verify against the launcher). The kernel below takes the address of
// global_smem as the base for its shared-memory offsets.
.extern .shared .align 16 .b8 global_smem[]; | |
// @_layer_norm_backward_kernel | |
.visible .entry _layer_norm_backward_kernel( | |
.param .u64 .ptr .global .align 1 _layer_norm_backward_kernel_param_0, | |
.param .u64 .ptr .global .align 1 _layer_norm_backward_kernel_param_1, | |
.param .u64 .ptr .global .align 1 _layer_norm_backward_kernel_param_2, | |
.param .u64 .ptr .global .align 1 _layer_norm_backward_kernel_param_3, | |
.param .u64 .ptr .global .align 1 _layer_norm_backward_kernel_param_4, | |
.param .u64 .ptr .global .align 1 _layer_norm_backward_kernel_param_5, | |
.param .u64 .ptr .global .align 1 _layer_norm_backward_kernel_param_6, | |
.param .u64 .ptr .global .align 1 _layer_norm_backward_kernel_param_7, | |
.param .u32 _layer_norm_backward_kernel_param_8, | |
.param .u32 _layer_norm_backward_kernel_param_9, | |
.param .u32 _layer_norm_backward_kernel_param_10, | |
.param .u32 _layer_norm_backward_kernel_param_11, | |
.param .u32 _layer_norm_backward_kernel_param_12, | |
.param .u32 _layer_norm_backward_kernel_param_13, | |
.param .u32 _layer_norm_backward_kernel_param_14, | |
.param .u64 .ptr .global .align 1 _layer_norm_backward_kernel_param_15 | |
) | |
.reqntid 128 | |
{ | |
.reg .pred %p<118>; | |
.reg .b32 %r<2918>; | |
.reg .b64 %rd<168>; | |
.loc 1 73 0 // layer_norm.py:73:0 | |
$L__func_begin0: | |
.loc 1 73 0 // layer_norm.py:73:0 | |
// %bb.0: | |
ld.param.b32 %r542, [_layer_norm_backward_kernel_param_14]; | |
ld.param.b32 %r540, [_layer_norm_backward_kernel_param_11]; | |
ld.param.b32 %r539, [_layer_norm_backward_kernel_param_10]; | |
ld.param.b64 %rd46, [_layer_norm_backward_kernel_param_6]; | |
ld.param.b64 %rd45, [_layer_norm_backward_kernel_param_5]; | |
$L__tmp0: | |
.loc 1 100 33 // layer_norm.py:100:33 | |
mov.u32 %r1, %ctaid.x; | |
.loc 1 101 31 // layer_norm.py:101:31 | |
shl.b32 %r2, %r1, 5; | |
.loc 1 102 39 // layer_norm.py:102:39 | |
add.s32 %r671, %r2, 32; | |
ld.param.b32 %r672, [_layer_norm_backward_kernel_param_13]; | |
.loc 1 102 57 // layer_norm.py:102:57 | |
min.s32 %r3, %r671, %r672; | |
.loc 1 103 24 // layer_norm.py:103:24 | |
mov.u32 %r4, %tid.x; | |
shl.b32 %r673, %r4, 2; | |
and.b32 %r674, %r673, 508; | |
or.b32 %r5, %r674, 512; | |
or.b32 %r6, %r674, 1024; | |
or.b32 %r7, %r674, 1536; | |
or.b32 %r8, %r674, 2048; | |
or.b32 %r9, %r674, 2560; | |
or.b32 %r10, %r674, 3072; | |
or.b32 %r675, %r673, 3584; | |
or.b32 %r11, %r674, 4096; | |
or.b32 %r12, %r674, 4608; | |
or.b32 %r13, %r674, 5120; | |
or.b32 %r14, %r674, 5632; | |
or.b32 %r15, %r674, 6144; | |
or.b32 %r16, %r674, 6656; | |
or.b32 %r17, %r674, 7168; | |
or.b32 %r676, %r673, 7680; | |
.loc 1 117 28 // layer_norm.py:117:28 | |
cvt.u64.u32 %rd1, %r674; | |
cvt.u64.u32 %rd9, %r675; | |
cvt.u64.u32 %rd18, %r676; | |
.loc 1 115 30 // layer_norm.py:115:30 | |
setp.le.s32 %p1, %r3, %r2; | |
mov.b32 %r2790, 0; | |
shl.b64 %rd161, %rd18, 2; | |
shl.b64 %rd162, %rd9, 2; | |
setp.lt.s32 %p87, %r17, %r542; | |
cvt.u32.u64 %r2240, %rd18; | |
cvt.u32.u64 %r2241, %rd9; | |
cvt.u32.u64 %r2242, %rd1; | |
setp.lt.s32 %p86, %r16, %r542; | |
setp.lt.s32 %p85, %r15, %r542; | |
setp.lt.s32 %p84, %r14, %r542; | |
setp.lt.s32 %p83, %r13, %r542; | |
setp.lt.s32 %p82, %r12, %r542; | |
setp.lt.s32 %p81, %r11, %r542; | |
setp.lt.s32 %p79, %r10, %r542; | |
setp.lt.s32 %p78, %r9, %r542; | |
setp.lt.s32 %p77, %r8, %r542; | |
setp.lt.s32 %p76, %r7, %r542; | |
setp.lt.s32 %p75, %r6, %r542; | |
setp.lt.s32 %p74, %r5, %r542; | |
mov.b32 %r2791, %r2790; | |
mov.b32 %r2792, %r2790; | |
mov.b32 %r2793, %r2790; | |
mov.b32 %r2794, %r2790; | |
mov.b32 %r2795, %r2790; | |
mov.b32 %r2796, %r2790; | |
mov.b32 %r2797, %r2790; | |
mov.b32 %r2798, %r2790; | |
mov.b32 %r2799, %r2790; | |
mov.b32 %r2800, %r2790; | |
mov.b32 %r2801, %r2790; | |
mov.b32 %r2802, %r2790; | |
mov.b32 %r2803, %r2790; | |
mov.b32 %r2804, %r2790; | |
mov.b32 %r2805, %r2790; | |
mov.b32 %r2806, %r2790; | |
mov.b32 %r2807, %r2790; | |
mov.b32 %r2808, %r2790; | |
mov.b32 %r2809, %r2790; | |
mov.b32 %r2810, %r2790; | |
mov.b32 %r2811, %r2790; | |
mov.b32 %r2812, %r2790; | |
mov.b32 %r2813, %r2790; | |
mov.b32 %r2814, %r2790; | |
mov.b32 %r2815, %r2790; | |
mov.b32 %r2816, %r2790; | |
mov.b32 %r2817, %r2790; | |
mov.b32 %r2818, %r2790; | |
mov.b32 %r2819, %r2790; | |
mov.b32 %r2820, %r2790; | |
mov.b32 %r2821, %r2790; | |
mov.b32 %r2822, %r2790; | |
mov.b32 %r2823, %r2790; | |
mov.b32 %r2824, %r2790; | |
mov.b32 %r2825, %r2790; | |
mov.b32 %r2826, %r2790; | |
mov.b32 %r2827, %r2790; | |
mov.b32 %r2828, %r2790; | |
mov.b32 %r2829, %r2790; | |
mov.b32 %r2830, %r2790; | |
mov.b32 %r2831, %r2790; | |
mov.b32 %r2832, %r2790; | |
mov.b32 %r2833, %r2790; | |
mov.b32 %r2834, %r2790; | |
mov.b32 %r2835, %r2790; | |
mov.b32 %r2836, %r2790; | |
mov.b32 %r2837, %r2790; | |
mov.b32 %r2838, %r2790; | |
mov.b32 %r2839, %r2790; | |
mov.b32 %r2840, %r2790; | |
mov.b32 %r2841, %r2790; | |
mov.b32 %r2842, %r2790; | |
mov.b32 %r2843, %r2790; | |
mov.b32 %r2844, %r2790; | |
mov.b32 %r2845, %r2790; | |
mov.b32 %r2846, %r2790; | |
mov.b32 %r2847, %r2790; | |
mov.b32 %r2848, %r2790; | |
mov.b32 %r2849, %r2790; | |
mov.b32 %r2850, %r2790; | |
mov.b32 %r2851, %r2790; | |
mov.b32 %r2852, %r2790; | |
mov.b32 %r2853, %r2790; | |
mov.b32 %r2854, %r2790; | |
mov.b32 %r2855, %r2790; | |
mov.b32 %r2856, %r2790; | |
mov.b32 %r2857, %r2790; | |
mov.b32 %r2858, %r2790; | |
mov.b32 %r2859, %r2790; | |
mov.b32 %r2860, %r2790; | |
mov.b32 %r2861, %r2790; | |
mov.b32 %r2862, %r2790; | |
mov.b32 %r2863, %r2790; | |
mov.b32 %r2864, %r2790; | |
mov.b32 %r2865, %r2790; | |
mov.b32 %r2866, %r2790; | |
mov.b32 %r2867, %r2790; | |
mov.b32 %r2868, %r2790; | |
mov.b32 %r2869, %r2790; | |
mov.b32 %r2870, %r2790; | |
mov.b32 %r2871, %r2790; | |
mov.b32 %r2872, %r2790; | |
mov.b32 %r2873, %r2790; | |
mov.b32 %r2874, %r2790; | |
mov.b32 %r2875, %r2790; | |
mov.b32 %r2876, %r2790; | |
mov.b32 %r2877, %r2790; | |
mov.b32 %r2878, %r2790; | |
mov.b32 %r2879, %r2790; | |
mov.b32 %r2880, %r2790; | |
mov.b32 %r2881, %r2790; | |
mov.b32 %r2882, %r2790; | |
mov.b32 %r2883, %r2790; | |
mov.b32 %r2884, %r2790; | |
mov.b32 %r2885, %r2790; | |
mov.b32 %r2886, %r2790; | |
mov.b32 %r2887, %r2790; | |
mov.b32 %r2888, %r2790; | |
mov.b32 %r2889, %r2790; | |
mov.b32 %r2890, %r2790; | |
mov.b32 %r2891, %r2790; | |
mov.b32 %r2892, %r2790; | |
mov.b32 %r2893, %r2790; | |
mov.b32 %r2894, %r2790; | |
mov.b32 %r2895, %r2790; | |
mov.b32 %r2896, %r2790; | |
mov.b32 %r2897, %r2790; | |
mov.b32 %r2898, %r2790; | |
mov.b32 %r2899, %r2790; | |
mov.b32 %r2900, %r2790; | |
mov.b32 %r2901, %r2790; | |
mov.b32 %r2902, %r2790; | |
mov.b32 %r2903, %r2790; | |
mov.b32 %r2904, %r2790; | |
mov.b32 %r2905, %r2790; | |
mov.b32 %r2906, %r2790; | |
mov.b32 %r2907, %r2790; | |
mov.b32 %r2908, %r2790; | |
mov.b32 %r2909, %r2790; | |
mov.b32 %r2910, %r2790; | |
mov.b32 %r2911, %r2790; | |
mov.b32 %r2912, %r2790; | |
mov.b32 %r2913, %r2790; | |
mov.b32 %r2914, %r2790; | |
mov.b32 %r2915, %r2790; | |
mov.b32 %r2916, %r2790; | |
mov.b32 %r2917, %r2790; | |
@%p1 bra $L__BB0_3; | |
// %bb.1: // %.lr.ph | |
.loc 1 0 30 // layer_norm.py:0:30 | |
ld.param.b32 %r541, [_layer_norm_backward_kernel_param_12]; | |
ld.param.b32 %r538, [_layer_norm_backward_kernel_param_9]; | |
ld.param.b32 %r537, [_layer_norm_backward_kernel_param_8]; | |
ld.param.b64 %rd47, [_layer_norm_backward_kernel_param_7]; | |
ld.param.b64 %rd44, [_layer_norm_backward_kernel_param_4]; | |
ld.param.b64 %rd43, [_layer_norm_backward_kernel_param_3]; | |
ld.param.b64 %rd42, [_layer_norm_backward_kernel_param_2]; | |
ld.param.b64 %rd41, [_layer_norm_backward_kernel_param_0]; | |
ld.param.b64 %rd48, [_layer_norm_backward_kernel_param_1]; | |
mul.wide.u32 %rd49, %r674, 4; | |
add.s64 %rd72, %rd48, %rd49; | |
add.s64 %rd73, %rd72, 2048; | |
add.s64 %rd74, %rd72, 4096; | |
add.s64 %rd75, %rd72, 6144; | |
add.s64 %rd76, %rd72, 8192; | |
add.s64 %rd77, %rd72, 10240; | |
add.s64 %rd78, %rd72, 12288; | |
mul.wide.u32 %rd50, %r675, 4; | |
add.s64 %rd79, %rd48, %rd50; | |
add.s64 %rd80, %rd72, 16384; | |
add.s64 %rd81, %rd72, 18432; | |
add.s64 %rd82, %rd72, 20480; | |
add.s64 %rd83, %rd72, 22528; | |
add.s64 %rd84, %rd72, 24576; | |
add.s64 %rd85, %rd72, 26624; | |
add.s64 %rd86, %rd72, 28672; | |
mul.wide.u32 %rd51, %r676, 4; | |
add.s64 %rd87, %rd48, %rd51; | |
cvt.rn.f32.s32 %r18, %r542; | |
.loc 1 113 26 // layer_norm.py:113:26 | |
mul.lo.s32 %r805, %r541, %r2; | |
.loc 1 113 14 // layer_norm.py:113:14 | |
mul.wide.s32 %rd52, %r805, 4; | |
add.s64 %rd165, %rd47, %rd52; | |
.loc 1 112 26 // layer_norm.py:112:26 | |
mul.lo.s32 %r806, %r538, %r2; | |
.loc 1 112 14 // layer_norm.py:112:14 | |
mul.wide.s32 %rd53, %r806, 4; | |
add.s64 %rd164, %rd44, %rd53; | |
.loc 1 111 16 // layer_norm.py:111:16 | |
mul.wide.s32 %rd54, %r2, 4; | |
add.s64 %rd166, %rd43, %rd54; | |
.loc 1 110 16 // layer_norm.py:110:16 | |
add.s64 %rd167, %rd42, %rd54; | |
.loc 1 109 25 // layer_norm.py:109:25 | |
mul.lo.s32 %r807, %r537, %r2; | |
.loc 1 109 13 // layer_norm.py:109:13 | |
mul.wide.s32 %rd55, %r807, 4; | |
add.s64 %rd163, %rd41, %rd55; | |
.loc 1 103 24 // layer_norm.py:103:24 | |
and.b32 %r19, %r4, 31; | |
shr.u32 %r808, %r4, 3; | |
and.b32 %r809, %r808, 12; | |
mov.b32 %r810, global_smem; | |
add.s32 %r1199, %r810, %r809; | |
add.s32 %r1208, %r810, %r673; | |
.loc 1 115 30 // layer_norm.py:115:30 | |
mul.wide.s32 %rd26, %r541, 4; | |
and.b32 %r812, %r4, 127; | |
mul.wide.u32 %rd28, %r812, 16; | |
mul.wide.s32 %rd29, %r538, 4; | |
mul.wide.s32 %rd30, %r537, 4; | |
sub.s32 %r2661, %r3, %r2; | |
mov.b32 %r2790, 0f00000000; | |
setp.eq.s32 %p52, %r4, 0; | |
setp.lt.u32 %p51, %r4, 4; | |
setp.eq.s32 %p50, %r19, 0; | |
mov.b32 %r2791, %r2790; | |
mov.b32 %r2792, %r2790; | |
mov.b32 %r2793, %r2790; | |
mov.b32 %r2794, %r2790; | |
mov.b32 %r2795, %r2790; | |
mov.b32 %r2796, %r2790; | |
mov.b32 %r2797, %r2790; | |
mov.b32 %r2798, %r2790; | |
mov.b32 %r2799, %r2790; | |
mov.b32 %r2800, %r2790; | |
mov.b32 %r2801, %r2790; | |
mov.b32 %r2802, %r2790; | |
mov.b32 %r2803, %r2790; | |
mov.b32 %r2804, %r2790; | |
mov.b32 %r2805, %r2790; | |
mov.b32 %r2806, %r2790; | |
mov.b32 %r2807, %r2790; | |
mov.b32 %r2808, %r2790; | |
mov.b32 %r2809, %r2790; | |
mov.b32 %r2810, %r2790; | |
mov.b32 %r2811, %r2790; | |
mov.b32 %r2812, %r2790; | |
mov.b32 %r2813, %r2790; | |
mov.b32 %r2814, %r2790; | |
mov.b32 %r2815, %r2790; | |
mov.b32 %r2816, %r2790; | |
mov.b32 %r2817, %r2790; | |
mov.b32 %r2818, %r2790; | |
mov.b32 %r2819, %r2790; | |
mov.b32 %r2820, %r2790; | |
mov.b32 %r2821, %r2790; | |
mov.b32 %r2822, %r2790; | |
mov.b32 %r2823, %r2790; | |
mov.b32 %r2824, %r2790; | |
mov.b32 %r2825, %r2790; | |
mov.b32 %r2826, %r2790; | |
mov.b32 %r2827, %r2790; | |
mov.b32 %r2828, %r2790; | |
mov.b32 %r2829, %r2790; | |
mov.b32 %r2830, %r2790; | |
mov.b32 %r2831, %r2790; | |
mov.b32 %r2832, %r2790; | |
mov.b32 %r2833, %r2790; | |
mov.b32 %r2834, %r2790; | |
mov.b32 %r2835, %r2790; | |
mov.b32 %r2836, %r2790; | |
mov.b32 %r2837, %r2790; | |
mov.b32 %r2838, %r2790; | |
mov.b32 %r2839, %r2790; | |
mov.b32 %r2840, %r2790; | |
mov.b32 %r2841, %r2790; | |
mov.b32 %r2842, %r2790; | |
mov.b32 %r2843, %r2790; | |
mov.b32 %r2844, %r2790; | |
mov.b32 %r2845, %r2790; | |
mov.b32 %r2846, %r2790; | |
mov.b32 %r2847, %r2790; | |
mov.b32 %r2848, %r2790; | |
mov.b32 %r2849, %r2790; | |
mov.b32 %r2850, %r2790; | |
mov.b32 %r2851, %r2790; | |
mov.b32 %r2852, %r2790; | |
mov.b32 %r2853, %r2790; | |
mov.b32 %r2854, %r2790; | |
mov.b32 %r2855, %r2790; | |
mov.b32 %r2856, %r2790; | |
mov.b32 %r2857, %r2790; | |
mov.b32 %r2858, %r2790; | |
mov.b32 %r2859, %r2790; | |
mov.b32 %r2860, %r2790; | |
mov.b32 %r2861, %r2790; | |
mov.b32 %r2862, %r2790; | |
mov.b32 %r2863, %r2790; | |
mov.b32 %r2864, %r2790; | |
mov.b32 %r2865, %r2790; | |
mov.b32 %r2866, %r2790; | |
mov.b32 %r2867, %r2790; | |
mov.b32 %r2868, %r2790; | |
mov.b32 %r2869, %r2790; | |
mov.b32 %r2870, %r2790; | |
mov.b32 %r2871, %r2790; | |
mov.b32 %r2872, %r2790; | |
mov.b32 %r2873, %r2790; | |
mov.b32 %r2874, %r2790; | |
mov.b32 %r2875, %r2790; | |
mov.b32 %r2876, %r2790; | |
mov.b32 %r2877, %r2790; | |
mov.b32 %r2878, %r2790; | |
mov.b32 %r2879, %r2790; | |
mov.b32 %r2880, %r2790; | |
mov.b32 %r2881, %r2790; | |
mov.b32 %r2882, %r2790; | |
mov.b32 %r2883, %r2790; | |
mov.b32 %r2884, %r2790; | |
mov.b32 %r2885, %r2790; | |
mov.b32 %r2886, %r2790; | |
mov.b32 %r2887, %r2790; | |
mov.b32 %r2888, %r2790; | |
mov.b32 %r2889, %r2790; | |
mov.b32 %r2890, %r2790; | |
mov.b32 %r2891, %r2790; | |
mov.b32 %r2892, %r2790; | |
mov.b32 %r2893, %r2790; | |
mov.b32 %r2894, %r2790; | |
mov.b32 %r2895, %r2790; | |
mov.b32 %r2896, %r2790; | |
mov.b32 %r2897, %r2790; | |
mov.b32 %r2898, %r2790; | |
mov.b32 %r2899, %r2790; | |
mov.b32 %r2900, %r2790; | |
mov.b32 %r2901, %r2790; | |
mov.b32 %r2902, %r2790; | |
mov.b32 %r2903, %r2790; | |
mov.b32 %r2904, %r2790; | |
mov.b32 %r2905, %r2790; | |
mov.b32 %r2906, %r2790; | |
mov.b32 %r2907, %r2790; | |
mov.b32 %r2908, %r2790; | |
mov.b32 %r2909, %r2790; | |
mov.b32 %r2910, %r2790; | |
mov.b32 %r2911, %r2790; | |
mov.b32 %r2912, %r2790; | |
mov.b32 %r2913, %r2790; | |
mov.b32 %r2914, %r2790; | |
mov.b32 %r2915, %r2790; | |
mov.b32 %r2916, %r2790; | |
mov.b32 %r2917, %r2790; | |
$L__BB0_2: // =>This Inner Loop Header: Depth=1 | |
.loc 1 104 18 // layer_norm.py:104:18 | |
setp.lt.s32 %p17, %r2240, %r542; | |
setp.lt.s32 %p9, %r2241, %r542; | |
setp.lt.s32 %p2, %r2242, %r542; | |
.loc 1 116 28 // layer_norm.py:116:28 | |
add.s64 %rd56, %rd163, %rd28; | |
add.s64 %rd57, %rd56, 2048; | |
add.s64 %rd58, %rd56, 4096; | |
add.s64 %rd59, %rd56, 6144; | |
add.s64 %rd60, %rd56, 8192; | |
add.s64 %rd61, %rd56, 10240; | |
add.s64 %rd62, %rd56, 12288; | |
add.s64 %rd63, %rd163, %rd162; | |
add.s64 %rd64, %rd56, 16384; | |
add.s64 %rd65, %rd56, 18432; | |
add.s64 %rd66, %rd56, 20480; | |
add.s64 %rd67, %rd56, 22528; | |
add.s64 %rd68, %rd56, 24576; | |
add.s64 %rd69, %rd56, 26624; | |
add.s64 %rd70, %rd56, 28672; | |
.loc 1 116 20 // layer_norm.py:116:20 | |
add.s64 %rd71, %rd163, %rd161; | |
mov.b32 %r817, 0; | |
// begin inline asm | |
mov.u32 %r2370, %r817; | |
mov.u32 %r2371, %r817; | |
mov.u32 %r2369, %r817; | |
mov.u32 %r2368, %r817; | |
@%p2 ld.global.v4.b32 { %r2370, %r2371, %r2369, %r2368 }, [ %rd56 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r2367, %r817; | |
mov.u32 %r2366, %r817; | |
mov.u32 %r2365, %r817; | |
mov.u32 %r2364, %r817; | |
@%p74 ld.global.v4.b32 { %r2367, %r2366, %r2365, %r2364 }, [ %rd57 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r2363, %r817; | |
mov.u32 %r2362, %r817; | |
mov.u32 %r2361, %r817; | |
mov.u32 %r2360, %r817; | |
@%p75 ld.global.v4.b32 { %r2363, %r2362, %r2361, %r2360 }, [ %rd58 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r2359, %r817; | |
mov.u32 %r2358, %r817; | |
mov.u32 %r2357, %r817; | |
mov.u32 %r2356, %r817; | |
@%p76 ld.global.v4.b32 { %r2359, %r2358, %r2357, %r2356 }, [ %rd59 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r2355, %r817; | |
mov.u32 %r2354, %r817; | |
mov.u32 %r2353, %r817; | |
mov.u32 %r2352, %r817; | |
@%p77 ld.global.v4.b32 { %r2355, %r2354, %r2353, %r2352 }, [ %rd60 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r2351, %r817; | |
mov.u32 %r2350, %r817; | |
mov.u32 %r2349, %r817; | |
mov.u32 %r2348, %r817; | |
@%p78 ld.global.v4.b32 { %r2351, %r2350, %r2349, %r2348 }, [ %rd61 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r2347, %r817; | |
mov.u32 %r2346, %r817; | |
mov.u32 %r2345, %r817; | |
mov.u32 %r2344, %r817; | |
@%p79 ld.global.v4.b32 { %r2347, %r2346, %r2345, %r2344 }, [ %rd62 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r2343, %r817; | |
mov.u32 %r2342, %r817; | |
mov.u32 %r2341, %r817; | |
mov.u32 %r2340, %r817; | |
@%p9 ld.global.v4.b32 { %r2343, %r2342, %r2341, %r2340 }, [ %rd63 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r2339, %r817; | |
mov.u32 %r2338, %r817; | |
mov.u32 %r2337, %r817; | |
mov.u32 %r2336, %r817; | |
@%p81 ld.global.v4.b32 { %r2339, %r2338, %r2337, %r2336 }, [ %rd64 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r2335, %r817; | |
mov.u32 %r2334, %r817; | |
mov.u32 %r2333, %r817; | |
mov.u32 %r2332, %r817; | |
@%p82 ld.global.v4.b32 { %r2335, %r2334, %r2333, %r2332 }, [ %rd65 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r2331, %r817; | |
mov.u32 %r2330, %r817; | |
mov.u32 %r2329, %r817; | |
mov.u32 %r2328, %r817; | |
@%p83 ld.global.v4.b32 { %r2331, %r2330, %r2329, %r2328 }, [ %rd66 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r2327, %r817; | |
mov.u32 %r2326, %r817; | |
mov.u32 %r2325, %r817; | |
mov.u32 %r2324, %r817; | |
@%p84 ld.global.v4.b32 { %r2327, %r2326, %r2325, %r2324 }, [ %rd67 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r2323, %r817; | |
mov.u32 %r2322, %r817; | |
mov.u32 %r2321, %r817; | |
mov.u32 %r2320, %r817; | |
@%p85 ld.global.v4.b32 { %r2323, %r2322, %r2321, %r2320 }, [ %rd68 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r2319, %r817; | |
mov.u32 %r2318, %r817; | |
mov.u32 %r2317, %r817; | |
mov.u32 %r2316, %r817; | |
@%p86 ld.global.v4.b32 { %r2319, %r2318, %r2317, %r2316 }, [ %rd69 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r2315, %r817; | |
mov.u32 %r2314, %r817; | |
mov.u32 %r2313, %r817; | |
mov.u32 %r2312, %r817; | |
@%p87 ld.global.v4.b32 { %r2315, %r2314, %r2313, %r2312 }, [ %rd70 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r2311, %r817; | |
mov.u32 %r2310, %r817; | |
mov.u32 %r2309, %r817; | |
mov.u32 %r2308, %r817; | |
@%p17 ld.global.v4.b32 { %r2311, %r2310, %r2309, %r2308 }, [ %rd71 + 0 ]; | |
// end inline asm | |
.loc 1 117 20 // layer_norm.py:117:20 | |
// begin inline asm | |
mov.u32 %r2243, %r817; | |
mov.u32 %r2244, %r817; | |
mov.u32 %r2245, %r817; | |
mov.u32 %r2246, %r817; | |
@%p2 ld.global.v4.b32 { %r2243, %r2244, %r2245, %r2246 }, [ %rd72 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r2247, %r817; | |
mov.u32 %r2248, %r817; | |
mov.u32 %r2249, %r817; | |
mov.u32 %r2250, %r817; | |
@%p74 ld.global.v4.b32 { %r2247, %r2248, %r2249, %r2250 }, [ %rd73 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r2251, %r817; | |
mov.u32 %r2252, %r817; | |
mov.u32 %r2253, %r817; | |
mov.u32 %r2254, %r817; | |
@%p75 ld.global.v4.b32 { %r2251, %r2252, %r2253, %r2254 }, [ %rd74 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r2255, %r817; | |
mov.u32 %r2256, %r817; | |
mov.u32 %r2257, %r817; | |
mov.u32 %r2258, %r817; | |
@%p76 ld.global.v4.b32 { %r2255, %r2256, %r2257, %r2258 }, [ %rd75 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r2259, %r817; | |
mov.u32 %r2260, %r817; | |
mov.u32 %r2261, %r817; | |
mov.u32 %r2262, %r817; | |
@%p77 ld.global.v4.b32 { %r2259, %r2260, %r2261, %r2262 }, [ %rd76 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r2263, %r817; | |
mov.u32 %r2264, %r817; | |
mov.u32 %r2265, %r817; | |
mov.u32 %r2266, %r817; | |
@%p78 ld.global.v4.b32 { %r2263, %r2264, %r2265, %r2266 }, [ %rd77 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r2267, %r817; | |
mov.u32 %r2268, %r817; | |
mov.u32 %r2269, %r817; | |
mov.u32 %r2270, %r817; | |
@%p79 ld.global.v4.b32 { %r2267, %r2268, %r2269, %r2270 }, [ %rd78 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r2271, %r817; | |
mov.u32 %r2272, %r817; | |
mov.u32 %r2273, %r817; | |
mov.u32 %r2274, %r817; | |
@%p9 ld.global.v4.b32 { %r2271, %r2272, %r2273, %r2274 }, [ %rd79 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r2275, %r817; | |
mov.u32 %r2276, %r817; | |
mov.u32 %r2277, %r817; | |
mov.u32 %r2278, %r817; | |
@%p81 ld.global.v4.b32 { %r2275, %r2276, %r2277, %r2278 }, [ %rd80 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r2279, %r817; | |
mov.u32 %r2280, %r817; | |
mov.u32 %r2281, %r817; | |
mov.u32 %r2282, %r817; | |
@%p82 ld.global.v4.b32 { %r2279, %r2280, %r2281, %r2282 }, [ %rd81 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r2283, %r817; | |
mov.u32 %r2284, %r817; | |
mov.u32 %r2285, %r817; | |
mov.u32 %r2286, %r817; | |
@%p83 ld.global.v4.b32 { %r2283, %r2284, %r2285, %r2286 }, [ %rd82 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r2287, %r817; | |
mov.u32 %r2288, %r817; | |
mov.u32 %r2289, %r817; | |
mov.u32 %r2290, %r817; | |
@%p84 ld.global.v4.b32 { %r2287, %r2288, %r2289, %r2290 }, [ %rd83 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r2291, %r817; | |
mov.u32 %r2292, %r817; | |
mov.u32 %r2293, %r817; | |
mov.u32 %r2294, %r817; | |
@%p85 ld.global.v4.b32 { %r2291, %r2292, %r2293, %r2294 }, [ %rd84 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r2295, %r817; | |
mov.u32 %r2296, %r817; | |
mov.u32 %r2297, %r817; | |
mov.u32 %r2298, %r817; | |
@%p86 ld.global.v4.b32 { %r2295, %r2296, %r2297, %r2298 }, [ %rd85 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r2299, %r817; | |
mov.u32 %r2300, %r817; | |
mov.u32 %r2301, %r817; | |
mov.u32 %r2302, %r817; | |
@%p87 ld.global.v4.b32 { %r2299, %r2300, %r2301, %r2302 }, [ %rd86 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r2303, %r817; | |
mov.u32 %r2304, %r817; | |
mov.u32 %r2305, %r817; | |
mov.u32 %r2306, %r817; | |
@%p17 ld.global.v4.b32 { %r2303, %r2304, %r2305, %r2306 }, [ %rd87 + 0 ]; | |
// end inline asm | |
.loc 1 118 30 // layer_norm.py:118:30 | |
add.s64 %rd88, %rd165, %rd28; | |
add.s64 %rd89, %rd88, 2048; | |
add.s64 %rd90, %rd88, 4096; | |
add.s64 %rd91, %rd88, 6144; | |
add.s64 %rd92, %rd88, 8192; | |
add.s64 %rd93, %rd88, 10240; | |
add.s64 %rd94, %rd88, 12288; | |
add.s64 %rd95, %rd165, %rd162; | |
add.s64 %rd96, %rd88, 16384; | |
add.s64 %rd97, %rd88, 18432; | |
add.s64 %rd98, %rd88, 20480; | |
add.s64 %rd99, %rd88, 22528; | |
add.s64 %rd100, %rd88, 24576; | |
add.s64 %rd101, %rd88, 26624; | |
add.s64 %rd102, %rd88, 28672; | |
.loc 1 118 21 // layer_norm.py:118:21 | |
add.s64 %rd103, %rd165, %rd161; | |
// begin inline asm | |
mov.u32 %r2373, %r817; | |
mov.u32 %r2374, %r817; | |
mov.u32 %r2375, %r817; | |
mov.u32 %r2376, %r817; | |
@%p2 ld.global.v4.b32 { %r2373, %r2374, %r2375, %r2376 }, [ %rd88 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r2377, %r817; | |
mov.u32 %r2378, %r817; | |
mov.u32 %r2379, %r817; | |
mov.u32 %r2380, %r817; | |
@%p74 ld.global.v4.b32 { %r2377, %r2378, %r2379, %r2380 }, [ %rd89 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r2381, %r817; | |
mov.u32 %r2382, %r817; | |
mov.u32 %r2383, %r817; | |
mov.u32 %r2384, %r817; | |
@%p75 ld.global.v4.b32 { %r2381, %r2382, %r2383, %r2384 }, [ %rd90 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r2385, %r817; | |
mov.u32 %r2386, %r817; | |
mov.u32 %r2387, %r817; | |
mov.u32 %r2388, %r817; | |
@%p76 ld.global.v4.b32 { %r2385, %r2386, %r2387, %r2388 }, [ %rd91 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r2389, %r817; | |
mov.u32 %r2390, %r817; | |
mov.u32 %r2391, %r817; | |
mov.u32 %r2392, %r817; | |
@%p77 ld.global.v4.b32 { %r2389, %r2390, %r2391, %r2392 }, [ %rd92 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r2393, %r817; | |
mov.u32 %r2394, %r817; | |
mov.u32 %r2395, %r817; | |
mov.u32 %r2396, %r817; | |
@%p78 ld.global.v4.b32 { %r2393, %r2394, %r2395, %r2396 }, [ %rd93 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r2397, %r817; | |
mov.u32 %r2398, %r817; | |
mov.u32 %r2399, %r817; | |
mov.u32 %r2400, %r817; | |
@%p79 ld.global.v4.b32 { %r2397, %r2398, %r2399, %r2400 }, [ %rd94 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r2401, %r817; | |
mov.u32 %r2402, %r817; | |
mov.u32 %r2403, %r817; | |
mov.u32 %r2404, %r817; | |
@%p9 ld.global.v4.b32 { %r2401, %r2402, %r2403, %r2404 }, [ %rd95 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r2405, %r817; | |
mov.u32 %r2406, %r817; | |
mov.u32 %r2407, %r817; | |
mov.u32 %r2408, %r817; | |
@%p81 ld.global.v4.b32 { %r2405, %r2406, %r2407, %r2408 }, [ %rd96 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r2409, %r817; | |
mov.u32 %r2410, %r817; | |
mov.u32 %r2411, %r817; | |
mov.u32 %r2412, %r817; | |
@%p82 ld.global.v4.b32 { %r2409, %r2410, %r2411, %r2412 }, [ %rd97 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r2413, %r817; | |
mov.u32 %r2414, %r817; | |
mov.u32 %r2415, %r817; | |
mov.u32 %r2416, %r817; | |
@%p83 ld.global.v4.b32 { %r2413, %r2414, %r2415, %r2416 }, [ %rd98 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r2417, %r817; | |
mov.u32 %r2418, %r817; | |
mov.u32 %r2419, %r817; | |
mov.u32 %r2420, %r817; | |
@%p84 ld.global.v4.b32 { %r2417, %r2418, %r2419, %r2420 }, [ %rd99 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r2421, %r817; | |
mov.u32 %r2422, %r817; | |
mov.u32 %r2423, %r817; | |
mov.u32 %r2424, %r817; | |
@%p85 ld.global.v4.b32 { %r2421, %r2422, %r2423, %r2424 }, [ %rd100 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r2425, %r817; | |
mov.u32 %r2426, %r817; | |
mov.u32 %r2427, %r817; | |
mov.u32 %r2428, %r817; | |
@%p86 ld.global.v4.b32 { %r2425, %r2426, %r2427, %r2428 }, [ %rd101 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r2429, %r817; | |
mov.u32 %r2430, %r817; | |
mov.u32 %r2431, %r817; | |
mov.u32 %r2432, %r817; | |
@%p87 ld.global.v4.b32 { %r2429, %r2430, %r2431, %r2432 }, [ %rd102 + 0 ]; | |
// end inline asm | |
// begin inline asm | |
mov.u32 %r2433, %r817; | |
mov.u32 %r2434, %r817; | |
mov.u32 %r2435, %r817; | |
mov.u32 %r2436, %r817; | |
@%p17 ld.global.v4.b32 { %r2433, %r2434, %r2435, %r2436 }, [ %rd103 + 0 ]; | |
// end inline asm | |
.loc 1 119 23 // layer_norm.py:119:23 | |
// begin inline asm | |
mov.u32 %r2372, 0x0; | |
ld.global.b32 { %r2372 }, [ %rd167 + 0 ]; | |
// end inline asm | |
.loc 1 120 23 // layer_norm.py:120:23 | |
// begin inline asm | |
mov.u32 %r2307, 0x0; | |
ld.global.b32 { %r2307 }, [ %rd166 + 0 ]; | |
// end inline asm | |
$L__tmp1: | |
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:124:20 ] | |
bar.sync 0; | |
$L__tmp2: | |
.loc 1 127 26 // layer_norm.py:127:26 | |
add.s64 %rd106, %rd164, %rd28; | |
add.s64 %rd107, %rd106, 2048; | |
add.s64 %rd108, %rd106, 4096; | |
add.s64 %rd109, %rd106, 6144; | |
add.s64 %rd110, %rd106, 8192; | |
add.s64 %rd111, %rd106, 10240; | |
add.s64 %rd112, %rd106, 12288; | |
add.s64 %rd113, %rd164, %rd162; | |
add.s64 %rd114, %rd106, 16384; | |
add.s64 %rd115, %rd106, 18432; | |
add.s64 %rd116, %rd106, 20480; | |
add.s64 %rd117, %rd106, 22528; | |
add.s64 %rd118, %rd106, 24576; | |
add.s64 %rd119, %rd106, 26624; | |
add.s64 %rd120, %rd106, 28672; | |
.loc 1 116 20 // layer_norm.py:116:20 | |
add.s64 %rd121, %rd164, %rd161; | |
.loc 1 122 21 // layer_norm.py:122:21 | |
sub.f32 %r1408, %r2371, %r2372; | |
sub.f32 %r1409, %r2370, %r2372; | |
sub.f32 %r1410, %r2369, %r2372; | |
sub.f32 %r1411, %r2368, %r2372; | |
sub.f32 %r1412, %r2367, %r2372; | |
sub.f32 %r1413, %r2366, %r2372; | |
sub.f32 %r1414, %r2365, %r2372; | |
sub.f32 %r1415, %r2364, %r2372; | |
sub.f32 %r1416, %r2363, %r2372; | |
sub.f32 %r1417, %r2362, %r2372; | |
sub.f32 %r1418, %r2361, %r2372; | |
sub.f32 %r1419, %r2360, %r2372; | |
sub.f32 %r1420, %r2359, %r2372; | |
sub.f32 %r1421, %r2358, %r2372; | |
sub.f32 %r1422, %r2357, %r2372; | |
sub.f32 %r1423, %r2356, %r2372; | |
sub.f32 %r1424, %r2355, %r2372; | |
sub.f32 %r1425, %r2354, %r2372; | |
sub.f32 %r1426, %r2353, %r2372; | |
sub.f32 %r1427, %r2352, %r2372; | |
sub.f32 %r1428, %r2351, %r2372; | |
sub.f32 %r1429, %r2350, %r2372; | |
sub.f32 %r1430, %r2349, %r2372; | |
sub.f32 %r1431, %r2348, %r2372; | |
sub.f32 %r1432, %r2347, %r2372; | |
sub.f32 %r1433, %r2346, %r2372; | |
sub.f32 %r1434, %r2345, %r2372; | |
sub.f32 %r1435, %r2344, %r2372; | |
sub.f32 %r1436, %r2343, %r2372; | |
sub.f32 %r1437, %r2342, %r2372; | |
sub.f32 %r1438, %r2341, %r2372; | |
sub.f32 %r1439, %r2340, %r2372; | |
sub.f32 %r1440, %r2339, %r2372; | |
sub.f32 %r1441, %r2338, %r2372; | |
sub.f32 %r1442, %r2337, %r2372; | |
sub.f32 %r1443, %r2336, %r2372; | |
sub.f32 %r1444, %r2335, %r2372; | |
sub.f32 %r1445, %r2334, %r2372; | |
sub.f32 %r1446, %r2333, %r2372; | |
sub.f32 %r1447, %r2332, %r2372; | |
sub.f32 %r1448, %r2331, %r2372; | |
sub.f32 %r1449, %r2330, %r2372; | |
sub.f32 %r1450, %r2329, %r2372; | |
sub.f32 %r1451, %r2328, %r2372; | |
sub.f32 %r1452, %r2327, %r2372; | |
sub.f32 %r1453, %r2326, %r2372; | |
sub.f32 %r1454, %r2325, %r2372; | |
sub.f32 %r1455, %r2324, %r2372; | |
sub.f32 %r1456, %r2323, %r2372; | |
sub.f32 %r1457, %r2322, %r2372; | |
sub.f32 %r1458, %r2321, %r2372; | |
sub.f32 %r1459, %r2320, %r2372; | |
sub.f32 %r1460, %r2319, %r2372; | |
sub.f32 %r1461, %r2318, %r2372; | |
sub.f32 %r1462, %r2317, %r2372; | |
sub.f32 %r1463, %r2316, %r2372; | |
sub.f32 %r1464, %r2315, %r2372; | |
sub.f32 %r1465, %r2314, %r2372; | |
sub.f32 %r1466, %r2313, %r2372; | |
sub.f32 %r1467, %r2312, %r2372; | |
sub.f32 %r1468, %r2311, %r2372; | |
sub.f32 %r1469, %r2310, %r2372; | |
sub.f32 %r1470, %r2309, %r2372; | |
sub.f32 %r1471, %r2308, %r2372; | |
.loc 1 122 29 // layer_norm.py:122:29 | |
mul.f32 %r1472, %r1471, %r2307; | |
mul.f32 %r1473, %r1470, %r2307; | |
mul.f32 %r1474, %r1469, %r2307; | |
mul.f32 %r1475, %r1468, %r2307; | |
mul.f32 %r1476, %r1467, %r2307; | |
mul.f32 %r1477, %r1466, %r2307; | |
mul.f32 %r1478, %r1465, %r2307; | |
mul.f32 %r1479, %r1464, %r2307; | |
mul.f32 %r1480, %r1463, %r2307; | |
mul.f32 %r1481, %r1462, %r2307; | |
mul.f32 %r1482, %r1461, %r2307; | |
mul.f32 %r1483, %r1460, %r2307; | |
mul.f32 %r1484, %r1459, %r2307; | |
mul.f32 %r1485, %r1458, %r2307; | |
mul.f32 %r1486, %r1457, %r2307; | |
mul.f32 %r1487, %r1456, %r2307; | |
mul.f32 %r1488, %r1455, %r2307; | |
mul.f32 %r1489, %r1454, %r2307; | |
mul.f32 %r1490, %r1453, %r2307; | |
mul.f32 %r1491, %r1452, %r2307; | |
mul.f32 %r1492, %r1451, %r2307; | |
mul.f32 %r1493, %r1450, %r2307; | |
mul.f32 %r1494, %r1449, %r2307; | |
mul.f32 %r1495, %r1448, %r2307; | |
mul.f32 %r1496, %r1447, %r2307; | |
mul.f32 %r1497, %r1446, %r2307; | |
mul.f32 %r1498, %r1445, %r2307; | |
mul.f32 %r1499, %r1444, %r2307; | |
mul.f32 %r1500, %r1443, %r2307; | |
mul.f32 %r1501, %r1442, %r2307; | |
mul.f32 %r1502, %r1441, %r2307; | |
mul.f32 %r1503, %r1440, %r2307; | |
mul.f32 %r1504, %r1439, %r2307; | |
mul.f32 %r1505, %r1438, %r2307; | |
mul.f32 %r1506, %r1437, %r2307; | |
mul.f32 %r1507, %r1436, %r2307; | |
mul.f32 %r1508, %r1435, %r2307; | |
mul.f32 %r1509, %r1434, %r2307; | |
mul.f32 %r1510, %r1433, %r2307; | |
mul.f32 %r1511, %r1432, %r2307; | |
mul.f32 %r1512, %r1431, %r2307; | |
mul.f32 %r1513, %r1430, %r2307; | |
mul.f32 %r1514, %r1429, %r2307; | |
mul.f32 %r1515, %r1428, %r2307; | |
mul.f32 %r1516, %r1427, %r2307; | |
mul.f32 %r1517, %r1426, %r2307; | |
mul.f32 %r1518, %r1425, %r2307; | |
mul.f32 %r1519, %r1424, %r2307; | |
mul.f32 %r1520, %r1423, %r2307; | |
mul.f32 %r1521, %r1422, %r2307; | |
mul.f32 %r1522, %r1421, %r2307; | |
mul.f32 %r1523, %r1420, %r2307; | |
mul.f32 %r1524, %r1419, %r2307; | |
mul.f32 %r1525, %r1418, %r2307; | |
mul.f32 %r1526, %r1417, %r2307; | |
mul.f32 %r1527, %r1416, %r2307; | |
mul.f32 %r1528, %r1415, %r2307; | |
mul.f32 %r1529, %r1414, %r2307; | |
mul.f32 %r1530, %r1413, %r2307; | |
mul.f32 %r1531, %r1412, %r2307; | |
mul.f32 %r1532, %r1411, %r2307; | |
mul.f32 %r1533, %r1410, %r2307; | |
mul.f32 %r1534, %r1409, %r2307; | |
mul.f32 %r1535, %r1408, %r2307; | |
.loc 1 123 18 // layer_norm.py:123:18 | |
mul.f32 %r1537, %r2243, %r2373; | |
mul.f32 %r1539, %r2244, %r2374; | |
mul.f32 %r1541, %r2245, %r2375; | |
mul.f32 %r1543, %r2246, %r2376; | |
mul.f32 %r1545, %r2247, %r2377; | |
mul.f32 %r1547, %r2248, %r2378; | |
mul.f32 %r1549, %r2249, %r2379; | |
mul.f32 %r1551, %r2250, %r2380; | |
mul.f32 %r1553, %r2251, %r2381; | |
mul.f32 %r1555, %r2252, %r2382; | |
mul.f32 %r1557, %r2253, %r2383; | |
mul.f32 %r1559, %r2254, %r2384; | |
mul.f32 %r1561, %r2255, %r2385; | |
mul.f32 %r1563, %r2256, %r2386; | |
mul.f32 %r1565, %r2257, %r2387; | |
mul.f32 %r1567, %r2258, %r2388; | |
mul.f32 %r1569, %r2259, %r2389; | |
mul.f32 %r1571, %r2260, %r2390; | |
mul.f32 %r1573, %r2261, %r2391; | |
mul.f32 %r1575, %r2262, %r2392; | |
mul.f32 %r1577, %r2263, %r2393; | |
mul.f32 %r1579, %r2264, %r2394; | |
mul.f32 %r1581, %r2265, %r2395; | |
mul.f32 %r1583, %r2266, %r2396; | |
mul.f32 %r1585, %r2267, %r2397; | |
mul.f32 %r1587, %r2268, %r2398; | |
mul.f32 %r1589, %r2269, %r2399; | |
mul.f32 %r1591, %r2270, %r2400; | |
mul.f32 %r1593, %r2271, %r2401; | |
mul.f32 %r1595, %r2272, %r2402; | |
mul.f32 %r1597, %r2273, %r2403; | |
mul.f32 %r1599, %r2274, %r2404; | |
mul.f32 %r1601, %r2275, %r2405; | |
mul.f32 %r1603, %r2276, %r2406; | |
mul.f32 %r1605, %r2277, %r2407; | |
mul.f32 %r1607, %r2278, %r2408; | |
mul.f32 %r1609, %r2279, %r2409; | |
mul.f32 %r1611, %r2280, %r2410; | |
mul.f32 %r1613, %r2281, %r2411; | |
mul.f32 %r1615, %r2282, %r2412; | |
mul.f32 %r1617, %r2283, %r2413; | |
mul.f32 %r1619, %r2284, %r2414; | |
mul.f32 %r1621, %r2285, %r2415; | |
mul.f32 %r1623, %r2286, %r2416; | |
mul.f32 %r1625, %r2287, %r2417; | |
mul.f32 %r1627, %r2288, %r2418; | |
mul.f32 %r1629, %r2289, %r2419; | |
mul.f32 %r1631, %r2290, %r2420; | |
mul.f32 %r1633, %r2291, %r2421; | |
mul.f32 %r1635, %r2292, %r2422; | |
mul.f32 %r1637, %r2293, %r2423; | |
mul.f32 %r1639, %r2294, %r2424; | |
mul.f32 %r1641, %r2295, %r2425; | |
mul.f32 %r1643, %r2296, %r2426; | |
mul.f32 %r1645, %r2297, %r2427; | |
mul.f32 %r1647, %r2298, %r2428; | |
mul.f32 %r1649, %r2299, %r2429; | |
mul.f32 %r1651, %r2300, %r2430; | |
mul.f32 %r1653, %r2301, %r2431; | |
mul.f32 %r1655, %r2302, %r2432; | |
mul.f32 %r1657, %r2303, %r2433; | |
mul.f32 %r1659, %r2304, %r2434; | |
mul.f32 %r1661, %r2305, %r2435; | |
mul.f32 %r1663, %r2306, %r2436; | |
.loc 1 124 28 // layer_norm.py:124:28 | |
mul.f32 %r1664, %r1539, %r1535; | |
$L__tmp3: | |
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:124:20 ] | |
fma.rn.f32 %r1665, %r1537, %r1534, %r1664; | |
fma.rn.f32 %r1666, %r1541, %r1533, %r1665; | |
fma.rn.f32 %r1667, %r1543, %r1532, %r1666; | |
fma.rn.f32 %r1668, %r1545, %r1531, %r1667; | |
fma.rn.f32 %r1669, %r1547, %r1530, %r1668; | |
fma.rn.f32 %r1670, %r1549, %r1529, %r1669; | |
fma.rn.f32 %r1671, %r1551, %r1528, %r1670; | |
fma.rn.f32 %r1672, %r1553, %r1527, %r1671; | |
fma.rn.f32 %r1673, %r1555, %r1526, %r1672; | |
fma.rn.f32 %r1674, %r1557, %r1525, %r1673; | |
fma.rn.f32 %r1675, %r1559, %r1524, %r1674; | |
fma.rn.f32 %r1676, %r1561, %r1523, %r1675; | |
fma.rn.f32 %r1677, %r1563, %r1522, %r1676; | |
fma.rn.f32 %r1678, %r1565, %r1521, %r1677; | |
fma.rn.f32 %r1679, %r1567, %r1520, %r1678; | |
fma.rn.f32 %r1680, %r1569, %r1519, %r1679; | |
fma.rn.f32 %r1681, %r1571, %r1518, %r1680; | |
fma.rn.f32 %r1682, %r1573, %r1517, %r1681; | |
fma.rn.f32 %r1683, %r1575, %r1516, %r1682; | |
fma.rn.f32 %r1684, %r1577, %r1515, %r1683; | |
fma.rn.f32 %r1685, %r1579, %r1514, %r1684; | |
fma.rn.f32 %r1686, %r1581, %r1513, %r1685; | |
fma.rn.f32 %r1687, %r1583, %r1512, %r1686; | |
fma.rn.f32 %r1688, %r1585, %r1511, %r1687; | |
fma.rn.f32 %r1689, %r1587, %r1510, %r1688; | |
fma.rn.f32 %r1690, %r1589, %r1509, %r1689; | |
fma.rn.f32 %r1691, %r1591, %r1508, %r1690; | |
fma.rn.f32 %r1692, %r1593, %r1507, %r1691; | |
fma.rn.f32 %r1693, %r1595, %r1506, %r1692; | |
fma.rn.f32 %r1694, %r1597, %r1505, %r1693; | |
fma.rn.f32 %r1695, %r1599, %r1504, %r1694; | |
fma.rn.f32 %r1696, %r1601, %r1503, %r1695; | |
fma.rn.f32 %r1697, %r1603, %r1502, %r1696; | |
fma.rn.f32 %r1698, %r1605, %r1501, %r1697; | |
fma.rn.f32 %r1699, %r1607, %r1500, %r1698; | |
fma.rn.f32 %r1700, %r1609, %r1499, %r1699; | |
fma.rn.f32 %r1701, %r1611, %r1498, %r1700; | |
fma.rn.f32 %r1702, %r1613, %r1497, %r1701; | |
fma.rn.f32 %r1703, %r1615, %r1496, %r1702; | |
fma.rn.f32 %r1704, %r1617, %r1495, %r1703; | |
fma.rn.f32 %r1705, %r1619, %r1494, %r1704; | |
fma.rn.f32 %r1706, %r1621, %r1493, %r1705; | |
fma.rn.f32 %r1707, %r1623, %r1492, %r1706; | |
fma.rn.f32 %r1708, %r1625, %r1491, %r1707; | |
fma.rn.f32 %r1709, %r1627, %r1490, %r1708; | |
fma.rn.f32 %r1710, %r1629, %r1489, %r1709; | |
fma.rn.f32 %r1711, %r1631, %r1488, %r1710; | |
fma.rn.f32 %r1712, %r1633, %r1487, %r1711; | |
fma.rn.f32 %r1713, %r1635, %r1486, %r1712; | |
fma.rn.f32 %r1714, %r1637, %r1485, %r1713; | |
fma.rn.f32 %r1715, %r1639, %r1484, %r1714; | |
fma.rn.f32 %r1716, %r1641, %r1483, %r1715; | |
fma.rn.f32 %r1717, %r1643, %r1482, %r1716; | |
fma.rn.f32 %r1718, %r1645, %r1481, %r1717; | |
fma.rn.f32 %r1719, %r1647, %r1480, %r1718; | |
fma.rn.f32 %r1720, %r1649, %r1479, %r1719; | |
fma.rn.f32 %r1721, %r1651, %r1478, %r1720; | |
fma.rn.f32 %r1722, %r1653, %r1477, %r1721; | |
fma.rn.f32 %r1723, %r1655, %r1476, %r1722; | |
fma.rn.f32 %r1724, %r1657, %r1475, %r1723; | |
fma.rn.f32 %r1725, %r1659, %r1474, %r1724; | |
fma.rn.f32 %r1726, %r1661, %r1473, %r1725; | |
fma.rn.f32 %r2437, %r1663, %r1472, %r1726; | |
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:124:20 ] | |
shfl.sync.bfly.b32 %r2438, %r2437, 16, 31, -1; | |
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:124:20 ] | |
add.f32 %r2439, %r2437, %r2438; | |
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:124:20 ] | |
shfl.sync.bfly.b32 %r2440, %r2439, 8, 31, -1; | |
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:124:20 ] | |
add.f32 %r2441, %r2439, %r2440; | |
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:124:20 ] | |
shfl.sync.bfly.b32 %r2442, %r2441, 4, 31, -1; | |
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:124:20 ] | |
add.f32 %r2443, %r2441, %r2442; | |
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:124:20 ] | |
shfl.sync.bfly.b32 %r2444, %r2443, 2, 31, -1; | |
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:124:20 ] | |
add.f32 %r2445, %r2443, %r2444; | |
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:124:20 ] | |
shfl.sync.bfly.b32 %r2446, %r2445, 1, 31, -1; | |
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:124:20 ] | |
add.f32 %r2447, %r2445, %r2446; | |
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:124:20 ] | |
// begin inline asm | |
@%p50 st.shared.b32 [ %r1199 + 0 ], %r2447; | |
// end inline asm | |
bar.sync 0; | |
// begin inline asm | |
@%p51 ld.shared.b32 %r2448, [ %r1208 + 0 ]; | |
// end inline asm | |
shfl.sync.bfly.b32 %r2449, %r2448, 2, 31, -1; | |
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:124:20 ] | |
add.f32 %r2450, %r2448, %r2449; | |
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:124:20 ] | |
shfl.sync.bfly.b32 %r2451, %r2450, 1, 31, -1; | |
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:124:20 ] | |
add.f32 %r2452, %r2450, %r2451; | |
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:124:20 ] | |
// begin inline asm | |
@%p52 st.shared.b32 [ %r1208 + 0 ], %r2452; | |
// end inline asm | |
bar.sync 0; | |
ld.shared.b32 %r1756, [global_smem]; | |
$L__tmp4: | |
.loc 1 124 43 // layer_norm.py:124:43 | |
div.full.f32 %r1757, %r1756, %r18; | |
$L__tmp5: | |
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:125:20 ] | |
bar.sync 0; | |
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:125:20 ] | |
fma.rn.f32 %r1758, %r2243, %r2373, %r1539; | |
fma.rn.f32 %r1759, %r2245, %r2375, %r1758; | |
fma.rn.f32 %r1760, %r2246, %r2376, %r1759; | |
fma.rn.f32 %r1761, %r2247, %r2377, %r1760; | |
fma.rn.f32 %r1762, %r2248, %r2378, %r1761; | |
fma.rn.f32 %r1763, %r2249, %r2379, %r1762; | |
fma.rn.f32 %r1764, %r2250, %r2380, %r1763; | |
fma.rn.f32 %r1765, %r2251, %r2381, %r1764; | |
fma.rn.f32 %r1766, %r2252, %r2382, %r1765; | |
fma.rn.f32 %r1767, %r2253, %r2383, %r1766; | |
fma.rn.f32 %r1768, %r2254, %r2384, %r1767; | |
fma.rn.f32 %r1769, %r2255, %r2385, %r1768; | |
fma.rn.f32 %r1770, %r2256, %r2386, %r1769; | |
fma.rn.f32 %r1771, %r2257, %r2387, %r1770; | |
fma.rn.f32 %r1772, %r2258, %r2388, %r1771; | |
fma.rn.f32 %r1773, %r2259, %r2389, %r1772; | |
fma.rn.f32 %r1774, %r2260, %r2390, %r1773; | |
fma.rn.f32 %r1775, %r2261, %r2391, %r1774; | |
fma.rn.f32 %r1776, %r2262, %r2392, %r1775; | |
fma.rn.f32 %r1777, %r2263, %r2393, %r1776; | |
fma.rn.f32 %r1778, %r2264, %r2394, %r1777; | |
fma.rn.f32 %r1779, %r2265, %r2395, %r1778; | |
fma.rn.f32 %r1780, %r2266, %r2396, %r1779; | |
fma.rn.f32 %r1781, %r2267, %r2397, %r1780; | |
fma.rn.f32 %r1782, %r2268, %r2398, %r1781; | |
fma.rn.f32 %r1783, %r2269, %r2399, %r1782; | |
fma.rn.f32 %r1784, %r2270, %r2400, %r1783; | |
fma.rn.f32 %r1785, %r2271, %r2401, %r1784; | |
fma.rn.f32 %r1786, %r2272, %r2402, %r1785; | |
fma.rn.f32 %r1787, %r2273, %r2403, %r1786; | |
fma.rn.f32 %r1788, %r2274, %r2404, %r1787; | |
fma.rn.f32 %r1789, %r2275, %r2405, %r1788; | |
fma.rn.f32 %r1790, %r2276, %r2406, %r1789; | |
fma.rn.f32 %r1791, %r2277, %r2407, %r1790; | |
fma.rn.f32 %r1792, %r2278, %r2408, %r1791; | |
fma.rn.f32 %r1793, %r2279, %r2409, %r1792; | |
fma.rn.f32 %r1794, %r2280, %r2410, %r1793; | |
fma.rn.f32 %r1795, %r2281, %r2411, %r1794; | |
fma.rn.f32 %r1796, %r2282, %r2412, %r1795; | |
fma.rn.f32 %r1797, %r2283, %r2413, %r1796; | |
fma.rn.f32 %r1798, %r2284, %r2414, %r1797; | |
fma.rn.f32 %r1799, %r2285, %r2415, %r1798; | |
fma.rn.f32 %r1800, %r2286, %r2416, %r1799; | |
fma.rn.f32 %r1801, %r2287, %r2417, %r1800; | |
fma.rn.f32 %r1802, %r2288, %r2418, %r1801; | |
fma.rn.f32 %r1803, %r2289, %r2419, %r1802; | |
fma.rn.f32 %r1804, %r2290, %r2420, %r1803; | |
fma.rn.f32 %r1805, %r2291, %r2421, %r1804; | |
fma.rn.f32 %r1806, %r2292, %r2422, %r1805; | |
fma.rn.f32 %r1807, %r2293, %r2423, %r1806; | |
fma.rn.f32 %r1808, %r2294, %r2424, %r1807; | |
fma.rn.f32 %r1809, %r2295, %r2425, %r1808; | |
fma.rn.f32 %r1810, %r2296, %r2426, %r1809; | |
fma.rn.f32 %r1811, %r2297, %r2427, %r1810; | |
fma.rn.f32 %r1812, %r2298, %r2428, %r1811; | |
fma.rn.f32 %r1813, %r2299, %r2429, %r1812; | |
fma.rn.f32 %r1814, %r2300, %r2430, %r1813; | |
fma.rn.f32 %r1815, %r2301, %r2431, %r1814; | |
fma.rn.f32 %r1816, %r2302, %r2432, %r1815; | |
fma.rn.f32 %r1817, %r2303, %r2433, %r1816; | |
fma.rn.f32 %r1818, %r2304, %r2434, %r1817; | |
fma.rn.f32 %r1819, %r2305, %r2435, %r1818; | |
fma.rn.f32 %r2453, %r2306, %r2436, %r1819; | |
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:125:20 ] | |
shfl.sync.bfly.b32 %r2454, %r2453, 16, 31, -1; | |
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:125:20 ] | |
add.f32 %r2455, %r2453, %r2454; | |
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:125:20 ] | |
shfl.sync.bfly.b32 %r2456, %r2455, 8, 31, -1; | |
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:125:20 ] | |
add.f32 %r2457, %r2455, %r2456; | |
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:125:20 ] | |
shfl.sync.bfly.b32 %r2458, %r2457, 4, 31, -1; | |
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:125:20 ] | |
add.f32 %r2459, %r2457, %r2458; | |
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:125:20 ] | |
shfl.sync.bfly.b32 %r2460, %r2459, 2, 31, -1; | |
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:125:20 ] | |
add.f32 %r2461, %r2459, %r2460; | |
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:125:20 ] | |
shfl.sync.bfly.b32 %r2462, %r2461, 1, 31, -1; | |
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:125:20 ] | |
add.f32 %r2463, %r2461, %r2462; | |
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:125:20 ] | |
// begin inline asm | |
@%p50 st.shared.b32 [ %r1199 + 0 ], %r2463; | |
// end inline asm | |
bar.sync 0; | |
// begin inline asm | |
@%p51 ld.shared.b32 %r2464, [ %r1208 + 0 ]; | |
// end inline asm | |
shfl.sync.bfly.b32 %r2465, %r2464, 2, 31, -1; | |
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:125:20 ] | |
add.f32 %r2466, %r2464, %r2465; | |
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:125:20 ] | |
shfl.sync.bfly.b32 %r2467, %r2466, 1, 31, -1; | |
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:125:20 ] | |
add.f32 %r2468, %r2466, %r2467; | |
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:125:20 ] | |
// begin inline asm | |
@%p52 st.shared.b32 [ %r1208 + 0 ], %r2468; | |
// end inline asm | |
bar.sync 0; | |
ld.shared.b32 %r1849, [global_smem]; | |
$L__tmp6: | |
.loc 1 125 35 // layer_norm.py:125:35 | |
div.full.f32 %r1850, %r1849, %r18; | |
.loc 1 126 34 // layer_norm.py:126:34 | |
fma.rn.f32 %r1851, %r1534, %r1757, %r1850; | |
fma.rn.f32 %r1852, %r1535, %r1757, %r1850; | |
fma.rn.f32 %r1853, %r1533, %r1757, %r1850; | |
fma.rn.f32 %r1854, %r1532, %r1757, %r1850; | |
fma.rn.f32 %r1855, %r1531, %r1757, %r1850; | |
fma.rn.f32 %r1856, %r1530, %r1757, %r1850; | |
fma.rn.f32 %r1857, %r1529, %r1757, %r1850; | |
fma.rn.f32 %r1858, %r1528, %r1757, %r1850; | |
fma.rn.f32 %r1859, %r1527, %r1757, %r1850; | |
fma.rn.f32 %r1860, %r1526, %r1757, %r1850; | |
fma.rn.f32 %r1861, %r1525, %r1757, %r1850; | |
fma.rn.f32 %r1862, %r1524, %r1757, %r1850; | |
fma.rn.f32 %r1863, %r1523, %r1757, %r1850; | |
fma.rn.f32 %r1864, %r1522, %r1757, %r1850; | |
fma.rn.f32 %r1865, %r1521, %r1757, %r1850; | |
fma.rn.f32 %r1866, %r1520, %r1757, %r1850; | |
fma.rn.f32 %r1867, %r1519, %r1757, %r1850; | |
fma.rn.f32 %r1868, %r1518, %r1757, %r1850; | |
fma.rn.f32 %r1869, %r1517, %r1757, %r1850; | |
fma.rn.f32 %r1870, %r1516, %r1757, %r1850; | |
fma.rn.f32 %r1871, %r1515, %r1757, %r1850; | |
fma.rn.f32 %r1872, %r1514, %r1757, %r1850; | |
fma.rn.f32 %r1873, %r1513, %r1757, %r1850; | |
fma.rn.f32 %r1874, %r1512, %r1757, %r1850; | |
fma.rn.f32 %r1875, %r1511, %r1757, %r1850; | |
fma.rn.f32 %r1876, %r1510, %r1757, %r1850; | |
fma.rn.f32 %r1877, %r1509, %r1757, %r1850; | |
fma.rn.f32 %r1878, %r1508, %r1757, %r1850; | |
fma.rn.f32 %r1879, %r1507, %r1757, %r1850; | |
fma.rn.f32 %r1880, %r1506, %r1757, %r1850; | |
fma.rn.f32 %r1881, %r1505, %r1757, %r1850; | |
fma.rn.f32 %r1882, %r1504, %r1757, %r1850; | |
fma.rn.f32 %r1883, %r1503, %r1757, %r1850; | |
fma.rn.f32 %r1884, %r1502, %r1757, %r1850; | |
fma.rn.f32 %r1885, %r1501, %r1757, %r1850; | |
fma.rn.f32 %r1886, %r1500, %r1757, %r1850; | |
fma.rn.f32 %r1887, %r1499, %r1757, %r1850; | |
fma.rn.f32 %r1888, %r1498, %r1757, %r1850; | |
fma.rn.f32 %r1889, %r1497, %r1757, %r1850; | |
fma.rn.f32 %r1890, %r1496, %r1757, %r1850; | |
fma.rn.f32 %r1891, %r1495, %r1757, %r1850; | |
fma.rn.f32 %r1892, %r1494, %r1757, %r1850; | |
fma.rn.f32 %r1893, %r1493, %r1757, %r1850; | |
fma.rn.f32 %r1894, %r1492, %r1757, %r1850; | |
fma.rn.f32 %r1895, %r1491, %r1757, %r1850; | |
fma.rn.f32 %r1896, %r1490, %r1757, %r1850; | |
fma.rn.f32 %r1897, %r1489, %r1757, %r1850; | |
fma.rn.f32 %r1898, %r1488, %r1757, %r1850; | |
fma.rn.f32 %r1899, %r1487, %r1757, %r1850; | |
fma.rn.f32 %r1900, %r1486, %r1757, %r1850; | |
fma.rn.f32 %r1901, %r1485, %r1757, %r1850; | |
fma.rn.f32 %r1902, %r1484, %r1757, %r1850; | |
fma.rn.f32 %r1903, %r1483, %r1757, %r1850; | |
fma.rn.f32 %r1904, %r1482, %r1757, %r1850; | |
fma.rn.f32 %r1905, %r1481, %r1757, %r1850; | |
fma.rn.f32 %r1906, %r1480, %r1757, %r1850; | |
fma.rn.f32 %r1907, %r1479, %r1757, %r1850; | |
fma.rn.f32 %r1908, %r1478, %r1757, %r1850; | |
fma.rn.f32 %r1909, %r1477, %r1757, %r1850; | |
fma.rn.f32 %r1910, %r1476, %r1757, %r1850; | |
fma.rn.f32 %r1911, %r1475, %r1757, %r1850; | |
fma.rn.f32 %r1912, %r1474, %r1757, %r1850; | |
fma.rn.f32 %r1913, %r1473, %r1757, %r1850; | |
fma.rn.f32 %r1914, %r1472, %r1757, %r1850; | |
.loc 1 126 21 // layer_norm.py:126:21 | |
neg.f32 %r1915, %r1851; | |
fma.rn.f32 %r1916, %r2243, %r2373, %r1915; | |
neg.f32 %r1917, %r1852; | |
fma.rn.f32 %r1918, %r2244, %r2374, %r1917; | |
neg.f32 %r1919, %r1853; | |
fma.rn.f32 %r1920, %r2245, %r2375, %r1919; | |
neg.f32 %r1921, %r1854; | |
fma.rn.f32 %r1922, %r2246, %r2376, %r1921; | |
neg.f32 %r1923, %r1855; | |
fma.rn.f32 %r1924, %r2247, %r2377, %r1923; | |
neg.f32 %r1925, %r1856; | |
fma.rn.f32 %r1926, %r2248, %r2378, %r1925; | |
neg.f32 %r1927, %r1857; | |
fma.rn.f32 %r1928, %r2249, %r2379, %r1927; | |
neg.f32 %r1929, %r1858; | |
fma.rn.f32 %r1930, %r2250, %r2380, %r1929; | |
neg.f32 %r1931, %r1859; | |
fma.rn.f32 %r1932, %r2251, %r2381, %r1931; | |
neg.f32 %r1933, %r1860; | |
fma.rn.f32 %r1934, %r2252, %r2382, %r1933; | |
neg.f32 %r1935, %r1861; | |
fma.rn.f32 %r1936, %r2253, %r2383, %r1935; | |
neg.f32 %r1937, %r1862; | |
fma.rn.f32 %r1938, %r2254, %r2384, %r1937; | |
neg.f32 %r1939, %r1863; | |
fma.rn.f32 %r1940, %r2255, %r2385, %r1939; | |
neg.f32 %r1941, %r1864; | |
fma.rn.f32 %r1942, %r2256, %r2386, %r1941; | |
neg.f32 %r1943, %r1865; | |
fma.rn.f32 %r1944, %r2257, %r2387, %r1943; | |
neg.f32 %r1945, %r1866; | |
fma.rn.f32 %r1946, %r2258, %r2388, %r1945; | |
neg.f32 %r1947, %r1867; | |
fma.rn.f32 %r1948, %r2259, %r2389, %r1947; | |
neg.f32 %r1949, %r1868; | |
fma.rn.f32 %r1950, %r2260, %r2390, %r1949; | |
neg.f32 %r1951, %r1869; | |
fma.rn.f32 %r1952, %r2261, %r2391, %r1951; | |
neg.f32 %r1953, %r1870; | |
fma.rn.f32 %r1954, %r2262, %r2392, %r1953; | |
neg.f32 %r1955, %r1871; | |
fma.rn.f32 %r1956, %r2263, %r2393, %r1955; | |
neg.f32 %r1957, %r1872; | |
fma.rn.f32 %r1958, %r2264, %r2394, %r1957; | |
neg.f32 %r1959, %r1873; | |
fma.rn.f32 %r1960, %r2265, %r2395, %r1959; | |
neg.f32 %r1961, %r1874; | |
fma.rn.f32 %r1962, %r2266, %r2396, %r1961; | |
neg.f32 %r1963, %r1875; | |
fma.rn.f32 %r1964, %r2267, %r2397, %r1963; | |
neg.f32 %r1965, %r1876; | |
fma.rn.f32 %r1966, %r2268, %r2398, %r1965; | |
neg.f32 %r1967, %r1877; | |
fma.rn.f32 %r1968, %r2269, %r2399, %r1967; | |
neg.f32 %r1969, %r1878; | |
fma.rn.f32 %r1970, %r2270, %r2400, %r1969; | |
neg.f32 %r1971, %r1879; | |
fma.rn.f32 %r1972, %r2271, %r2401, %r1971; | |
neg.f32 %r1973, %r1880; | |
fma.rn.f32 %r1974, %r2272, %r2402, %r1973; | |
neg.f32 %r1975, %r1881; | |
fma.rn.f32 %r1976, %r2273, %r2403, %r1975; | |
neg.f32 %r1977, %r1882; | |
fma.rn.f32 %r1978, %r2274, %r2404, %r1977; | |
neg.f32 %r1979, %r1883; | |
fma.rn.f32 %r1980, %r2275, %r2405, %r1979; | |
neg.f32 %r1981, %r1884; | |
fma.rn.f32 %r1982, %r2276, %r2406, %r1981; | |
neg.f32 %r1983, %r1885; | |
fma.rn.f32 %r1984, %r2277, %r2407, %r1983; | |
neg.f32 %r1985, %r1886; | |
fma.rn.f32 %r1986, %r2278, %r2408, %r1985; | |
neg.f32 %r1987, %r1887; | |
fma.rn.f32 %r1988, %r2279, %r2409, %r1987; | |
neg.f32 %r1989, %r1888; | |
fma.rn.f32 %r1990, %r2280, %r2410, %r1989; | |
neg.f32 %r1991, %r1889; | |
fma.rn.f32 %r1992, %r2281, %r2411, %r1991; | |
neg.f32 %r1993, %r1890; | |
fma.rn.f32 %r1994, %r2282, %r2412, %r1993; | |
neg.f32 %r1995, %r1891; | |
fma.rn.f32 %r1996, %r2283, %r2413, %r1995; | |
neg.f32 %r1997, %r1892; | |
fma.rn.f32 %r1998, %r2284, %r2414, %r1997; | |
neg.f32 %r1999, %r1893; | |
fma.rn.f32 %r2000, %r2285, %r2415, %r1999; | |
neg.f32 %r2001, %r1894; | |
fma.rn.f32 %r2002, %r2286, %r2416, %r2001; | |
neg.f32 %r2003, %r1895; | |
fma.rn.f32 %r2004, %r2287, %r2417, %r2003; | |
neg.f32 %r2005, %r1896; | |
fma.rn.f32 %r2006, %r2288, %r2418, %r2005; | |
neg.f32 %r2007, %r1897; | |
fma.rn.f32 %r2008, %r2289, %r2419, %r2007; | |
neg.f32 %r2009, %r1898; | |
fma.rn.f32 %r2010, %r2290, %r2420, %r2009; | |
neg.f32 %r2011, %r1899; | |
fma.rn.f32 %r2012, %r2291, %r2421, %r2011; | |
neg.f32 %r2013, %r1900; | |
fma.rn.f32 %r2014, %r2292, %r2422, %r2013; | |
neg.f32 %r2015, %r1901; | |
fma.rn.f32 %r2016, %r2293, %r2423, %r2015; | |
neg.f32 %r2017, %r1902; | |
fma.rn.f32 %r2018, %r2294, %r2424, %r2017; | |
neg.f32 %r2019, %r1903; | |
fma.rn.f32 %r2020, %r2295, %r2425, %r2019; | |
neg.f32 %r2021, %r1904; | |
fma.rn.f32 %r2022, %r2296, %r2426, %r2021; | |
neg.f32 %r2023, %r1905; | |
fma.rn.f32 %r2024, %r2297, %r2427, %r2023; | |
neg.f32 %r2025, %r1906; | |
fma.rn.f32 %r2026, %r2298, %r2428, %r2025; | |
neg.f32 %r2027, %r1907; | |
fma.rn.f32 %r2028, %r2299, %r2429, %r2027; | |
neg.f32 %r2029, %r1908; | |
fma.rn.f32 %r2030, %r2300, %r2430, %r2029; | |
neg.f32 %r2031, %r1909; | |
fma.rn.f32 %r2032, %r2301, %r2431, %r2031; | |
neg.f32 %r2033, %r1910; | |
fma.rn.f32 %r2034, %r2302, %r2432, %r2033; | |
neg.f32 %r2035, %r1911; | |
fma.rn.f32 %r2036, %r2303, %r2433, %r2035; | |
neg.f32 %r2037, %r1912; | |
fma.rn.f32 %r2038, %r2304, %r2434, %r2037; | |
neg.f32 %r2039, %r1913; | |
fma.rn.f32 %r2040, %r2305, %r2435, %r2039; | |
neg.f32 %r2041, %r1914; | |
fma.rn.f32 %r2042, %r2306, %r2436, %r2041; | |
.loc 1 126 41 // layer_norm.py:126:41 | |
mul.f32 %r2469, %r1916, %r2307; | |
mul.f32 %r2470, %r1918, %r2307; | |
mul.f32 %r2471, %r1920, %r2307; | |
mul.f32 %r2472, %r1922, %r2307; | |
mul.f32 %r2473, %r1924, %r2307; | |
mul.f32 %r2474, %r1926, %r2307; | |
mul.f32 %r2475, %r1928, %r2307; | |
mul.f32 %r2476, %r1930, %r2307; | |
mul.f32 %r2477, %r1932, %r2307; | |
mul.f32 %r2478, %r1934, %r2307; | |
mul.f32 %r2479, %r1936, %r2307; | |
mul.f32 %r2480, %r1938, %r2307; | |
mul.f32 %r2481, %r1940, %r2307; | |
mul.f32 %r2482, %r1942, %r2307; | |
mul.f32 %r2483, %r1944, %r2307; | |
mul.f32 %r2484, %r1946, %r2307; | |
mul.f32 %r2485, %r1948, %r2307; | |
mul.f32 %r2486, %r1950, %r2307; | |
mul.f32 %r2487, %r1952, %r2307; | |
mul.f32 %r2488, %r1954, %r2307; | |
mul.f32 %r2489, %r1956, %r2307; | |
mul.f32 %r2490, %r1958, %r2307; | |
mul.f32 %r2491, %r1960, %r2307; | |
mul.f32 %r2492, %r1962, %r2307; | |
mul.f32 %r2493, %r1964, %r2307; | |
mul.f32 %r2494, %r1966, %r2307; | |
mul.f32 %r2495, %r1968, %r2307; | |
mul.f32 %r2496, %r1970, %r2307; | |
mul.f32 %r2497, %r1972, %r2307; | |
mul.f32 %r2498, %r1974, %r2307; | |
mul.f32 %r2499, %r1976, %r2307; | |
mul.f32 %r2500, %r1978, %r2307; | |
mul.f32 %r2501, %r1980, %r2307; | |
mul.f32 %r2502, %r1982, %r2307; | |
mul.f32 %r2503, %r1984, %r2307; | |
mul.f32 %r2504, %r1986, %r2307; | |
mul.f32 %r2505, %r1988, %r2307; | |
mul.f32 %r2506, %r1990, %r2307; | |
mul.f32 %r2507, %r1992, %r2307; | |
mul.f32 %r2508, %r1994, %r2307; | |
mul.f32 %r2509, %r1996, %r2307; | |
mul.f32 %r2510, %r1998, %r2307; | |
mul.f32 %r2511, %r2000, %r2307; | |
mul.f32 %r2512, %r2002, %r2307; | |
mul.f32 %r2513, %r2004, %r2307; | |
mul.f32 %r2514, %r2006, %r2307; | |
mul.f32 %r2515, %r2008, %r2307; | |
mul.f32 %r2516, %r2010, %r2307; | |
mul.f32 %r2517, %r2012, %r2307; | |
mul.f32 %r2518, %r2014, %r2307; | |
mul.f32 %r2519, %r2016, %r2307; | |
mul.f32 %r2520, %r2018, %r2307; | |
mul.f32 %r2521, %r2020, %r2307; | |
mul.f32 %r2522, %r2022, %r2307; | |
mul.f32 %r2523, %r2024, %r2307; | |
mul.f32 %r2524, %r2026, %r2307; | |
mul.f32 %r2525, %r2028, %r2307; | |
mul.f32 %r2526, %r2030, %r2307; | |
mul.f32 %r2527, %r2032, %r2307; | |
mul.f32 %r2528, %r2034, %r2307; | |
mul.f32 %r2529, %r2036, %r2307; | |
mul.f32 %r2530, %r2038, %r2307; | |
mul.f32 %r2531, %r2040, %r2307; | |
mul.f32 %r2532, %r2042, %r2307; | |
.loc 1 127 32 // layer_norm.py:127:32 | |
// begin inline asm | |
@%p2 st.global.v4.b32 [ %rd106 + 0 ], { %r2469, %r2470, %r2471, %r2472 }; | |
// end inline asm | |
// begin inline asm | |
@%p74 st.global.v4.b32 [ %rd107 + 0 ], { %r2473, %r2474, %r2475, %r2476 }; | |
// end inline asm | |
// begin inline asm | |
@%p75 st.global.v4.b32 [ %rd108 + 0 ], { %r2477, %r2478, %r2479, %r2480 }; | |
// end inline asm | |
// begin inline asm | |
@%p76 st.global.v4.b32 [ %rd109 + 0 ], { %r2481, %r2482, %r2483, %r2484 }; | |
// end inline asm | |
// begin inline asm | |
@%p77 st.global.v4.b32 [ %rd110 + 0 ], { %r2485, %r2486, %r2487, %r2488 }; | |
// end inline asm | |
// begin inline asm | |
@%p78 st.global.v4.b32 [ %rd111 + 0 ], { %r2489, %r2490, %r2491, %r2492 }; | |
// end inline asm | |
// begin inline asm | |
@%p79 st.global.v4.b32 [ %rd112 + 0 ], { %r2493, %r2494, %r2495, %r2496 }; | |
// end inline asm | |
// begin inline asm | |
@%p9 st.global.v4.b32 [ %rd113 + 0 ], { %r2497, %r2498, %r2499, %r2500 }; | |
// end inline asm | |
// begin inline asm | |
@%p81 st.global.v4.b32 [ %rd114 + 0 ], { %r2501, %r2502, %r2503, %r2504 }; | |
// end inline asm | |
// begin inline asm | |
@%p82 st.global.v4.b32 [ %rd115 + 0 ], { %r2505, %r2506, %r2507, %r2508 }; | |
// end inline asm | |
// begin inline asm | |
@%p83 st.global.v4.b32 [ %rd116 + 0 ], { %r2509, %r2510, %r2511, %r2512 }; | |
// end inline asm | |
// begin inline asm | |
@%p84 st.global.v4.b32 [ %rd117 + 0 ], { %r2513, %r2514, %r2515, %r2516 }; | |
// end inline asm | |
// begin inline asm | |
@%p85 st.global.v4.b32 [ %rd118 + 0 ], { %r2517, %r2518, %r2519, %r2520 }; | |
// end inline asm | |
// begin inline asm | |
@%p86 st.global.v4.b32 [ %rd119 + 0 ], { %r2521, %r2522, %r2523, %r2524 }; | |
// end inline asm | |
// begin inline asm | |
@%p87 st.global.v4.b32 [ %rd120 + 0 ], { %r2525, %r2526, %r2527, %r2528 }; | |
// end inline asm | |
// begin inline asm | |
@%p17 st.global.v4.b32 [ %rd121 + 0 ], { %r2529, %r2530, %r2531, %r2532 }; | |
// end inline asm | |
.loc 1 129 18 // layer_norm.py:129:18 | |
fma.rn.f32 %r2844, %r1481, %r2427, %r2844; | |
fma.rn.f32 %r2843, %r1482, %r2426, %r2843; | |
fma.rn.f32 %r2842, %r1483, %r2425, %r2842; | |
fma.rn.f32 %r2841, %r1484, %r2424, %r2841; | |
fma.rn.f32 %r2840, %r1485, %r2423, %r2840; | |
fma.rn.f32 %r2839, %r1486, %r2422, %r2839; | |
fma.rn.f32 %r2838, %r1487, %r2421, %r2838; | |
fma.rn.f32 %r2837, %r1488, %r2420, %r2837; | |
fma.rn.f32 %r2836, %r1489, %r2419, %r2836; | |
fma.rn.f32 %r2835, %r1490, %r2418, %r2835; | |
fma.rn.f32 %r2834, %r1491, %r2417, %r2834; | |
fma.rn.f32 %r2833, %r1492, %r2416, %r2833; | |
fma.rn.f32 %r2832, %r1493, %r2415, %r2832; | |
fma.rn.f32 %r2831, %r1494, %r2414, %r2831; | |
fma.rn.f32 %r2830, %r1495, %r2413, %r2830; | |
fma.rn.f32 %r2829, %r1496, %r2412, %r2829; | |
fma.rn.f32 %r2828, %r1497, %r2411, %r2828; | |
fma.rn.f32 %r2827, %r1498, %r2410, %r2827; | |
fma.rn.f32 %r2826, %r1499, %r2409, %r2826; | |
fma.rn.f32 %r2825, %r1500, %r2408, %r2825; | |
fma.rn.f32 %r2824, %r1501, %r2407, %r2824; | |
fma.rn.f32 %r2823, %r1502, %r2406, %r2823; | |
fma.rn.f32 %r2822, %r1503, %r2405, %r2822; | |
fma.rn.f32 %r2821, %r1504, %r2404, %r2821; | |
fma.rn.f32 %r2820, %r1505, %r2403, %r2820; | |
fma.rn.f32 %r2819, %r1506, %r2402, %r2819; | |
fma.rn.f32 %r2818, %r1507, %r2401, %r2818; | |
fma.rn.f32 %r2817, %r1508, %r2400, %r2817; | |
fma.rn.f32 %r2816, %r1509, %r2399, %r2816; | |
fma.rn.f32 %r2815, %r1510, %r2398, %r2815; | |
fma.rn.f32 %r2814, %r1511, %r2397, %r2814; | |
fma.rn.f32 %r2813, %r1512, %r2396, %r2813; | |
fma.rn.f32 %r2812, %r1513, %r2395, %r2812; | |
fma.rn.f32 %r2811, %r1514, %r2394, %r2811; | |
fma.rn.f32 %r2810, %r1515, %r2393, %r2810; | |
fma.rn.f32 %r2809, %r1516, %r2392, %r2809; | |
fma.rn.f32 %r2808, %r1517, %r2391, %r2808; | |
fma.rn.f32 %r2807, %r1518, %r2390, %r2807; | |
fma.rn.f32 %r2806, %r1519, %r2389, %r2806; | |
fma.rn.f32 %r2805, %r1520, %r2388, %r2805; | |
fma.rn.f32 %r2804, %r1521, %r2387, %r2804; | |
fma.rn.f32 %r2803, %r1522, %r2386, %r2803; | |
fma.rn.f32 %r2802, %r1523, %r2385, %r2802; | |
fma.rn.f32 %r2801, %r1524, %r2384, %r2801; | |
fma.rn.f32 %r2800, %r1525, %r2383, %r2800; | |
fma.rn.f32 %r2799, %r1526, %r2382, %r2799; | |
fma.rn.f32 %r2798, %r1527, %r2381, %r2798; | |
fma.rn.f32 %r2797, %r1528, %r2380, %r2797; | |
fma.rn.f32 %r2796, %r1529, %r2379, %r2796; | |
fma.rn.f32 %r2795, %r1530, %r2378, %r2795; | |
fma.rn.f32 %r2794, %r1531, %r2377, %r2794; | |
fma.rn.f32 %r2793, %r1532, %r2376, %r2793; | |
fma.rn.f32 %r2792, %r1533, %r2375, %r2792; | |
fma.rn.f32 %r2791, %r1535, %r2374, %r2791; | |
fma.rn.f32 %r2790, %r1534, %r2373, %r2790; | |
fma.rn.f32 %r2845, %r1480, %r2428, %r2845; | |
fma.rn.f32 %r2846, %r1479, %r2429, %r2846; | |
fma.rn.f32 %r2847, %r1478, %r2430, %r2847; | |
fma.rn.f32 %r2848, %r1477, %r2431, %r2848; | |
fma.rn.f32 %r2849, %r1476, %r2432, %r2849; | |
fma.rn.f32 %r2850, %r1475, %r2433, %r2850; | |
fma.rn.f32 %r2851, %r1474, %r2434, %r2851; | |
fma.rn.f32 %r2852, %r1473, %r2435, %r2852; | |
fma.rn.f32 %r2853, %r1472, %r2436, %r2853; | |
add.f32 %r2854, %r2854, %r2373; | |
add.f32 %r2855, %r2855, %r2374; | |
add.f32 %r2856, %r2856, %r2375; | |
add.f32 %r2857, %r2857, %r2376; | |
add.f32 %r2858, %r2858, %r2377; | |
add.f32 %r2859, %r2859, %r2378; | |
add.f32 %r2860, %r2860, %r2379; | |
add.f32 %r2861, %r2861, %r2380; | |
add.f32 %r2862, %r2862, %r2381; | |
add.f32 %r2863, %r2863, %r2382; | |
add.f32 %r2864, %r2864, %r2383; | |
add.f32 %r2865, %r2865, %r2384; | |
add.f32 %r2866, %r2866, %r2385; | |
add.f32 %r2867, %r2867, %r2386; | |
add.f32 %r2868, %r2868, %r2387; | |
add.f32 %r2869, %r2869, %r2388; | |
add.f32 %r2870, %r2870, %r2389; | |
add.f32 %r2871, %r2871, %r2390; | |
add.f32 %r2872, %r2872, %r2391; | |
add.f32 %r2873, %r2873, %r2392; | |
add.f32 %r2874, %r2874, %r2393; | |
add.f32 %r2875, %r2875, %r2394; | |
add.f32 %r2876, %r2876, %r2395; | |
add.f32 %r2877, %r2877, %r2396; | |
add.f32 %r2878, %r2878, %r2397; | |
add.f32 %r2879, %r2879, %r2398; | |
add.f32 %r2880, %r2880, %r2399; | |
add.f32 %r2881, %r2881, %r2400; | |
add.f32 %r2882, %r2882, %r2401; | |
add.f32 %r2883, %r2883, %r2402; | |
add.f32 %r2884, %r2884, %r2403; | |
add.f32 %r2885, %r2885, %r2404; | |
add.f32 %r2886, %r2886, %r2405; | |
add.f32 %r2887, %r2887, %r2406; | |
add.f32 %r2888, %r2888, %r2407; | |
add.f32 %r2889, %r2889, %r2408; | |
add.f32 %r2890, %r2890, %r2409; | |
add.f32 %r2891, %r2891, %r2410; | |
add.f32 %r2892, %r2892, %r2411; | |
add.f32 %r2893, %r2893, %r2412; | |
add.f32 %r2894, %r2894, %r2413; | |
add.f32 %r2895, %r2895, %r2414; | |
add.f32 %r2896, %r2896, %r2415; | |
add.f32 %r2897, %r2897, %r2416; | |
add.f32 %r2898, %r2898, %r2417; | |
add.f32 %r2899, %r2899, %r2418; | |
add.f32 %r2900, %r2900, %r2419; | |
add.f32 %r2901, %r2901, %r2420; | |
add.f32 %r2902, %r2902, %r2421; | |
add.f32 %r2903, %r2903, %r2422; | |
add.f32 %r2904, %r2904, %r2423; | |
add.f32 %r2905, %r2905, %r2424; | |
add.f32 %r2906, %r2906, %r2425; | |
add.f32 %r2907, %r2907, %r2426; | |
add.f32 %r2908, %r2908, %r2427; | |
add.f32 %r2909, %r2909, %r2428; | |
add.f32 %r2910, %r2910, %r2429; | |
add.f32 %r2911, %r2911, %r2430; | |
add.f32 %r2912, %r2912, %r2431; | |
add.f32 %r2913, %r2913, %r2432; | |
add.f32 %r2914, %r2914, %r2433; | |
add.f32 %r2915, %r2915, %r2434; | |
add.f32 %r2916, %r2916, %r2435; | |
add.f32 %r2917, %r2917, %r2436; | |
.loc 1 133 20 // layer_norm.py:133:20 | |
add.s64 %rd167, %rd167, 4; | |
.loc 1 134 20 // layer_norm.py:134:20 | |
add.s64 %rd166, %rd166, 4; | |
.loc 1 115 30 // layer_norm.py:115:30 | |
add.s64 %rd165, %rd165, %rd26; | |
add.s64 %rd164, %rd164, %rd29; | |
add.s64 %rd163, %rd163, %rd30; | |
add.s32 %r2661, %r2661, -1; | |
setp.ne.s32 %p72, %r2661, 0; | |
@%p72 bra $L__BB0_2; | |
$L__BB0_3: // %._crit_edge | |
.loc 1 104 18 // layer_norm.py:104:18 | |
setp.lt.s32 %p88, %r2240, %r542; | |
setp.lt.s32 %p80, %r2241, %r542; | |
setp.lt.s32 %p73, %r2242, %r542; | |
.loc 1 138 37 // layer_norm.py:138:37 | |
mul.lo.s32 %r2238, %r539, %r1; | |
.loc 1 138 22 // layer_norm.py:138:22 | |
mul.wide.s32 %rd154, %r2238, 4; | |
add.s64 %rd155, %rd45, %rd154; | |
.loc 1 138 49 // layer_norm.py:138:49 | |
shl.b64 %rd156, %rd1, 2; | |
add.s64 %rd122, %rd155, %rd156; | |
add.s64 %rd123, %rd122, 2048; | |
add.s64 %rd124, %rd122, 4096; | |
add.s64 %rd125, %rd122, 6144; | |
add.s64 %rd126, %rd122, 8192; | |
add.s64 %rd127, %rd122, 10240; | |
add.s64 %rd128, %rd122, 12288; | |
add.s64 %rd129, %rd155, %rd162; | |
add.s64 %rd130, %rd122, 16384; | |
add.s64 %rd131, %rd122, 18432; | |
add.s64 %rd132, %rd122, 20480; | |
add.s64 %rd133, %rd122, 22528; | |
add.s64 %rd134, %rd122, 24576; | |
add.s64 %rd135, %rd122, 26624; | |
add.s64 %rd136, %rd122, 28672; | |
add.s64 %rd137, %rd155, %rd161; | |
.loc 1 138 55 // layer_norm.py:138:55 | |
// begin inline asm | |
@%p73 st.global.v4.b32 [ %rd122 + 0 ], { %r2790, %r2791, %r2792, %r2793 }; | |
// end inline asm | |
// begin inline asm | |
@%p74 st.global.v4.b32 [ %rd123 + 0 ], { %r2794, %r2795, %r2796, %r2797 }; | |
// end inline asm | |
// begin inline asm | |
@%p75 st.global.v4.b32 [ %rd124 + 0 ], { %r2798, %r2799, %r2800, %r2801 }; | |
// end inline asm | |
// begin inline asm | |
@%p76 st.global.v4.b32 [ %rd125 + 0 ], { %r2802, %r2803, %r2804, %r2805 }; | |
// end inline asm | |
// begin inline asm | |
@%p77 st.global.v4.b32 [ %rd126 + 0 ], { %r2806, %r2807, %r2808, %r2809 }; | |
// end inline asm | |
// begin inline asm | |
@%p78 st.global.v4.b32 [ %rd127 + 0 ], { %r2810, %r2811, %r2812, %r2813 }; | |
// end inline asm | |
// begin inline asm | |
@%p79 st.global.v4.b32 [ %rd128 + 0 ], { %r2814, %r2815, %r2816, %r2817 }; | |
// end inline asm | |
// begin inline asm | |
@%p80 st.global.v4.b32 [ %rd129 + 0 ], { %r2818, %r2819, %r2820, %r2821 }; | |
// end inline asm | |
// begin inline asm | |
@%p81 st.global.v4.b32 [ %rd130 + 0 ], { %r2822, %r2823, %r2824, %r2825 }; | |
// end inline asm | |
// begin inline asm | |
@%p82 st.global.v4.b32 [ %rd131 + 0 ], { %r2826, %r2827, %r2828, %r2829 }; | |
// end inline asm | |
// begin inline asm | |
@%p83 st.global.v4.b32 [ %rd132 + 0 ], { %r2830, %r2831, %r2832, %r2833 }; | |
// end inline asm | |
// begin inline asm | |
@%p84 st.global.v4.b32 [ %rd133 + 0 ], { %r2834, %r2835, %r2836, %r2837 }; | |
// end inline asm | |
// begin inline asm | |
@%p85 st.global.v4.b32 [ %rd134 + 0 ], { %r2838, %r2839, %r2840, %r2841 }; | |
// end inline asm | |
// begin inline asm | |
@%p86 st.global.v4.b32 [ %rd135 + 0 ], { %r2842, %r2843, %r2844, %r2845 }; | |
// end inline asm | |
// begin inline asm | |
@%p87 st.global.v4.b32 [ %rd136 + 0 ], { %r2846, %r2847, %r2848, %r2849 }; | |
// end inline asm | |
// begin inline asm | |
@%p88 st.global.v4.b32 [ %rd137 + 0 ], { %r2850, %r2851, %r2852, %r2853 }; | |
// end inline asm | |
.loc 1 139 37 // layer_norm.py:139:37 | |
mul.lo.s32 %r2239, %r540, %r1; | |
.loc 1 139 22 // layer_norm.py:139:22 | |
mul.wide.s32 %rd159, %r2239, 4; | |
add.s64 %rd160, %rd46, %rd159; | |
.loc 1 139 49 // layer_norm.py:139:49 | |
add.s64 %rd138, %rd160, %rd156; | |
add.s64 %rd139, %rd138, 2048; | |
add.s64 %rd140, %rd138, 4096; | |
add.s64 %rd141, %rd138, 6144; | |
add.s64 %rd142, %rd138, 8192; | |
add.s64 %rd143, %rd138, 10240; | |
add.s64 %rd144, %rd138, 12288; | |
add.s64 %rd145, %rd160, %rd162; | |
add.s64 %rd146, %rd138, 16384; | |
add.s64 %rd147, %rd138, 18432; | |
add.s64 %rd148, %rd138, 20480; | |
add.s64 %rd149, %rd138, 22528; | |
add.s64 %rd150, %rd138, 24576; | |
add.s64 %rd151, %rd138, 26624; | |
add.s64 %rd152, %rd138, 28672; | |
add.s64 %rd153, %rd160, %rd161; | |
.loc 1 139 55 // layer_norm.py:139:55 | |
// begin inline asm | |
@%p73 st.global.v4.b32 [ %rd138 + 0 ], { %r2854, %r2855, %r2856, %r2857 }; | |
// end inline asm | |
// begin inline asm | |
@%p74 st.global.v4.b32 [ %rd139 + 0 ], { %r2858, %r2859, %r2860, %r2861 }; | |
// end inline asm | |
// begin inline asm | |
@%p75 st.global.v4.b32 [ %rd140 + 0 ], { %r2862, %r2863, %r2864, %r2865 }; | |
// end inline asm | |
// begin inline asm | |
@%p76 st.global.v4.b32 [ %rd141 + 0 ], { %r2866, %r2867, %r2868, %r2869 }; | |
// end inline asm | |
// begin inline asm | |
@%p77 st.global.v4.b32 [ %rd142 + 0 ], { %r2870, %r2871, %r2872, %r2873 }; | |
// end inline asm | |
// begin inline asm | |
@%p78 st.global.v4.b32 [ %rd143 + 0 ], { %r2874, %r2875, %r2876, %r2877 }; | |
// end inline asm | |
// begin inline asm | |
@%p79 st.global.v4.b32 [ %rd144 + 0 ], { %r2878, %r2879, %r2880, %r2881 }; | |
// end inline asm | |
// begin inline asm | |
@%p80 st.global.v4.b32 [ %rd145 + 0 ], { %r2882, %r2883, %r2884, %r2885 }; | |
// end inline asm | |
// begin inline asm | |
@%p81 st.global.v4.b32 [ %rd146 + 0 ], { %r2886, %r2887, %r2888, %r2889 }; | |
// end inline asm | |
// begin inline asm | |
@%p82 st.global.v4.b32 [ %rd147 + 0 ], { %r2890, %r2891, %r2892, %r2893 }; | |
// end inline asm | |
// begin inline asm | |
@%p83 st.global.v4.b32 [ %rd148 + 0 ], { %r2894, %r2895, %r2896, %r2897 }; | |
// end inline asm | |
// begin inline asm | |
@%p84 st.global.v4.b32 [ %rd149 + 0 ], { %r2898, %r2899, %r2900, %r2901 }; | |
// end inline asm | |
// begin inline asm | |
@%p85 st.global.v4.b32 [ %rd150 + 0 ], { %r2902, %r2903, %r2904, %r2905 }; | |
// end inline asm | |
// begin inline asm | |
@%p86 st.global.v4.b32 [ %rd151 + 0 ], { %r2906, %r2907, %r2908, %r2909 }; | |
// end inline asm | |
// begin inline asm | |
@%p87 st.global.v4.b32 [ %rd152 + 0 ], { %r2910, %r2911, %r2912, %r2913 }; | |
// end inline asm | |
// begin inline asm | |
@%p88 st.global.v4.b32 [ %rd153 + 0 ], { %r2914, %r2915, %r2916, %r2917 }; | |
// end inline asm | |
.loc 1 139 4 // layer_norm.py:139:4 | |
ret; | |
$L__tmp7: | |
$L__func_end0: | |
// -- End function | |
} | |
// Source-file table. The first operand of a `.loc <file> <line> <col>`
// directive is an index into these entries; the `.loc 1 ...` lines in the
// kernel body are annotated with layer_norm.py, i.e. entry 1 below.
// Entry 1 is also the value stored in DW_AT_call_file in .debug_info.
// NOTE(review): no `.loc 2 ...` is visible in this part of the file; entry 2
// is presumably used by code earlier in the kernel -- confirm.
// NOTE(review): the trailing `| |` after each directive looks like table
// residue from a web-page copy, not PTX syntax -- strip before assembling.
.file 1 "/home/dberard/local/pytorch-env7/Liger-Kernel/src/liger_kernel/ops/layer_norm.py" | |
.file 2 "/home/dberard/local/pytorch-env7/triton/python/triton/language/standard.py" | |
// DWARF abbreviation table. Each entry is:
//   <abbreviation code> <tag> <has-children flag>
//   (<attribute>, <form>) pairs ...
//   0, 0                      -- the EOM(1)/EOM(2) terminator pair
// A final single 0 byte (EOM(3)) ends the table. The DIEs in .debug_info
// start with one of the codes declared here (1..4) and then carry the
// attribute values in exactly the order listed for that code.
// NOTE(review): the trailing `| |` on every line (outside a comment on the
// `.section`, `{` and `}` lines) looks like table residue from a web-page
// copy, not PTX syntax -- strip before assembling.
.section .debug_abbrev | |
{ | |
// [1] compile unit (has children): producer, language, name, stmt_list,
//     comp_dir. Strings are inline NUL-terminated (DW_FORM_string).
.b8 1 // Abbreviation Code | |
.b8 17 // DW_TAG_compile_unit | |
.b8 1 // DW_CHILDREN_yes | |
.b8 37 // DW_AT_producer | |
.b8 8 // DW_FORM_string | |
.b8 19 // DW_AT_language | |
.b8 5 // DW_FORM_data2 | |
.b8 3 // DW_AT_name | |
.b8 8 // DW_FORM_string | |
.b8 16 // DW_AT_stmt_list | |
.b8 6 // DW_FORM_data4 | |
.b8 27 // DW_AT_comp_dir | |
.b8 8 // DW_FORM_string | |
.b8 0 // EOM(1) | |
.b8 0 // EOM(2) | |
// [2] subprogram without children: only a name and an inline code.
//     Used for the abstract (out-of-line description) entry.
.b8 2 // Abbreviation Code | |
.b8 46 // DW_TAG_subprogram | |
.b8 0 // DW_CHILDREN_no | |
.b8 3 // DW_AT_name | |
.b8 8 // DW_FORM_string | |
.b8 32 // DW_AT_inline | |
.b8 11 // DW_FORM_data1 | |
.b8 0 // EOM(1) | |
.b8 0 // EOM(2) | |
// [3] subprogram with children: an address range plus a 4-byte reference
//     (DW_FORM_ref4) back to its abstract origin.
.b8 3 // Abbreviation Code | |
.b8 46 // DW_TAG_subprogram | |
.b8 1 // DW_CHILDREN_yes | |
.b8 17 // DW_AT_low_pc | |
.b8 1 // DW_FORM_addr | |
.b8 18 // DW_AT_high_pc | |
.b8 1 // DW_FORM_addr | |
.b8 49 // DW_AT_abstract_origin | |
.b8 19 // DW_FORM_ref4 | |
.b8 0 // EOM(1) | |
.b8 0 // EOM(2) | |
// [4] inlined subroutine without children: abstract origin, address range,
//     and the call site as one-byte file / line / column values
//     (DW_FORM_data1, so each of the three is limited to 0..255).
.b8 4 // Abbreviation Code | |
.b8 29 // DW_TAG_inlined_subroutine | |
.b8 0 // DW_CHILDREN_no | |
.b8 49 // DW_AT_abstract_origin | |
.b8 19 // DW_FORM_ref4 | |
.b8 17 // DW_AT_low_pc | |
.b8 1 // DW_FORM_addr | |
.b8 18 // DW_AT_high_pc | |
.b8 1 // DW_FORM_addr | |
.b8 88 // DW_AT_call_file | |
.b8 11 // DW_FORM_data1 | |
.b8 89 // DW_AT_call_line | |
.b8 11 // DW_FORM_data1 | |
.b8 87 // DW_AT_call_column | |
.b8 11 // DW_FORM_data1 | |
.b8 0 // EOM(1) | |
.b8 0 // EOM(2) | |
// End of the whole abbreviation table.
.b8 0 // EOM(3) | |
} | |
// DWARF .debug_info: a single compilation unit describing the kernel.
// Layout (offsets are from the start of the section and match the
// "0x<offset>:0x<size>" pairs in the per-DIE comments):
//   0x00  unit header (length, version, abbrev offset, address size)
//   0x0b  [1] compile unit
//   0x6a  [2] abstract subprogram   "_layer_norm_backward_kernel"
//   0x88  [3] concrete subprogram   $L__func_begin0 .. $L__func_end0
//   0x9d  [4] inlined subroutine    $L__tmp1 .. $L__tmp4
//   0xb5  [4] inlined subroutine    $L__tmp5 .. $L__tmp6
// The bracketed numbers are abbreviation codes; the attribute order of each
// DIE is fixed by the matching entry in .debug_abbrev.
// NOTE(review): the trailing `| |` on every line (outside a comment on the
// lines that have none) looks like table residue from a web-page copy, not
// PTX syntax -- strip before assembling.
.section .debug_info | |
{ | |
// Unit header. The length (203) counts every byte after this 4-byte field,
// so the unit ends at offset 4 + 203 = 0xcf.
.b32 203 // Length of Unit | |
// Version is a 2-byte field: bytes 2, 0.
.b8 2 // DWARF version number | |
.b8 0 | |
.b32 .debug_abbrev // Offset Into Abbrev. Section | |
.b8 8 // Address Size (in bytes) | |
.b8 1 // Abbrev [1] 0xb:0xc4 DW_TAG_compile_unit | |
// DW_AT_producer: NUL-terminated ASCII "triton".
.b8 116 // DW_AT_producer | |
.b8 114 | |
.b8 105 | |
.b8 116 | |
.b8 111 | |
.b8 110 | |
.b8 0 | |
// DW_AT_language: 2-byte field, bytes 2, 0 (value 2 read little-endian,
// which the DWARF language-code table lists as DW_LANG_C).
.b8 2 // DW_AT_language | |
.b8 0 | |
// DW_AT_name: NUL-terminated ASCII "layer_norm.py".
.b8 108 // DW_AT_name | |
.b8 97 | |
.b8 121 | |
.b8 101 | |
.b8 114 | |
.b8 95 | |
.b8 110 | |
.b8 111 | |
.b8 114 | |
.b8 109 | |
.b8 46 | |
.b8 112 | |
.b8 121 | |
.b8 0 | |
.b32 .debug_line // DW_AT_stmt_list | |
// DW_AT_comp_dir: NUL-terminated ASCII
// "/home/dberard/local/pytorch-env7/Liger-Kernel/src/liger_kernel/ops"
// (the directory part of the `.file 1` path).
.b8 47 // DW_AT_comp_dir | |
.b8 104 | |
.b8 111 | |
.b8 109 | |
.b8 101 | |
.b8 47 | |
.b8 100 | |
.b8 98 | |
.b8 101 | |
.b8 114 | |
.b8 97 | |
.b8 114 | |
.b8 100 | |
.b8 47 | |
.b8 108 | |
.b8 111 | |
.b8 99 | |
.b8 97 | |
.b8 108 | |
.b8 47 | |
.b8 112 | |
.b8 121 | |
.b8 116 | |
.b8 111 | |
.b8 114 | |
.b8 99 | |
.b8 104 | |
.b8 45 | |
.b8 101 | |
.b8 110 | |
.b8 118 | |
.b8 55 | |
.b8 47 | |
.b8 76 | |
.b8 105 | |
.b8 103 | |
.b8 101 | |
.b8 114 | |
.b8 45 | |
.b8 75 | |
.b8 101 | |
.b8 114 | |
.b8 110 | |
.b8 101 | |
.b8 108 | |
.b8 47 | |
.b8 115 | |
.b8 114 | |
.b8 99 | |
.b8 47 | |
.b8 108 | |
.b8 105 | |
.b8 103 | |
.b8 101 | |
.b8 114 | |
.b8 95 | |
.b8 107 | |
.b8 101 | |
.b8 114 | |
.b8 110 | |
.b8 101 | |
.b8 108 | |
.b8 47 | |
.b8 111 | |
.b8 112 | |
.b8 115 | |
.b8 0 | |
// Abstract description of the kernel. Its offset, 0x6a = 106, is the value
// every DW_AT_abstract_origin below refers to.
.b8 2 // Abbrev [2] 0x6a:0x1e DW_TAG_subprogram | |
// DW_AT_name: NUL-terminated ASCII "_layer_norm_backward_kernel".
.b8 95 // DW_AT_name | |
.b8 108 | |
.b8 97 | |
.b8 121 | |
.b8 101 | |
.b8 114 | |
.b8 95 | |
.b8 110 | |
.b8 111 | |
.b8 114 | |
.b8 109 | |
.b8 95 | |
.b8 98 | |
.b8 97 | |
.b8 99 | |
.b8 107 | |
.b8 119 | |
.b8 97 | |
.b8 114 | |
.b8 100 | |
.b8 95 | |
.b8 107 | |
.b8 101 | |
.b8 114 | |
.b8 110 | |
.b8 101 | |
.b8 108 | |
.b8 0 | |
// Inline code 1 (DW_INL_inlined in the DWARF inline-code table).
.b8 1 // DW_AT_inline | |
// Concrete instance of the kernel: spans the whole function body, from
// $L__func_begin0 to $L__func_end0, and owns the two children that follow.
.b8 3 // Abbrev [3] 0x88:0x46 DW_TAG_subprogram | |
.b64 $L__func_begin0 // DW_AT_low_pc | |
.b64 $L__func_end0 // DW_AT_high_pc | |
.b32 106 // DW_AT_abstract_origin | |
// First inlined range, recorded as called from file 1, line 124, column 20.
// NOTE(review): $L__tmp1/$L__tmp4 are defined earlier in the kernel body,
// outside this part of the file. The abstract origin is 106, i.e. the kernel
// DIE itself; this unit contains no separate DIE for an inlined callee.
.b8 4 // Abbrev [4] 0x9d:0x18 DW_TAG_inlined_subroutine | |
.b32 106 // DW_AT_abstract_origin | |
.b64 $L__tmp1 // DW_AT_low_pc | |
.b64 $L__tmp4 // DW_AT_high_pc | |
.b8 1 // DW_AT_call_file | |
.b8 124 // DW_AT_call_line | |
.b8 20 // DW_AT_call_column | |
// Second inlined range, recorded as called from file 1, line 125, column 20.
// Same abstract origin (106) as the entry above.
.b8 4 // Abbrev [4] 0xb5:0x18 DW_TAG_inlined_subroutine | |
.b32 106 // DW_AT_abstract_origin | |
.b64 $L__tmp5 // DW_AT_low_pc | |
.b64 $L__tmp6 // DW_AT_high_pc | |
.b8 1 // DW_AT_call_file | |
.b8 125 // DW_AT_call_line | |
.b8 20 // DW_AT_call_column | |
// First mark closes the children of the concrete subprogram [3]; the second
// closes the children of the compile unit [1].
.b8 0 // End Of Children Mark | |
.b8 0 // End Of Children Mark | |
} | |
// Macro-information section, emitted with an empty body: this module
// records no macro definitions.
// NOTE(review): the trailing `|` looks like table residue from a web-page
// copy, not PTX syntax -- strip before assembling.
.section .debug_macinfo { } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment