@davidberard98
Last active July 9, 2025 23:42
//
// Generated by LLVM NVPTX Back-End
//
.version 8.7
.target sm_90a
.address_size 64
// .globl _layer_norm_backward_kernel // -- Begin function _layer_norm_backward_kernel
.extern .shared .align 16 .b8 global_smem[];
// @_layer_norm_backward_kernel
.visible .entry _layer_norm_backward_kernel(
.param .u64 .ptr .global .align 1 _layer_norm_backward_kernel_param_0,
.param .u64 .ptr .global .align 1 _layer_norm_backward_kernel_param_1,
.param .u64 .ptr .global .align 1 _layer_norm_backward_kernel_param_2,
.param .u64 .ptr .global .align 1 _layer_norm_backward_kernel_param_3,
.param .u64 .ptr .global .align 1 _layer_norm_backward_kernel_param_4,
.param .u64 .ptr .global .align 1 _layer_norm_backward_kernel_param_5,
.param .u64 .ptr .global .align 1 _layer_norm_backward_kernel_param_6,
.param .u64 .ptr .global .align 1 _layer_norm_backward_kernel_param_7,
.param .u32 _layer_norm_backward_kernel_param_8,
.param .u32 _layer_norm_backward_kernel_param_9,
.param .u32 _layer_norm_backward_kernel_param_10,
.param .u32 _layer_norm_backward_kernel_param_11,
.param .u32 _layer_norm_backward_kernel_param_12,
.param .u32 _layer_norm_backward_kernel_param_13,
.param .u32 _layer_norm_backward_kernel_param_14,
.param .u64 .ptr .global .align 1 _layer_norm_backward_kernel_param_15
)
.reqntid 128
{
.reg .pred %p<118>;
.reg .b32 %r<2918>;
.reg .b64 %rd<168>;
.loc 1 73 0 // layer_norm.py:73:0
$L__func_begin0:
.loc 1 73 0 // layer_norm.py:73:0
// %bb.0:
ld.param.b32 %r542, [_layer_norm_backward_kernel_param_14];
ld.param.b32 %r540, [_layer_norm_backward_kernel_param_11];
ld.param.b32 %r539, [_layer_norm_backward_kernel_param_10];
ld.param.b64 %rd46, [_layer_norm_backward_kernel_param_6];
ld.param.b64 %rd45, [_layer_norm_backward_kernel_param_5];
$L__tmp0:
.loc 1 100 33 // layer_norm.py:100:33
mov.u32 %r1, %ctaid.x;
.loc 1 101 31 // layer_norm.py:101:31
shl.b32 %r2, %r1, 5;
.loc 1 102 39 // layer_norm.py:102:39
add.s32 %r671, %r2, 32;
ld.param.b32 %r672, [_layer_norm_backward_kernel_param_13];
.loc 1 102 57 // layer_norm.py:102:57
min.s32 %r3, %r671, %r672;
.loc 1 103 24 // layer_norm.py:103:24
mov.u32 %r4, %tid.x;
shl.b32 %r673, %r4, 2;
and.b32 %r674, %r673, 508;
or.b32 %r5, %r674, 512;
or.b32 %r6, %r674, 1024;
or.b32 %r7, %r674, 1536;
or.b32 %r8, %r674, 2048;
or.b32 %r9, %r674, 2560;
or.b32 %r10, %r674, 3072;
or.b32 %r675, %r673, 3584;
or.b32 %r11, %r674, 4096;
or.b32 %r12, %r674, 4608;
or.b32 %r13, %r674, 5120;
or.b32 %r14, %r674, 5632;
or.b32 %r15, %r674, 6144;
or.b32 %r16, %r674, 6656;
or.b32 %r17, %r674, 7168;
or.b32 %r676, %r673, 7680;
.loc 1 117 28 // layer_norm.py:117:28
cvt.u64.u32 %rd1, %r674;
cvt.u64.u32 %rd9, %r675;
cvt.u64.u32 %rd18, %r676;
.loc 1 115 30 // layer_norm.py:115:30
setp.le.s32 %p1, %r3, %r2;
mov.b32 %r2790, 0;
shl.b64 %rd161, %rd18, 2;
shl.b64 %rd162, %rd9, 2;
setp.lt.s32 %p87, %r17, %r542;
cvt.u32.u64 %r2240, %rd18;
cvt.u32.u64 %r2241, %rd9;
cvt.u32.u64 %r2242, %rd1;
setp.lt.s32 %p86, %r16, %r542;
setp.lt.s32 %p85, %r15, %r542;
setp.lt.s32 %p84, %r14, %r542;
setp.lt.s32 %p83, %r13, %r542;
setp.lt.s32 %p82, %r12, %r542;
setp.lt.s32 %p81, %r11, %r542;
setp.lt.s32 %p79, %r10, %r542;
setp.lt.s32 %p78, %r9, %r542;
setp.lt.s32 %p77, %r8, %r542;
setp.lt.s32 %p76, %r7, %r542;
setp.lt.s32 %p75, %r6, %r542;
setp.lt.s32 %p74, %r5, %r542;
mov.b32 %r2791, %r2790;
mov.b32 %r2792, %r2790;
mov.b32 %r2793, %r2790;
mov.b32 %r2794, %r2790;
mov.b32 %r2795, %r2790;
mov.b32 %r2796, %r2790;
mov.b32 %r2797, %r2790;
mov.b32 %r2798, %r2790;
mov.b32 %r2799, %r2790;
mov.b32 %r2800, %r2790;
mov.b32 %r2801, %r2790;
mov.b32 %r2802, %r2790;
mov.b32 %r2803, %r2790;
mov.b32 %r2804, %r2790;
mov.b32 %r2805, %r2790;
mov.b32 %r2806, %r2790;
mov.b32 %r2807, %r2790;
mov.b32 %r2808, %r2790;
mov.b32 %r2809, %r2790;
mov.b32 %r2810, %r2790;
mov.b32 %r2811, %r2790;
mov.b32 %r2812, %r2790;
mov.b32 %r2813, %r2790;
mov.b32 %r2814, %r2790;
mov.b32 %r2815, %r2790;
mov.b32 %r2816, %r2790;
mov.b32 %r2817, %r2790;
mov.b32 %r2818, %r2790;
mov.b32 %r2819, %r2790;
mov.b32 %r2820, %r2790;
mov.b32 %r2821, %r2790;
mov.b32 %r2822, %r2790;
mov.b32 %r2823, %r2790;
mov.b32 %r2824, %r2790;
mov.b32 %r2825, %r2790;
mov.b32 %r2826, %r2790;
mov.b32 %r2827, %r2790;
mov.b32 %r2828, %r2790;
mov.b32 %r2829, %r2790;
mov.b32 %r2830, %r2790;
mov.b32 %r2831, %r2790;
mov.b32 %r2832, %r2790;
mov.b32 %r2833, %r2790;
mov.b32 %r2834, %r2790;
mov.b32 %r2835, %r2790;
mov.b32 %r2836, %r2790;
mov.b32 %r2837, %r2790;
mov.b32 %r2838, %r2790;
mov.b32 %r2839, %r2790;
mov.b32 %r2840, %r2790;
mov.b32 %r2841, %r2790;
mov.b32 %r2842, %r2790;
mov.b32 %r2843, %r2790;
mov.b32 %r2844, %r2790;
mov.b32 %r2845, %r2790;
mov.b32 %r2846, %r2790;
mov.b32 %r2847, %r2790;
mov.b32 %r2848, %r2790;
mov.b32 %r2849, %r2790;
mov.b32 %r2850, %r2790;
mov.b32 %r2851, %r2790;
mov.b32 %r2852, %r2790;
mov.b32 %r2853, %r2790;
mov.b32 %r2854, %r2790;
mov.b32 %r2855, %r2790;
mov.b32 %r2856, %r2790;
mov.b32 %r2857, %r2790;
mov.b32 %r2858, %r2790;
mov.b32 %r2859, %r2790;
mov.b32 %r2860, %r2790;
mov.b32 %r2861, %r2790;
mov.b32 %r2862, %r2790;
mov.b32 %r2863, %r2790;
mov.b32 %r2864, %r2790;
mov.b32 %r2865, %r2790;
mov.b32 %r2866, %r2790;
mov.b32 %r2867, %r2790;
mov.b32 %r2868, %r2790;
mov.b32 %r2869, %r2790;
mov.b32 %r2870, %r2790;
mov.b32 %r2871, %r2790;
mov.b32 %r2872, %r2790;
mov.b32 %r2873, %r2790;
mov.b32 %r2874, %r2790;
mov.b32 %r2875, %r2790;
mov.b32 %r2876, %r2790;
mov.b32 %r2877, %r2790;
mov.b32 %r2878, %r2790;
mov.b32 %r2879, %r2790;
mov.b32 %r2880, %r2790;
mov.b32 %r2881, %r2790;
mov.b32 %r2882, %r2790;
mov.b32 %r2883, %r2790;
mov.b32 %r2884, %r2790;
mov.b32 %r2885, %r2790;
mov.b32 %r2886, %r2790;
mov.b32 %r2887, %r2790;
mov.b32 %r2888, %r2790;
mov.b32 %r2889, %r2790;
mov.b32 %r2890, %r2790;
mov.b32 %r2891, %r2790;
mov.b32 %r2892, %r2790;
mov.b32 %r2893, %r2790;
mov.b32 %r2894, %r2790;
mov.b32 %r2895, %r2790;
mov.b32 %r2896, %r2790;
mov.b32 %r2897, %r2790;
mov.b32 %r2898, %r2790;
mov.b32 %r2899, %r2790;
mov.b32 %r2900, %r2790;
mov.b32 %r2901, %r2790;
mov.b32 %r2902, %r2790;
mov.b32 %r2903, %r2790;
mov.b32 %r2904, %r2790;
mov.b32 %r2905, %r2790;
mov.b32 %r2906, %r2790;
mov.b32 %r2907, %r2790;
mov.b32 %r2908, %r2790;
mov.b32 %r2909, %r2790;
mov.b32 %r2910, %r2790;
mov.b32 %r2911, %r2790;
mov.b32 %r2912, %r2790;
mov.b32 %r2913, %r2790;
mov.b32 %r2914, %r2790;
mov.b32 %r2915, %r2790;
mov.b32 %r2916, %r2790;
mov.b32 %r2917, %r2790;
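// %p1 is true when this program's row range [pid*32, min(pid*32+32, row bound)) is empty,
// in which case the per-row accumulation loop below is skipped entirely.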
@%p1 bra $L__BB0_3;
// %bb.1: // %.lr.ph
.loc 1 0 30 // layer_norm.py:0:30
ld.param.b32 %r541, [_layer_norm_backward_kernel_param_12];
ld.param.b32 %r538, [_layer_norm_backward_kernel_param_9];
ld.param.b32 %r537, [_layer_norm_backward_kernel_param_8];
ld.param.b64 %rd47, [_layer_norm_backward_kernel_param_7];
ld.param.b64 %rd44, [_layer_norm_backward_kernel_param_4];
ld.param.b64 %rd43, [_layer_norm_backward_kernel_param_3];
ld.param.b64 %rd42, [_layer_norm_backward_kernel_param_2];
ld.param.b64 %rd41, [_layer_norm_backward_kernel_param_0];
ld.param.b64 %rd48, [_layer_norm_backward_kernel_param_1];
mul.wide.u32 %rd49, %r674, 4;
add.s64 %rd72, %rd48, %rd49;
add.s64 %rd73, %rd72, 2048;
add.s64 %rd74, %rd72, 4096;
add.s64 %rd75, %rd72, 6144;
add.s64 %rd76, %rd72, 8192;
add.s64 %rd77, %rd72, 10240;
add.s64 %rd78, %rd72, 12288;
mul.wide.u32 %rd50, %r675, 4;
add.s64 %rd79, %rd48, %rd50;
add.s64 %rd80, %rd72, 16384;
add.s64 %rd81, %rd72, 18432;
add.s64 %rd82, %rd72, 20480;
add.s64 %rd83, %rd72, 22528;
add.s64 %rd84, %rd72, 24576;
add.s64 %rd85, %rd72, 26624;
add.s64 %rd86, %rd72, 28672;
mul.wide.u32 %rd51, %r676, 4;
add.s64 %rd87, %rd48, %rd51;
cvt.rn.f32.s32 %r18, %r542;
.loc 1 113 26 // layer_norm.py:113:26
mul.lo.s32 %r805, %r541, %r2;
.loc 1 113 14 // layer_norm.py:113:14
mul.wide.s32 %rd52, %r805, 4;
add.s64 %rd165, %rd47, %rd52;
.loc 1 112 26 // layer_norm.py:112:26
mul.lo.s32 %r806, %r538, %r2;
.loc 1 112 14 // layer_norm.py:112:14
mul.wide.s32 %rd53, %r806, 4;
add.s64 %rd164, %rd44, %rd53;
.loc 1 111 16 // layer_norm.py:111:16
mul.wide.s32 %rd54, %r2, 4;
add.s64 %rd166, %rd43, %rd54;
.loc 1 110 16 // layer_norm.py:110:16
add.s64 %rd167, %rd42, %rd54;
.loc 1 109 25 // layer_norm.py:109:25
mul.lo.s32 %r807, %r537, %r2;
.loc 1 109 13 // layer_norm.py:109:13
mul.wide.s32 %rd55, %r807, 4;
add.s64 %rd163, %rd41, %rd55;
.loc 1 103 24 // layer_norm.py:103:24
and.b32 %r19, %r4, 31;
shr.u32 %r808, %r4, 3;
and.b32 %r809, %r808, 12;
mov.b32 %r810, global_smem;
add.s32 %r1199, %r810, %r809;
add.s32 %r1208, %r810, %r673;
.loc 1 115 30 // layer_norm.py:115:30
mul.wide.s32 %rd26, %r541, 4;
and.b32 %r812, %r4, 127;
mul.wide.u32 %rd28, %r812, 16;
mul.wide.s32 %rd29, %r538, 4;
mul.wide.s32 %rd30, %r537, 4;
sub.s32 %r2661, %r3, %r2;
mov.b32 %r2790, 0f00000000;
setp.eq.s32 %p52, %r4, 0;
setp.lt.u32 %p51, %r4, 4;
setp.eq.s32 %p50, %r19, 0;
mov.b32 %r2791, %r2790;
mov.b32 %r2792, %r2790;
mov.b32 %r2793, %r2790;
mov.b32 %r2794, %r2790;
mov.b32 %r2795, %r2790;
mov.b32 %r2796, %r2790;
mov.b32 %r2797, %r2790;
mov.b32 %r2798, %r2790;
mov.b32 %r2799, %r2790;
mov.b32 %r2800, %r2790;
mov.b32 %r2801, %r2790;
mov.b32 %r2802, %r2790;
mov.b32 %r2803, %r2790;
mov.b32 %r2804, %r2790;
mov.b32 %r2805, %r2790;
mov.b32 %r2806, %r2790;
mov.b32 %r2807, %r2790;
mov.b32 %r2808, %r2790;
mov.b32 %r2809, %r2790;
mov.b32 %r2810, %r2790;
mov.b32 %r2811, %r2790;
mov.b32 %r2812, %r2790;
mov.b32 %r2813, %r2790;
mov.b32 %r2814, %r2790;
mov.b32 %r2815, %r2790;
mov.b32 %r2816, %r2790;
mov.b32 %r2817, %r2790;
mov.b32 %r2818, %r2790;
mov.b32 %r2819, %r2790;
mov.b32 %r2820, %r2790;
mov.b32 %r2821, %r2790;
mov.b32 %r2822, %r2790;
mov.b32 %r2823, %r2790;
mov.b32 %r2824, %r2790;
mov.b32 %r2825, %r2790;
mov.b32 %r2826, %r2790;
mov.b32 %r2827, %r2790;
mov.b32 %r2828, %r2790;
mov.b32 %r2829, %r2790;
mov.b32 %r2830, %r2790;
mov.b32 %r2831, %r2790;
mov.b32 %r2832, %r2790;
mov.b32 %r2833, %r2790;
mov.b32 %r2834, %r2790;
mov.b32 %r2835, %r2790;
mov.b32 %r2836, %r2790;
mov.b32 %r2837, %r2790;
mov.b32 %r2838, %r2790;
mov.b32 %r2839, %r2790;
mov.b32 %r2840, %r2790;
mov.b32 %r2841, %r2790;
mov.b32 %r2842, %r2790;
mov.b32 %r2843, %r2790;
mov.b32 %r2844, %r2790;
mov.b32 %r2845, %r2790;
mov.b32 %r2846, %r2790;
mov.b32 %r2847, %r2790;
mov.b32 %r2848, %r2790;
mov.b32 %r2849, %r2790;
mov.b32 %r2850, %r2790;
mov.b32 %r2851, %r2790;
mov.b32 %r2852, %r2790;
mov.b32 %r2853, %r2790;
mov.b32 %r2854, %r2790;
mov.b32 %r2855, %r2790;
mov.b32 %r2856, %r2790;
mov.b32 %r2857, %r2790;
mov.b32 %r2858, %r2790;
mov.b32 %r2859, %r2790;
mov.b32 %r2860, %r2790;
mov.b32 %r2861, %r2790;
mov.b32 %r2862, %r2790;
mov.b32 %r2863, %r2790;
mov.b32 %r2864, %r2790;
mov.b32 %r2865, %r2790;
mov.b32 %r2866, %r2790;
mov.b32 %r2867, %r2790;
mov.b32 %r2868, %r2790;
mov.b32 %r2869, %r2790;
mov.b32 %r2870, %r2790;
mov.b32 %r2871, %r2790;
mov.b32 %r2872, %r2790;
mov.b32 %r2873, %r2790;
mov.b32 %r2874, %r2790;
mov.b32 %r2875, %r2790;
mov.b32 %r2876, %r2790;
mov.b32 %r2877, %r2790;
mov.b32 %r2878, %r2790;
mov.b32 %r2879, %r2790;
mov.b32 %r2880, %r2790;
mov.b32 %r2881, %r2790;
mov.b32 %r2882, %r2790;
mov.b32 %r2883, %r2790;
mov.b32 %r2884, %r2790;
mov.b32 %r2885, %r2790;
mov.b32 %r2886, %r2790;
mov.b32 %r2887, %r2790;
mov.b32 %r2888, %r2790;
mov.b32 %r2889, %r2790;
mov.b32 %r2890, %r2790;
mov.b32 %r2891, %r2790;
mov.b32 %r2892, %r2790;
mov.b32 %r2893, %r2790;
mov.b32 %r2894, %r2790;
mov.b32 %r2895, %r2790;
mov.b32 %r2896, %r2790;
mov.b32 %r2897, %r2790;
mov.b32 %r2898, %r2790;
mov.b32 %r2899, %r2790;
mov.b32 %r2900, %r2790;
mov.b32 %r2901, %r2790;
mov.b32 %r2902, %r2790;
mov.b32 %r2903, %r2790;
mov.b32 %r2904, %r2790;
mov.b32 %r2905, %r2790;
mov.b32 %r2906, %r2790;
mov.b32 %r2907, %r2790;
mov.b32 %r2908, %r2790;
mov.b32 %r2909, %r2790;
mov.b32 %r2910, %r2790;
mov.b32 %r2911, %r2790;
mov.b32 %r2912, %r2790;
mov.b32 %r2913, %r2790;
mov.b32 %r2914, %r2790;
mov.b32 %r2915, %r2790;
mov.b32 %r2916, %r2790;
mov.b32 %r2917, %r2790;
$L__BB0_2: // =>This Inner Loop Header: Depth=1
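// Per-row loop body (layer_norm.py:115-134): masked vectorized loads of the row's inputs,
// recompute of the normalized activations (layer_norm.py:122), block-wide reductions,
// the DX computation and store, and accumulation of the register-resident partial sums.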
.loc 1 104 18 // layer_norm.py:104:18
setp.lt.s32 %p17, %r2240, %r542;
setp.lt.s32 %p9, %r2241, %r542;
setp.lt.s32 %p2, %r2242, %r542;
.loc 1 116 28 // layer_norm.py:116:28
add.s64 %rd56, %rd163, %rd28;
add.s64 %rd57, %rd56, 2048;
add.s64 %rd58, %rd56, 4096;
add.s64 %rd59, %rd56, 6144;
add.s64 %rd60, %rd56, 8192;
add.s64 %rd61, %rd56, 10240;
add.s64 %rd62, %rd56, 12288;
add.s64 %rd63, %rd163, %rd162;
add.s64 %rd64, %rd56, 16384;
add.s64 %rd65, %rd56, 18432;
add.s64 %rd66, %rd56, 20480;
add.s64 %rd67, %rd56, 22528;
add.s64 %rd68, %rd56, 24576;
add.s64 %rd69, %rd56, 26624;
add.s64 %rd70, %rd56, 28672;
.loc 1 116 20 // layer_norm.py:116:20
add.s64 %rd71, %rd163, %rd161;
mov.b32 %r817, 0;
// begin inline asm
mov.u32 %r2370, %r817;
mov.u32 %r2371, %r817;
mov.u32 %r2369, %r817;
mov.u32 %r2368, %r817;
@%p2 ld.global.v4.b32 { %r2370, %r2371, %r2369, %r2368 }, [ %rd56 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r2367, %r817;
mov.u32 %r2366, %r817;
mov.u32 %r2365, %r817;
mov.u32 %r2364, %r817;
@%p74 ld.global.v4.b32 { %r2367, %r2366, %r2365, %r2364 }, [ %rd57 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r2363, %r817;
mov.u32 %r2362, %r817;
mov.u32 %r2361, %r817;
mov.u32 %r2360, %r817;
@%p75 ld.global.v4.b32 { %r2363, %r2362, %r2361, %r2360 }, [ %rd58 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r2359, %r817;
mov.u32 %r2358, %r817;
mov.u32 %r2357, %r817;
mov.u32 %r2356, %r817;
@%p76 ld.global.v4.b32 { %r2359, %r2358, %r2357, %r2356 }, [ %rd59 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r2355, %r817;
mov.u32 %r2354, %r817;
mov.u32 %r2353, %r817;
mov.u32 %r2352, %r817;
@%p77 ld.global.v4.b32 { %r2355, %r2354, %r2353, %r2352 }, [ %rd60 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r2351, %r817;
mov.u32 %r2350, %r817;
mov.u32 %r2349, %r817;
mov.u32 %r2348, %r817;
@%p78 ld.global.v4.b32 { %r2351, %r2350, %r2349, %r2348 }, [ %rd61 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r2347, %r817;
mov.u32 %r2346, %r817;
mov.u32 %r2345, %r817;
mov.u32 %r2344, %r817;
@%p79 ld.global.v4.b32 { %r2347, %r2346, %r2345, %r2344 }, [ %rd62 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r2343, %r817;
mov.u32 %r2342, %r817;
mov.u32 %r2341, %r817;
mov.u32 %r2340, %r817;
@%p9 ld.global.v4.b32 { %r2343, %r2342, %r2341, %r2340 }, [ %rd63 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r2339, %r817;
mov.u32 %r2338, %r817;
mov.u32 %r2337, %r817;
mov.u32 %r2336, %r817;
@%p81 ld.global.v4.b32 { %r2339, %r2338, %r2337, %r2336 }, [ %rd64 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r2335, %r817;
mov.u32 %r2334, %r817;
mov.u32 %r2333, %r817;
mov.u32 %r2332, %r817;
@%p82 ld.global.v4.b32 { %r2335, %r2334, %r2333, %r2332 }, [ %rd65 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r2331, %r817;
mov.u32 %r2330, %r817;
mov.u32 %r2329, %r817;
mov.u32 %r2328, %r817;
@%p83 ld.global.v4.b32 { %r2331, %r2330, %r2329, %r2328 }, [ %rd66 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r2327, %r817;
mov.u32 %r2326, %r817;
mov.u32 %r2325, %r817;
mov.u32 %r2324, %r817;
@%p84 ld.global.v4.b32 { %r2327, %r2326, %r2325, %r2324 }, [ %rd67 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r2323, %r817;
mov.u32 %r2322, %r817;
mov.u32 %r2321, %r817;
mov.u32 %r2320, %r817;
@%p85 ld.global.v4.b32 { %r2323, %r2322, %r2321, %r2320 }, [ %rd68 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r2319, %r817;
mov.u32 %r2318, %r817;
mov.u32 %r2317, %r817;
mov.u32 %r2316, %r817;
@%p86 ld.global.v4.b32 { %r2319, %r2318, %r2317, %r2316 }, [ %rd69 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r2315, %r817;
mov.u32 %r2314, %r817;
mov.u32 %r2313, %r817;
mov.u32 %r2312, %r817;
@%p87 ld.global.v4.b32 { %r2315, %r2314, %r2313, %r2312 }, [ %rd70 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r2311, %r817;
mov.u32 %r2310, %r817;
mov.u32 %r2309, %r817;
mov.u32 %r2308, %r817;
@%p17 ld.global.v4.b32 { %r2311, %r2310, %r2309, %r2308 }, [ %rd71 + 0 ];
// end inline asm
.loc 1 117 20 // layer_norm.py:117:20
// begin inline asm
mov.u32 %r2243, %r817;
mov.u32 %r2244, %r817;
mov.u32 %r2245, %r817;
mov.u32 %r2246, %r817;
@%p2 ld.global.v4.b32 { %r2243, %r2244, %r2245, %r2246 }, [ %rd72 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r2247, %r817;
mov.u32 %r2248, %r817;
mov.u32 %r2249, %r817;
mov.u32 %r2250, %r817;
@%p74 ld.global.v4.b32 { %r2247, %r2248, %r2249, %r2250 }, [ %rd73 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r2251, %r817;
mov.u32 %r2252, %r817;
mov.u32 %r2253, %r817;
mov.u32 %r2254, %r817;
@%p75 ld.global.v4.b32 { %r2251, %r2252, %r2253, %r2254 }, [ %rd74 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r2255, %r817;
mov.u32 %r2256, %r817;
mov.u32 %r2257, %r817;
mov.u32 %r2258, %r817;
@%p76 ld.global.v4.b32 { %r2255, %r2256, %r2257, %r2258 }, [ %rd75 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r2259, %r817;
mov.u32 %r2260, %r817;
mov.u32 %r2261, %r817;
mov.u32 %r2262, %r817;
@%p77 ld.global.v4.b32 { %r2259, %r2260, %r2261, %r2262 }, [ %rd76 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r2263, %r817;
mov.u32 %r2264, %r817;
mov.u32 %r2265, %r817;
mov.u32 %r2266, %r817;
@%p78 ld.global.v4.b32 { %r2263, %r2264, %r2265, %r2266 }, [ %rd77 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r2267, %r817;
mov.u32 %r2268, %r817;
mov.u32 %r2269, %r817;
mov.u32 %r2270, %r817;
@%p79 ld.global.v4.b32 { %r2267, %r2268, %r2269, %r2270 }, [ %rd78 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r2271, %r817;
mov.u32 %r2272, %r817;
mov.u32 %r2273, %r817;
mov.u32 %r2274, %r817;
@%p9 ld.global.v4.b32 { %r2271, %r2272, %r2273, %r2274 }, [ %rd79 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r2275, %r817;
mov.u32 %r2276, %r817;
mov.u32 %r2277, %r817;
mov.u32 %r2278, %r817;
@%p81 ld.global.v4.b32 { %r2275, %r2276, %r2277, %r2278 }, [ %rd80 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r2279, %r817;
mov.u32 %r2280, %r817;
mov.u32 %r2281, %r817;
mov.u32 %r2282, %r817;
@%p82 ld.global.v4.b32 { %r2279, %r2280, %r2281, %r2282 }, [ %rd81 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r2283, %r817;
mov.u32 %r2284, %r817;
mov.u32 %r2285, %r817;
mov.u32 %r2286, %r817;
@%p83 ld.global.v4.b32 { %r2283, %r2284, %r2285, %r2286 }, [ %rd82 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r2287, %r817;
mov.u32 %r2288, %r817;
mov.u32 %r2289, %r817;
mov.u32 %r2290, %r817;
@%p84 ld.global.v4.b32 { %r2287, %r2288, %r2289, %r2290 }, [ %rd83 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r2291, %r817;
mov.u32 %r2292, %r817;
mov.u32 %r2293, %r817;
mov.u32 %r2294, %r817;
@%p85 ld.global.v4.b32 { %r2291, %r2292, %r2293, %r2294 }, [ %rd84 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r2295, %r817;
mov.u32 %r2296, %r817;
mov.u32 %r2297, %r817;
mov.u32 %r2298, %r817;
@%p86 ld.global.v4.b32 { %r2295, %r2296, %r2297, %r2298 }, [ %rd85 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r2299, %r817;
mov.u32 %r2300, %r817;
mov.u32 %r2301, %r817;
mov.u32 %r2302, %r817;
@%p87 ld.global.v4.b32 { %r2299, %r2300, %r2301, %r2302 }, [ %rd86 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r2303, %r817;
mov.u32 %r2304, %r817;
mov.u32 %r2305, %r817;
mov.u32 %r2306, %r817;
@%p17 ld.global.v4.b32 { %r2303, %r2304, %r2305, %r2306 }, [ %rd87 + 0 ];
// end inline asm
.loc 1 118 30 // layer_norm.py:118:30
add.s64 %rd88, %rd165, %rd28;
add.s64 %rd89, %rd88, 2048;
add.s64 %rd90, %rd88, 4096;
add.s64 %rd91, %rd88, 6144;
add.s64 %rd92, %rd88, 8192;
add.s64 %rd93, %rd88, 10240;
add.s64 %rd94, %rd88, 12288;
add.s64 %rd95, %rd165, %rd162;
add.s64 %rd96, %rd88, 16384;
add.s64 %rd97, %rd88, 18432;
add.s64 %rd98, %rd88, 20480;
add.s64 %rd99, %rd88, 22528;
add.s64 %rd100, %rd88, 24576;
add.s64 %rd101, %rd88, 26624;
add.s64 %rd102, %rd88, 28672;
.loc 1 118 21 // layer_norm.py:118:21
add.s64 %rd103, %rd165, %rd161;
// begin inline asm
mov.u32 %r2373, %r817;
mov.u32 %r2374, %r817;
mov.u32 %r2375, %r817;
mov.u32 %r2376, %r817;
@%p2 ld.global.v4.b32 { %r2373, %r2374, %r2375, %r2376 }, [ %rd88 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r2377, %r817;
mov.u32 %r2378, %r817;
mov.u32 %r2379, %r817;
mov.u32 %r2380, %r817;
@%p74 ld.global.v4.b32 { %r2377, %r2378, %r2379, %r2380 }, [ %rd89 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r2381, %r817;
mov.u32 %r2382, %r817;
mov.u32 %r2383, %r817;
mov.u32 %r2384, %r817;
@%p75 ld.global.v4.b32 { %r2381, %r2382, %r2383, %r2384 }, [ %rd90 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r2385, %r817;
mov.u32 %r2386, %r817;
mov.u32 %r2387, %r817;
mov.u32 %r2388, %r817;
@%p76 ld.global.v4.b32 { %r2385, %r2386, %r2387, %r2388 }, [ %rd91 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r2389, %r817;
mov.u32 %r2390, %r817;
mov.u32 %r2391, %r817;
mov.u32 %r2392, %r817;
@%p77 ld.global.v4.b32 { %r2389, %r2390, %r2391, %r2392 }, [ %rd92 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r2393, %r817;
mov.u32 %r2394, %r817;
mov.u32 %r2395, %r817;
mov.u32 %r2396, %r817;
@%p78 ld.global.v4.b32 { %r2393, %r2394, %r2395, %r2396 }, [ %rd93 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r2397, %r817;
mov.u32 %r2398, %r817;
mov.u32 %r2399, %r817;
mov.u32 %r2400, %r817;
@%p79 ld.global.v4.b32 { %r2397, %r2398, %r2399, %r2400 }, [ %rd94 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r2401, %r817;
mov.u32 %r2402, %r817;
mov.u32 %r2403, %r817;
mov.u32 %r2404, %r817;
@%p9 ld.global.v4.b32 { %r2401, %r2402, %r2403, %r2404 }, [ %rd95 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r2405, %r817;
mov.u32 %r2406, %r817;
mov.u32 %r2407, %r817;
mov.u32 %r2408, %r817;
@%p81 ld.global.v4.b32 { %r2405, %r2406, %r2407, %r2408 }, [ %rd96 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r2409, %r817;
mov.u32 %r2410, %r817;
mov.u32 %r2411, %r817;
mov.u32 %r2412, %r817;
@%p82 ld.global.v4.b32 { %r2409, %r2410, %r2411, %r2412 }, [ %rd97 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r2413, %r817;
mov.u32 %r2414, %r817;
mov.u32 %r2415, %r817;
mov.u32 %r2416, %r817;
@%p83 ld.global.v4.b32 { %r2413, %r2414, %r2415, %r2416 }, [ %rd98 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r2417, %r817;
mov.u32 %r2418, %r817;
mov.u32 %r2419, %r817;
mov.u32 %r2420, %r817;
@%p84 ld.global.v4.b32 { %r2417, %r2418, %r2419, %r2420 }, [ %rd99 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r2421, %r817;
mov.u32 %r2422, %r817;
mov.u32 %r2423, %r817;
mov.u32 %r2424, %r817;
@%p85 ld.global.v4.b32 { %r2421, %r2422, %r2423, %r2424 }, [ %rd100 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r2425, %r817;
mov.u32 %r2426, %r817;
mov.u32 %r2427, %r817;
mov.u32 %r2428, %r817;
@%p86 ld.global.v4.b32 { %r2425, %r2426, %r2427, %r2428 }, [ %rd101 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r2429, %r817;
mov.u32 %r2430, %r817;
mov.u32 %r2431, %r817;
mov.u32 %r2432, %r817;
@%p87 ld.global.v4.b32 { %r2429, %r2430, %r2431, %r2432 }, [ %rd102 + 0 ];
// end inline asm
// begin inline asm
mov.u32 %r2433, %r817;
mov.u32 %r2434, %r817;
mov.u32 %r2435, %r817;
mov.u32 %r2436, %r817;
@%p17 ld.global.v4.b32 { %r2433, %r2434, %r2435, %r2436 }, [ %rd103 + 0 ];
// end inline asm
.loc 1 119 23 // layer_norm.py:119:23
// begin inline asm
mov.u32 %r2372, 0x0;
ld.global.b32 { %r2372 }, [ %rd167 + 0 ];
// end inline asm
.loc 1 120 23 // layer_norm.py:120:23
// begin inline asm
mov.u32 %r2307, 0x0;
ld.global.b32 { %r2307 }, [ %rd166 + 0 ];
// end inline asm
$L__tmp1:
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:124:20 ]
bar.sync 0;
$L__tmp2:
.loc 1 127 26 // layer_norm.py:127:26
add.s64 %rd106, %rd164, %rd28;
add.s64 %rd107, %rd106, 2048;
add.s64 %rd108, %rd106, 4096;
add.s64 %rd109, %rd106, 6144;
add.s64 %rd110, %rd106, 8192;
add.s64 %rd111, %rd106, 10240;
add.s64 %rd112, %rd106, 12288;
add.s64 %rd113, %rd164, %rd162;
add.s64 %rd114, %rd106, 16384;
add.s64 %rd115, %rd106, 18432;
add.s64 %rd116, %rd106, 20480;
add.s64 %rd117, %rd106, 22528;
add.s64 %rd118, %rd106, 24576;
add.s64 %rd119, %rd106, 26624;
add.s64 %rd120, %rd106, 28672;
.loc 1 116 20 // layer_norm.py:116:20
add.s64 %rd121, %rd164, %rd161;
.loc 1 122 21 // layer_norm.py:122:21
sub.f32 %r1408, %r2371, %r2372;
sub.f32 %r1409, %r2370, %r2372;
sub.f32 %r1410, %r2369, %r2372;
sub.f32 %r1411, %r2368, %r2372;
sub.f32 %r1412, %r2367, %r2372;
sub.f32 %r1413, %r2366, %r2372;
sub.f32 %r1414, %r2365, %r2372;
sub.f32 %r1415, %r2364, %r2372;
sub.f32 %r1416, %r2363, %r2372;
sub.f32 %r1417, %r2362, %r2372;
sub.f32 %r1418, %r2361, %r2372;
sub.f32 %r1419, %r2360, %r2372;
sub.f32 %r1420, %r2359, %r2372;
sub.f32 %r1421, %r2358, %r2372;
sub.f32 %r1422, %r2357, %r2372;
sub.f32 %r1423, %r2356, %r2372;
sub.f32 %r1424, %r2355, %r2372;
sub.f32 %r1425, %r2354, %r2372;
sub.f32 %r1426, %r2353, %r2372;
sub.f32 %r1427, %r2352, %r2372;
sub.f32 %r1428, %r2351, %r2372;
sub.f32 %r1429, %r2350, %r2372;
sub.f32 %r1430, %r2349, %r2372;
sub.f32 %r1431, %r2348, %r2372;
sub.f32 %r1432, %r2347, %r2372;
sub.f32 %r1433, %r2346, %r2372;
sub.f32 %r1434, %r2345, %r2372;
sub.f32 %r1435, %r2344, %r2372;
sub.f32 %r1436, %r2343, %r2372;
sub.f32 %r1437, %r2342, %r2372;
sub.f32 %r1438, %r2341, %r2372;
sub.f32 %r1439, %r2340, %r2372;
sub.f32 %r1440, %r2339, %r2372;
sub.f32 %r1441, %r2338, %r2372;
sub.f32 %r1442, %r2337, %r2372;
sub.f32 %r1443, %r2336, %r2372;
sub.f32 %r1444, %r2335, %r2372;
sub.f32 %r1445, %r2334, %r2372;
sub.f32 %r1446, %r2333, %r2372;
sub.f32 %r1447, %r2332, %r2372;
sub.f32 %r1448, %r2331, %r2372;
sub.f32 %r1449, %r2330, %r2372;
sub.f32 %r1450, %r2329, %r2372;
sub.f32 %r1451, %r2328, %r2372;
sub.f32 %r1452, %r2327, %r2372;
sub.f32 %r1453, %r2326, %r2372;
sub.f32 %r1454, %r2325, %r2372;
sub.f32 %r1455, %r2324, %r2372;
sub.f32 %r1456, %r2323, %r2372;
sub.f32 %r1457, %r2322, %r2372;
sub.f32 %r1458, %r2321, %r2372;
sub.f32 %r1459, %r2320, %r2372;
sub.f32 %r1460, %r2319, %r2372;
sub.f32 %r1461, %r2318, %r2372;
sub.f32 %r1462, %r2317, %r2372;
sub.f32 %r1463, %r2316, %r2372;
sub.f32 %r1464, %r2315, %r2372;
sub.f32 %r1465, %r2314, %r2372;
sub.f32 %r1466, %r2313, %r2372;
sub.f32 %r1467, %r2312, %r2372;
sub.f32 %r1468, %r2311, %r2372;
sub.f32 %r1469, %r2310, %r2372;
sub.f32 %r1470, %r2309, %r2372;
sub.f32 %r1471, %r2308, %r2372;
.loc 1 122 29 // layer_norm.py:122:29
mul.f32 %r1472, %r1471, %r2307;
mul.f32 %r1473, %r1470, %r2307;
mul.f32 %r1474, %r1469, %r2307;
mul.f32 %r1475, %r1468, %r2307;
mul.f32 %r1476, %r1467, %r2307;
mul.f32 %r1477, %r1466, %r2307;
mul.f32 %r1478, %r1465, %r2307;
mul.f32 %r1479, %r1464, %r2307;
mul.f32 %r1480, %r1463, %r2307;
mul.f32 %r1481, %r1462, %r2307;
mul.f32 %r1482, %r1461, %r2307;
mul.f32 %r1483, %r1460, %r2307;
mul.f32 %r1484, %r1459, %r2307;
mul.f32 %r1485, %r1458, %r2307;
mul.f32 %r1486, %r1457, %r2307;
mul.f32 %r1487, %r1456, %r2307;
mul.f32 %r1488, %r1455, %r2307;
mul.f32 %r1489, %r1454, %r2307;
mul.f32 %r1490, %r1453, %r2307;
mul.f32 %r1491, %r1452, %r2307;
mul.f32 %r1492, %r1451, %r2307;
mul.f32 %r1493, %r1450, %r2307;
mul.f32 %r1494, %r1449, %r2307;
mul.f32 %r1495, %r1448, %r2307;
mul.f32 %r1496, %r1447, %r2307;
mul.f32 %r1497, %r1446, %r2307;
mul.f32 %r1498, %r1445, %r2307;
mul.f32 %r1499, %r1444, %r2307;
mul.f32 %r1500, %r1443, %r2307;
mul.f32 %r1501, %r1442, %r2307;
mul.f32 %r1502, %r1441, %r2307;
mul.f32 %r1503, %r1440, %r2307;
mul.f32 %r1504, %r1439, %r2307;
mul.f32 %r1505, %r1438, %r2307;
mul.f32 %r1506, %r1437, %r2307;
mul.f32 %r1507, %r1436, %r2307;
mul.f32 %r1508, %r1435, %r2307;
mul.f32 %r1509, %r1434, %r2307;
mul.f32 %r1510, %r1433, %r2307;
mul.f32 %r1511, %r1432, %r2307;
mul.f32 %r1512, %r1431, %r2307;
mul.f32 %r1513, %r1430, %r2307;
mul.f32 %r1514, %r1429, %r2307;
mul.f32 %r1515, %r1428, %r2307;
mul.f32 %r1516, %r1427, %r2307;
mul.f32 %r1517, %r1426, %r2307;
mul.f32 %r1518, %r1425, %r2307;
mul.f32 %r1519, %r1424, %r2307;
mul.f32 %r1520, %r1423, %r2307;
mul.f32 %r1521, %r1422, %r2307;
mul.f32 %r1522, %r1421, %r2307;
mul.f32 %r1523, %r1420, %r2307;
mul.f32 %r1524, %r1419, %r2307;
mul.f32 %r1525, %r1418, %r2307;
mul.f32 %r1526, %r1417, %r2307;
mul.f32 %r1527, %r1416, %r2307;
mul.f32 %r1528, %r1415, %r2307;
mul.f32 %r1529, %r1414, %r2307;
mul.f32 %r1530, %r1413, %r2307;
mul.f32 %r1531, %r1412, %r2307;
mul.f32 %r1532, %r1411, %r2307;
mul.f32 %r1533, %r1410, %r2307;
mul.f32 %r1534, %r1409, %r2307;
mul.f32 %r1535, %r1408, %r2307;
.loc 1 123 18 // layer_norm.py:123:18
mul.f32 %r1537, %r2243, %r2373;
mul.f32 %r1539, %r2244, %r2374;
mul.f32 %r1541, %r2245, %r2375;
mul.f32 %r1543, %r2246, %r2376;
mul.f32 %r1545, %r2247, %r2377;
mul.f32 %r1547, %r2248, %r2378;
mul.f32 %r1549, %r2249, %r2379;
mul.f32 %r1551, %r2250, %r2380;
mul.f32 %r1553, %r2251, %r2381;
mul.f32 %r1555, %r2252, %r2382;
mul.f32 %r1557, %r2253, %r2383;
mul.f32 %r1559, %r2254, %r2384;
mul.f32 %r1561, %r2255, %r2385;
mul.f32 %r1563, %r2256, %r2386;
mul.f32 %r1565, %r2257, %r2387;
mul.f32 %r1567, %r2258, %r2388;
mul.f32 %r1569, %r2259, %r2389;
mul.f32 %r1571, %r2260, %r2390;
mul.f32 %r1573, %r2261, %r2391;
mul.f32 %r1575, %r2262, %r2392;
mul.f32 %r1577, %r2263, %r2393;
mul.f32 %r1579, %r2264, %r2394;
mul.f32 %r1581, %r2265, %r2395;
mul.f32 %r1583, %r2266, %r2396;
mul.f32 %r1585, %r2267, %r2397;
mul.f32 %r1587, %r2268, %r2398;
mul.f32 %r1589, %r2269, %r2399;
mul.f32 %r1591, %r2270, %r2400;
mul.f32 %r1593, %r2271, %r2401;
mul.f32 %r1595, %r2272, %r2402;
mul.f32 %r1597, %r2273, %r2403;
mul.f32 %r1599, %r2274, %r2404;
mul.f32 %r1601, %r2275, %r2405;
mul.f32 %r1603, %r2276, %r2406;
mul.f32 %r1605, %r2277, %r2407;
mul.f32 %r1607, %r2278, %r2408;
mul.f32 %r1609, %r2279, %r2409;
mul.f32 %r1611, %r2280, %r2410;
mul.f32 %r1613, %r2281, %r2411;
mul.f32 %r1615, %r2282, %r2412;
mul.f32 %r1617, %r2283, %r2413;
mul.f32 %r1619, %r2284, %r2414;
mul.f32 %r1621, %r2285, %r2415;
mul.f32 %r1623, %r2286, %r2416;
mul.f32 %r1625, %r2287, %r2417;
mul.f32 %r1627, %r2288, %r2418;
mul.f32 %r1629, %r2289, %r2419;
mul.f32 %r1631, %r2290, %r2420;
mul.f32 %r1633, %r2291, %r2421;
mul.f32 %r1635, %r2292, %r2422;
mul.f32 %r1637, %r2293, %r2423;
mul.f32 %r1639, %r2294, %r2424;
mul.f32 %r1641, %r2295, %r2425;
mul.f32 %r1643, %r2296, %r2426;
mul.f32 %r1645, %r2297, %r2427;
mul.f32 %r1647, %r2298, %r2428;
mul.f32 %r1649, %r2299, %r2429;
mul.f32 %r1651, %r2300, %r2430;
mul.f32 %r1653, %r2301, %r2431;
mul.f32 %r1655, %r2302, %r2432;
mul.f32 %r1657, %r2303, %r2433;
mul.f32 %r1659, %r2304, %r2434;
mul.f32 %r1661, %r2305, %r2435;
mul.f32 %r1663, %r2306, %r2436;
.loc 1 124 28 // layer_norm.py:124:28
mul.f32 %r1664, %r1539, %r1535;
$L__tmp3:
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:124:20 ]
fma.rn.f32 %r1665, %r1537, %r1534, %r1664;
fma.rn.f32 %r1666, %r1541, %r1533, %r1665;
fma.rn.f32 %r1667, %r1543, %r1532, %r1666;
fma.rn.f32 %r1668, %r1545, %r1531, %r1667;
fma.rn.f32 %r1669, %r1547, %r1530, %r1668;
fma.rn.f32 %r1670, %r1549, %r1529, %r1669;
fma.rn.f32 %r1671, %r1551, %r1528, %r1670;
fma.rn.f32 %r1672, %r1553, %r1527, %r1671;
fma.rn.f32 %r1673, %r1555, %r1526, %r1672;
fma.rn.f32 %r1674, %r1557, %r1525, %r1673;
fma.rn.f32 %r1675, %r1559, %r1524, %r1674;
fma.rn.f32 %r1676, %r1561, %r1523, %r1675;
fma.rn.f32 %r1677, %r1563, %r1522, %r1676;
fma.rn.f32 %r1678, %r1565, %r1521, %r1677;
fma.rn.f32 %r1679, %r1567, %r1520, %r1678;
fma.rn.f32 %r1680, %r1569, %r1519, %r1679;
fma.rn.f32 %r1681, %r1571, %r1518, %r1680;
fma.rn.f32 %r1682, %r1573, %r1517, %r1681;
fma.rn.f32 %r1683, %r1575, %r1516, %r1682;
fma.rn.f32 %r1684, %r1577, %r1515, %r1683;
fma.rn.f32 %r1685, %r1579, %r1514, %r1684;
fma.rn.f32 %r1686, %r1581, %r1513, %r1685;
fma.rn.f32 %r1687, %r1583, %r1512, %r1686;
fma.rn.f32 %r1688, %r1585, %r1511, %r1687;
fma.rn.f32 %r1689, %r1587, %r1510, %r1688;
fma.rn.f32 %r1690, %r1589, %r1509, %r1689;
fma.rn.f32 %r1691, %r1591, %r1508, %r1690;
fma.rn.f32 %r1692, %r1593, %r1507, %r1691;
fma.rn.f32 %r1693, %r1595, %r1506, %r1692;
fma.rn.f32 %r1694, %r1597, %r1505, %r1693;
fma.rn.f32 %r1695, %r1599, %r1504, %r1694;
fma.rn.f32 %r1696, %r1601, %r1503, %r1695;
fma.rn.f32 %r1697, %r1603, %r1502, %r1696;
fma.rn.f32 %r1698, %r1605, %r1501, %r1697;
fma.rn.f32 %r1699, %r1607, %r1500, %r1698;
fma.rn.f32 %r1700, %r1609, %r1499, %r1699;
fma.rn.f32 %r1701, %r1611, %r1498, %r1700;
fma.rn.f32 %r1702, %r1613, %r1497, %r1701;
fma.rn.f32 %r1703, %r1615, %r1496, %r1702;
fma.rn.f32 %r1704, %r1617, %r1495, %r1703;
fma.rn.f32 %r1705, %r1619, %r1494, %r1704;
fma.rn.f32 %r1706, %r1621, %r1493, %r1705;
fma.rn.f32 %r1707, %r1623, %r1492, %r1706;
fma.rn.f32 %r1708, %r1625, %r1491, %r1707;
fma.rn.f32 %r1709, %r1627, %r1490, %r1708;
fma.rn.f32 %r1710, %r1629, %r1489, %r1709;
fma.rn.f32 %r1711, %r1631, %r1488, %r1710;
fma.rn.f32 %r1712, %r1633, %r1487, %r1711;
fma.rn.f32 %r1713, %r1635, %r1486, %r1712;
fma.rn.f32 %r1714, %r1637, %r1485, %r1713;
fma.rn.f32 %r1715, %r1639, %r1484, %r1714;
fma.rn.f32 %r1716, %r1641, %r1483, %r1715;
fma.rn.f32 %r1717, %r1643, %r1482, %r1716;
fma.rn.f32 %r1718, %r1645, %r1481, %r1717;
fma.rn.f32 %r1719, %r1647, %r1480, %r1718;
fma.rn.f32 %r1720, %r1649, %r1479, %r1719;
fma.rn.f32 %r1721, %r1651, %r1478, %r1720;
fma.rn.f32 %r1722, %r1653, %r1477, %r1721;
fma.rn.f32 %r1723, %r1655, %r1476, %r1722;
fma.rn.f32 %r1724, %r1657, %r1475, %r1723;
fma.rn.f32 %r1725, %r1659, %r1474, %r1724;
fma.rn.f32 %r1726, %r1661, %r1473, %r1725;
fma.rn.f32 %r2437, %r1663, %r1472, %r1726;
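// Tree reduction of the per-thread partial sums: intra-warp shfl.sync.bfly stages,
// then a cross-warp combine through global_smem (the inlined sum helper from standard.py).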
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:124:20 ]
shfl.sync.bfly.b32 %r2438, %r2437, 16, 31, -1;
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:124:20 ]
add.f32 %r2439, %r2437, %r2438;
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:124:20 ]
shfl.sync.bfly.b32 %r2440, %r2439, 8, 31, -1;
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:124:20 ]
add.f32 %r2441, %r2439, %r2440;
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:124:20 ]
shfl.sync.bfly.b32 %r2442, %r2441, 4, 31, -1;
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:124:20 ]
add.f32 %r2443, %r2441, %r2442;
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:124:20 ]
shfl.sync.bfly.b32 %r2444, %r2443, 2, 31, -1;
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:124:20 ]
add.f32 %r2445, %r2443, %r2444;
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:124:20 ]
shfl.sync.bfly.b32 %r2446, %r2445, 1, 31, -1;
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:124:20 ]
add.f32 %r2447, %r2445, %r2446;
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:124:20 ]
// begin inline asm
@%p50 st.shared.b32 [ %r1199 + 0 ], %r2447;
// end inline asm
bar.sync 0;
// begin inline asm
@%p51 ld.shared.b32 %r2448, [ %r1208 + 0 ];
// end inline asm
shfl.sync.bfly.b32 %r2449, %r2448, 2, 31, -1;
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:124:20 ]
add.f32 %r2450, %r2448, %r2449;
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:124:20 ]
shfl.sync.bfly.b32 %r2451, %r2450, 1, 31, -1;
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:124:20 ]
add.f32 %r2452, %r2450, %r2451;
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:124:20 ]
// begin inline asm
@%p52 st.shared.b32 [ %r1208 + 0 ], %r2452;
// end inline asm
bar.sync 0;
ld.shared.b32 %r1756, [global_smem];
$L__tmp4:
.loc 1 124 43 // layer_norm.py:124:43
div.full.f32 %r1757, %r1756, %r18;
$L__tmp5:
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:125:20 ]
bar.sync 0;
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:125:20 ]
fma.rn.f32 %r1758, %r2243, %r2373, %r1539;
fma.rn.f32 %r1759, %r2245, %r2375, %r1758;
fma.rn.f32 %r1760, %r2246, %r2376, %r1759;
fma.rn.f32 %r1761, %r2247, %r2377, %r1760;
fma.rn.f32 %r1762, %r2248, %r2378, %r1761;
fma.rn.f32 %r1763, %r2249, %r2379, %r1762;
fma.rn.f32 %r1764, %r2250, %r2380, %r1763;
fma.rn.f32 %r1765, %r2251, %r2381, %r1764;
fma.rn.f32 %r1766, %r2252, %r2382, %r1765;
fma.rn.f32 %r1767, %r2253, %r2383, %r1766;
fma.rn.f32 %r1768, %r2254, %r2384, %r1767;
fma.rn.f32 %r1769, %r2255, %r2385, %r1768;
fma.rn.f32 %r1770, %r2256, %r2386, %r1769;
fma.rn.f32 %r1771, %r2257, %r2387, %r1770;
fma.rn.f32 %r1772, %r2258, %r2388, %r1771;
fma.rn.f32 %r1773, %r2259, %r2389, %r1772;
fma.rn.f32 %r1774, %r2260, %r2390, %r1773;
fma.rn.f32 %r1775, %r2261, %r2391, %r1774;
fma.rn.f32 %r1776, %r2262, %r2392, %r1775;
fma.rn.f32 %r1777, %r2263, %r2393, %r1776;
fma.rn.f32 %r1778, %r2264, %r2394, %r1777;
fma.rn.f32 %r1779, %r2265, %r2395, %r1778;
fma.rn.f32 %r1780, %r2266, %r2396, %r1779;
fma.rn.f32 %r1781, %r2267, %r2397, %r1780;
fma.rn.f32 %r1782, %r2268, %r2398, %r1781;
fma.rn.f32 %r1783, %r2269, %r2399, %r1782;
fma.rn.f32 %r1784, %r2270, %r2400, %r1783;
fma.rn.f32 %r1785, %r2271, %r2401, %r1784;
fma.rn.f32 %r1786, %r2272, %r2402, %r1785;
fma.rn.f32 %r1787, %r2273, %r2403, %r1786;
fma.rn.f32 %r1788, %r2274, %r2404, %r1787;
fma.rn.f32 %r1789, %r2275, %r2405, %r1788;
fma.rn.f32 %r1790, %r2276, %r2406, %r1789;
fma.rn.f32 %r1791, %r2277, %r2407, %r1790;
fma.rn.f32 %r1792, %r2278, %r2408, %r1791;
fma.rn.f32 %r1793, %r2279, %r2409, %r1792;
fma.rn.f32 %r1794, %r2280, %r2410, %r1793;
fma.rn.f32 %r1795, %r2281, %r2411, %r1794;
fma.rn.f32 %r1796, %r2282, %r2412, %r1795;
fma.rn.f32 %r1797, %r2283, %r2413, %r1796;
fma.rn.f32 %r1798, %r2284, %r2414, %r1797;
fma.rn.f32 %r1799, %r2285, %r2415, %r1798;
fma.rn.f32 %r1800, %r2286, %r2416, %r1799;
fma.rn.f32 %r1801, %r2287, %r2417, %r1800;
fma.rn.f32 %r1802, %r2288, %r2418, %r1801;
fma.rn.f32 %r1803, %r2289, %r2419, %r1802;
fma.rn.f32 %r1804, %r2290, %r2420, %r1803;
fma.rn.f32 %r1805, %r2291, %r2421, %r1804;
fma.rn.f32 %r1806, %r2292, %r2422, %r1805;
fma.rn.f32 %r1807, %r2293, %r2423, %r1806;
fma.rn.f32 %r1808, %r2294, %r2424, %r1807;
fma.rn.f32 %r1809, %r2295, %r2425, %r1808;
fma.rn.f32 %r1810, %r2296, %r2426, %r1809;
fma.rn.f32 %r1811, %r2297, %r2427, %r1810;
fma.rn.f32 %r1812, %r2298, %r2428, %r1811;
fma.rn.f32 %r1813, %r2299, %r2429, %r1812;
fma.rn.f32 %r1814, %r2300, %r2430, %r1813;
fma.rn.f32 %r1815, %r2301, %r2431, %r1814;
fma.rn.f32 %r1816, %r2302, %r2432, %r1815;
fma.rn.f32 %r1817, %r2303, %r2433, %r1816;
fma.rn.f32 %r1818, %r2304, %r2434, %r1817;
fma.rn.f32 %r1819, %r2305, %r2435, %r1818;
fma.rn.f32 %r2453, %r2306, %r2436, %r1819;
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:125:20 ]
shfl.sync.bfly.b32 %r2454, %r2453, 16, 31, -1;
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:125:20 ]
add.f32 %r2455, %r2453, %r2454;
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:125:20 ]
shfl.sync.bfly.b32 %r2456, %r2455, 8, 31, -1;
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:125:20 ]
add.f32 %r2457, %r2455, %r2456;
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:125:20 ]
shfl.sync.bfly.b32 %r2458, %r2457, 4, 31, -1;
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:125:20 ]
add.f32 %r2459, %r2457, %r2458;
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:125:20 ]
shfl.sync.bfly.b32 %r2460, %r2459, 2, 31, -1;
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:125:20 ]
add.f32 %r2461, %r2459, %r2460;
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:125:20 ]
shfl.sync.bfly.b32 %r2462, %r2461, 1, 31, -1;
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:125:20 ]
add.f32 %r2463, %r2461, %r2462;
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:125:20 ]
// begin inline asm
@%p50 st.shared.b32 [ %r1199 + 0 ], %r2463;
// end inline asm
bar.sync 0;
// begin inline asm
@%p51 ld.shared.b32 %r2464, [ %r1208 + 0 ];
// end inline asm
shfl.sync.bfly.b32 %r2465, %r2464, 2, 31, -1;
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:125:20 ]
add.f32 %r2466, %r2464, %r2465;
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:125:20 ]
shfl.sync.bfly.b32 %r2467, %r2466, 1, 31, -1;
.loc 2 260 15 // standard.py:260:15 @[ layer_norm.py:125:20 ]
add.f32 %r2468, %r2466, %r2467;
.loc 2 290 36 // standard.py:290:36 @[ layer_norm.py:125:20 ]
// begin inline asm
@%p52 st.shared.b32 [ %r1208 + 0 ], %r2468;
// end inline asm
bar.sync 0;
ld.shared.b32 %r1849, [global_smem];
$L__tmp6:
.loc 1 125 35 // layer_norm.py:125:35
div.full.f32 %r1850, %r1849, %r18;
.loc 1 126 34 // layer_norm.py:126:34
fma.rn.f32 %r1851, %r1534, %r1757, %r1850;
fma.rn.f32 %r1852, %r1535, %r1757, %r1850;
fma.rn.f32 %r1853, %r1533, %r1757, %r1850;
fma.rn.f32 %r1854, %r1532, %r1757, %r1850;
fma.rn.f32 %r1855, %r1531, %r1757, %r1850;
fma.rn.f32 %r1856, %r1530, %r1757, %r1850;
fma.rn.f32 %r1857, %r1529, %r1757, %r1850;
fma.rn.f32 %r1858, %r1528, %r1757, %r1850;
fma.rn.f32 %r1859, %r1527, %r1757, %r1850;
fma.rn.f32 %r1860, %r1526, %r1757, %r1850;
fma.rn.f32 %r1861, %r1525, %r1757, %r1850;
fma.rn.f32 %r1862, %r1524, %r1757, %r1850;
fma.rn.f32 %r1863, %r1523, %r1757, %r1850;
fma.rn.f32 %r1864, %r1522, %r1757, %r1850;
fma.rn.f32 %r1865, %r1521, %r1757, %r1850;
fma.rn.f32 %r1866, %r1520, %r1757, %r1850;
fma.rn.f32 %r1867, %r1519, %r1757, %r1850;
fma.rn.f32 %r1868, %r1518, %r1757, %r1850;
fma.rn.f32 %r1869, %r1517, %r1757, %r1850;
fma.rn.f32 %r1870, %r1516, %r1757, %r1850;
fma.rn.f32 %r1871, %r1515, %r1757, %r1850;
fma.rn.f32 %r1872, %r1514, %r1757, %r1850;
fma.rn.f32 %r1873, %r1513, %r1757, %r1850;
fma.rn.f32 %r1874, %r1512, %r1757, %r1850;
fma.rn.f32 %r1875, %r1511, %r1757, %r1850;
fma.rn.f32 %r1876, %r1510, %r1757, %r1850;
fma.rn.f32 %r1877, %r1509, %r1757, %r1850;
fma.rn.f32 %r1878, %r1508, %r1757, %r1850;
fma.rn.f32 %r1879, %r1507, %r1757, %r1850;
fma.rn.f32 %r1880, %r1506, %r1757, %r1850;
fma.rn.f32 %r1881, %r1505, %r1757, %r1850;
fma.rn.f32 %r1882, %r1504, %r1757, %r1850;
fma.rn.f32 %r1883, %r1503, %r1757, %r1850;
fma.rn.f32 %r1884, %r1502, %r1757, %r1850;
fma.rn.f32 %r1885, %r1501, %r1757, %r1850;
fma.rn.f32 %r1886, %r1500, %r1757, %r1850;
fma.rn.f32 %r1887, %r1499, %r1757, %r1850;
fma.rn.f32 %r1888, %r1498, %r1757, %r1850;
fma.rn.f32 %r1889, %r1497, %r1757, %r1850;
fma.rn.f32 %r1890, %r1496, %r1757, %r1850;
fma.rn.f32 %r1891, %r1495, %r1757, %r1850;
fma.rn.f32 %r1892, %r1494, %r1757, %r1850;
fma.rn.f32 %r1893, %r1493, %r1757, %r1850;
fma.rn.f32 %r1894, %r1492, %r1757, %r1850;
fma.rn.f32 %r1895, %r1491, %r1757, %r1850;
fma.rn.f32 %r1896, %r1490, %r1757, %r1850;
fma.rn.f32 %r1897, %r1489, %r1757, %r1850;
fma.rn.f32 %r1898, %r1488, %r1757, %r1850;
fma.rn.f32 %r1899, %r1487, %r1757, %r1850;
fma.rn.f32 %r1900, %r1486, %r1757, %r1850;
fma.rn.f32 %r1901, %r1485, %r1757, %r1850;
fma.rn.f32 %r1902, %r1484, %r1757, %r1850;
fma.rn.f32 %r1903, %r1483, %r1757, %r1850;
fma.rn.f32 %r1904, %r1482, %r1757, %r1850;
fma.rn.f32 %r1905, %r1481, %r1757, %r1850;
fma.rn.f32 %r1906, %r1480, %r1757, %r1850;
fma.rn.f32 %r1907, %r1479, %r1757, %r1850;
fma.rn.f32 %r1908, %r1478, %r1757, %r1850;
fma.rn.f32 %r1909, %r1477, %r1757, %r1850;
fma.rn.f32 %r1910, %r1476, %r1757, %r1850;
fma.rn.f32 %r1911, %r1475, %r1757, %r1850;
fma.rn.f32 %r1912, %r1474, %r1757, %r1850;
fma.rn.f32 %r1913, %r1473, %r1757, %r1850;
fma.rn.f32 %r1914, %r1472, %r1757, %r1850;
.loc 1 126 21 // layer_norm.py:126:21
neg.f32 %r1915, %r1851;
fma.rn.f32 %r1916, %r2243, %r2373, %r1915;
neg.f32 %r1917, %r1852;
fma.rn.f32 %r1918, %r2244, %r2374, %r1917;
neg.f32 %r1919, %r1853;
fma.rn.f32 %r1920, %r2245, %r2375, %r1919;
neg.f32 %r1921, %r1854;
fma.rn.f32 %r1922, %r2246, %r2376, %r1921;
neg.f32 %r1923, %r1855;
fma.rn.f32 %r1924, %r2247, %r2377, %r1923;
neg.f32 %r1925, %r1856;
fma.rn.f32 %r1926, %r2248, %r2378, %r1925;
neg.f32 %r1927, %r1857;
fma.rn.f32 %r1928, %r2249, %r2379, %r1927;
neg.f32 %r1929, %r1858;
fma.rn.f32 %r1930, %r2250, %r2380, %r1929;
neg.f32 %r1931, %r1859;
fma.rn.f32 %r1932, %r2251, %r2381, %r1931;
neg.f32 %r1933, %r1860;
fma.rn.f32 %r1934, %r2252, %r2382, %r1933;
neg.f32 %r1935, %r1861;
fma.rn.f32 %r1936, %r2253, %r2383, %r1935;
neg.f32 %r1937, %r1862;
fma.rn.f32 %r1938, %r2254, %r2384, %r1937;
neg.f32 %r1939, %r1863;
fma.rn.f32 %r1940, %r2255, %r2385, %r1939;
neg.f32 %r1941, %r1864;
fma.rn.f32 %r1942, %r2256, %r2386, %r1941;
neg.f32 %r1943, %r1865;
fma.rn.f32 %r1944, %r2257, %r2387, %r1943;
neg.f32 %r1945, %r1866;
fma.rn.f32 %r1946, %r2258, %r2388, %r1945;
neg.f32 %r1947, %r1867;
fma.rn.f32 %r1948, %r2259, %r2389, %r1947;
neg.f32 %r1949, %r1868;
fma.rn.f32 %r1950, %r2260, %r2390, %r1949;
neg.f32 %r1951, %r1869;
fma.rn.f32 %r1952, %r2261, %r2391, %r1951;
neg.f32 %r1953, %r1870;
fma.rn.f32 %r1954, %r2262, %r2392, %r1953;
neg.f32 %r1955, %r1871;
fma.rn.f32 %r1956, %r2263, %r2393, %r1955;
neg.f32 %r1957, %r1872;
fma.rn.f32 %r1958, %r2264, %r2394, %r1957;
neg.f32 %r1959, %r1873;
fma.rn.f32 %r1960, %r2265, %r2395, %r1959;
neg.f32 %r1961, %r1874;
fma.rn.f32 %r1962, %r2266, %r2396, %r1961;
neg.f32 %r1963, %r1875;
fma.rn.f32 %r1964, %r2267, %r2397, %r1963;
neg.f32 %r1965, %r1876;
fma.rn.f32 %r1966, %r2268, %r2398, %r1965;
neg.f32 %r1967, %r1877;
fma.rn.f32 %r1968, %r2269, %r2399, %r1967;
neg.f32 %r1969, %r1878;
fma.rn.f32 %r1970, %r2270, %r2400, %r1969;
neg.f32 %r1971, %r1879;
fma.rn.f32 %r1972, %r2271, %r2401, %r1971;
neg.f32 %r1973, %r1880;
fma.rn.f32 %r1974, %r2272, %r2402, %r1973;
neg.f32 %r1975, %r1881;
fma.rn.f32 %r1976, %r2273, %r2403, %r1975;
neg.f32 %r1977, %r1882;
fma.rn.f32 %r1978, %r2274, %r2404, %r1977;
neg.f32 %r1979, %r1883;
fma.rn.f32 %r1980, %r2275, %r2405, %r1979;
neg.f32 %r1981, %r1884;
fma.rn.f32 %r1982, %r2276, %r2406, %r1981;
neg.f32 %r1983, %r1885;
fma.rn.f32 %r1984, %r2277, %r2407, %r1983;
neg.f32 %r1985, %r1886;
fma.rn.f32 %r1986, %r2278, %r2408, %r1985;
neg.f32 %r1987, %r1887;
fma.rn.f32 %r1988, %r2279, %r2409, %r1987;
neg.f32 %r1989, %r1888;
fma.rn.f32 %r1990, %r2280, %r2410, %r1989;
neg.f32 %r1991, %r1889;
fma.rn.f32 %r1992, %r2281, %r2411, %r1991;
neg.f32 %r1993, %r1890;
fma.rn.f32 %r1994, %r2282, %r2412, %r1993;
neg.f32 %r1995, %r1891;
fma.rn.f32 %r1996, %r2283, %r2413, %r1995;
neg.f32 %r1997, %r1892;
fma.rn.f32 %r1998, %r2284, %r2414, %r1997;
neg.f32 %r1999, %r1893;
fma.rn.f32 %r2000, %r2285, %r2415, %r1999;
neg.f32 %r2001, %r1894;
fma.rn.f32 %r2002, %r2286, %r2416, %r2001;
neg.f32 %r2003, %r1895;
fma.rn.f32 %r2004, %r2287, %r2417, %r2003;
neg.f32 %r2005, %r1896;
fma.rn.f32 %r2006, %r2288, %r2418, %r2005;
neg.f32 %r2007, %r1897;
fma.rn.f32 %r2008, %r2289, %r2419, %r2007;
neg.f32 %r2009, %r1898;
fma.rn.f32 %r2010, %r2290, %r2420, %r2009;
neg.f32 %r2011, %r1899;
fma.rn.f32 %r2012, %r2291, %r2421, %r2011;
neg.f32 %r2013, %r1900;
fma.rn.f32 %r2014, %r2292, %r2422, %r2013;
neg.f32 %r2015, %r1901;
fma.rn.f32 %r2016, %r2293, %r2423, %r2015;
neg.f32 %r2017, %r1902;
fma.rn.f32 %r2018, %r2294, %r2424, %r2017;
neg.f32 %r2019, %r1903;
fma.rn.f32 %r2020, %r2295, %r2425, %r2019;
neg.f32 %r2021, %r1904;
fma.rn.f32 %r2022, %r2296, %r2426, %r2021;
neg.f32 %r2023, %r1905;
fma.rn.f32 %r2024, %r2297, %r2427, %r2023;
neg.f32 %r2025, %r1906;
fma.rn.f32 %r2026, %r2298, %r2428, %r2025;
neg.f32 %r2027, %r1907;
fma.rn.f32 %r2028, %r2299, %r2429, %r2027;
neg.f32 %r2029, %r1908;
fma.rn.f32 %r2030, %r2300, %r2430, %r2029;
neg.f32 %r2031, %r1909;
fma.rn.f32 %r2032, %r2301, %r2431, %r2031;
neg.f32 %r2033, %r1910;
fma.rn.f32 %r2034, %r2302, %r2432, %r2033;
neg.f32 %r2035, %r1911;
fma.rn.f32 %r2036, %r2303, %r2433, %r2035;
neg.f32 %r2037, %r1912;
fma.rn.f32 %r2038, %r2304, %r2434, %r2037;
neg.f32 %r2039, %r1913;
fma.rn.f32 %r2040, %r2305, %r2435, %r2039;
neg.f32 %r2041, %r1914;
fma.rn.f32 %r2042, %r2306, %r2436, %r2041;
.loc 1 126 41 // layer_norm.py:126:41
mul.f32 %r2469, %r1916, %r2307;
mul.f32 %r2470, %r1918, %r2307;
mul.f32 %r2471, %r1920, %r2307;
mul.f32 %r2472, %r1922, %r2307;
mul.f32 %r2473, %r1924, %r2307;
mul.f32 %r2474, %r1926, %r2307;
mul.f32 %r2475, %r1928, %r2307;
mul.f32 %r2476, %r1930, %r2307;
mul.f32 %r2477, %r1932, %r2307;
mul.f32 %r2478, %r1934, %r2307;
mul.f32 %r2479, %r1936, %r2307;
mul.f32 %r2480, %r1938, %r2307;
mul.f32 %r2481, %r1940, %r2307;
mul.f32 %r2482, %r1942, %r2307;
mul.f32 %r2483, %r1944, %r2307;
mul.f32 %r2484, %r1946, %r2307;
mul.f32 %r2485, %r1948, %r2307;
mul.f32 %r2486, %r1950, %r2307;
mul.f32 %r2487, %r1952, %r2307;
mul.f32 %r2488, %r1954, %r2307;
mul.f32 %r2489, %r1956, %r2307;
mul.f32 %r2490, %r1958, %r2307;
mul.f32 %r2491, %r1960, %r2307;
mul.f32 %r2492, %r1962, %r2307;
mul.f32 %r2493, %r1964, %r2307;
mul.f32 %r2494, %r1966, %r2307;
mul.f32 %r2495, %r1968, %r2307;
mul.f32 %r2496, %r1970, %r2307;
mul.f32 %r2497, %r1972, %r2307;
mul.f32 %r2498, %r1974, %r2307;
mul.f32 %r2499, %r1976, %r2307;
mul.f32 %r2500, %r1978, %r2307;
mul.f32 %r2501, %r1980, %r2307;
mul.f32 %r2502, %r1982, %r2307;
mul.f32 %r2503, %r1984, %r2307;
mul.f32 %r2504, %r1986, %r2307;
mul.f32 %r2505, %r1988, %r2307;
mul.f32 %r2506, %r1990, %r2307;
mul.f32 %r2507, %r1992, %r2307;
mul.f32 %r2508, %r1994, %r2307;
mul.f32 %r2509, %r1996, %r2307;
mul.f32 %r2510, %r1998, %r2307;
mul.f32 %r2511, %r2000, %r2307;
mul.f32 %r2512, %r2002, %r2307;
mul.f32 %r2513, %r2004, %r2307;
mul.f32 %r2514, %r2006, %r2307;
mul.f32 %r2515, %r2008, %r2307;
mul.f32 %r2516, %r2010, %r2307;
mul.f32 %r2517, %r2012, %r2307;
mul.f32 %r2518, %r2014, %r2307;
mul.f32 %r2519, %r2016, %r2307;
mul.f32 %r2520, %r2018, %r2307;
mul.f32 %r2521, %r2020, %r2307;
mul.f32 %r2522, %r2022, %r2307;
mul.f32 %r2523, %r2024, %r2307;
mul.f32 %r2524, %r2026, %r2307;
mul.f32 %r2525, %r2028, %r2307;
mul.f32 %r2526, %r2030, %r2307;
mul.f32 %r2527, %r2032, %r2307;
mul.f32 %r2528, %r2034, %r2307;
mul.f32 %r2529, %r2036, %r2307;
mul.f32 %r2530, %r2038, %r2307;
mul.f32 %r2531, %r2040, %r2307;
mul.f32 %r2532, %r2042, %r2307;
.loc 1 127 32 // layer_norm.py:127:32
// begin inline asm
@%p2 st.global.v4.b32 [ %rd106 + 0 ], { %r2469, %r2470, %r2471, %r2472 };
// end inline asm
// begin inline asm
@%p74 st.global.v4.b32 [ %rd107 + 0 ], { %r2473, %r2474, %r2475, %r2476 };
// end inline asm
// begin inline asm
@%p75 st.global.v4.b32 [ %rd108 + 0 ], { %r2477, %r2478, %r2479, %r2480 };
// end inline asm
// begin inline asm
@%p76 st.global.v4.b32 [ %rd109 + 0 ], { %r2481, %r2482, %r2483, %r2484 };
// end inline asm
// begin inline asm
@%p77 st.global.v4.b32 [ %rd110 + 0 ], { %r2485, %r2486, %r2487, %r2488 };
// end inline asm
// begin inline asm
@%p78 st.global.v4.b32 [ %rd111 + 0 ], { %r2489, %r2490, %r2491, %r2492 };
// end inline asm
// begin inline asm
@%p79 st.global.v4.b32 [ %rd112 + 0 ], { %r2493, %r2494, %r2495, %r2496 };
// end inline asm
// begin inline asm
@%p9 st.global.v4.b32 [ %rd113 + 0 ], { %r2497, %r2498, %r2499, %r2500 };
// end inline asm
// begin inline asm
@%p81 st.global.v4.b32 [ %rd114 + 0 ], { %r2501, %r2502, %r2503, %r2504 };
// end inline asm
// begin inline asm
@%p82 st.global.v4.b32 [ %rd115 + 0 ], { %r2505, %r2506, %r2507, %r2508 };
// end inline asm
// begin inline asm
@%p83 st.global.v4.b32 [ %rd116 + 0 ], { %r2509, %r2510, %r2511, %r2512 };
// end inline asm
// begin inline asm
@%p84 st.global.v4.b32 [ %rd117 + 0 ], { %r2513, %r2514, %r2515, %r2516 };
// end inline asm
// begin inline asm
@%p85 st.global.v4.b32 [ %rd118 + 0 ], { %r2517, %r2518, %r2519, %r2520 };
// end inline asm
// begin inline asm
@%p86 st.global.v4.b32 [ %rd119 + 0 ], { %r2521, %r2522, %r2523, %r2524 };
// end inline asm
// begin inline asm
@%p87 st.global.v4.b32 [ %rd120 + 0 ], { %r2525, %r2526, %r2527, %r2528 };
// end inline asm
// begin inline asm
@%p17 st.global.v4.b32 [ %rd121 + 0 ], { %r2529, %r2530, %r2531, %r2532 };
// end inline asm
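// layer_norm.py:129: fold this row into the register-resident partial sums.
// The fma.rn.f32 run accumulates dy * x-hat; the add.f32 run accumulates dy.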
.loc 1 129 18 // layer_norm.py:129:18
fma.rn.f32 %r2844, %r1481, %r2427, %r2844;
fma.rn.f32 %r2843, %r1482, %r2426, %r2843;
fma.rn.f32 %r2842, %r1483, %r2425, %r2842;
fma.rn.f32 %r2841, %r1484, %r2424, %r2841;
fma.rn.f32 %r2840, %r1485, %r2423, %r2840;
fma.rn.f32 %r2839, %r1486, %r2422, %r2839;
fma.rn.f32 %r2838, %r1487, %r2421, %r2838;
fma.rn.f32 %r2837, %r1488, %r2420, %r2837;
fma.rn.f32 %r2836, %r1489, %r2419, %r2836;
fma.rn.f32 %r2835, %r1490, %r2418, %r2835;
fma.rn.f32 %r2834, %r1491, %r2417, %r2834;
fma.rn.f32 %r2833, %r1492, %r2416, %r2833;
fma.rn.f32 %r2832, %r1493, %r2415, %r2832;
fma.rn.f32 %r2831, %r1494, %r2414, %r2831;
fma.rn.f32 %r2830, %r1495, %r2413, %r2830;
fma.rn.f32 %r2829, %r1496, %r2412, %r2829;
fma.rn.f32 %r2828, %r1497, %r2411, %r2828;
fma.rn.f32 %r2827, %r1498, %r2410, %r2827;
fma.rn.f32 %r2826, %r1499, %r2409, %r2826;
fma.rn.f32 %r2825, %r1500, %r2408, %r2825;
fma.rn.f32 %r2824, %r1501, %r2407, %r2824;
fma.rn.f32 %r2823, %r1502, %r2406, %r2823;
fma.rn.f32 %r2822, %r1503, %r2405, %r2822;
fma.rn.f32 %r2821, %r1504, %r2404, %r2821;
fma.rn.f32 %r2820, %r1505, %r2403, %r2820;
fma.rn.f32 %r2819, %r1506, %r2402, %r2819;
fma.rn.f32 %r2818, %r1507, %r2401, %r2818;
fma.rn.f32 %r2817, %r1508, %r2400, %r2817;
fma.rn.f32 %r2816, %r1509, %r2399, %r2816;
fma.rn.f32 %r2815, %r1510, %r2398, %r2815;
fma.rn.f32 %r2814, %r1511, %r2397, %r2814;
fma.rn.f32 %r2813, %r1512, %r2396, %r2813;
fma.rn.f32 %r2812, %r1513, %r2395, %r2812;
fma.rn.f32 %r2811, %r1514, %r2394, %r2811;
fma.rn.f32 %r2810, %r1515, %r2393, %r2810;
fma.rn.f32 %r2809, %r1516, %r2392, %r2809;
fma.rn.f32 %r2808, %r1517, %r2391, %r2808;
fma.rn.f32 %r2807, %r1518, %r2390, %r2807;
fma.rn.f32 %r2806, %r1519, %r2389, %r2806;
fma.rn.f32 %r2805, %r1520, %r2388, %r2805;
fma.rn.f32 %r2804, %r1521, %r2387, %r2804;
fma.rn.f32 %r2803, %r1522, %r2386, %r2803;
fma.rn.f32 %r2802, %r1523, %r2385, %r2802;
fma.rn.f32 %r2801, %r1524, %r2384, %r2801;
fma.rn.f32 %r2800, %r1525, %r2383, %r2800;
fma.rn.f32 %r2799, %r1526, %r2382, %r2799;
fma.rn.f32 %r2798, %r1527, %r2381, %r2798;
fma.rn.f32 %r2797, %r1528, %r2380, %r2797;
fma.rn.f32 %r2796, %r1529, %r2379, %r2796;
fma.rn.f32 %r2795, %r1530, %r2378, %r2795;
fma.rn.f32 %r2794, %r1531, %r2377, %r2794;
fma.rn.f32 %r2793, %r1532, %r2376, %r2793;
fma.rn.f32 %r2792, %r1533, %r2375, %r2792;
fma.rn.f32 %r2791, %r1535, %r2374, %r2791;
fma.rn.f32 %r2790, %r1534, %r2373, %r2790;
fma.rn.f32 %r2845, %r1480, %r2428, %r2845;
fma.rn.f32 %r2846, %r1479, %r2429, %r2846;
fma.rn.f32 %r2847, %r1478, %r2430, %r2847;
fma.rn.f32 %r2848, %r1477, %r2431, %r2848;
fma.rn.f32 %r2849, %r1476, %r2432, %r2849;
fma.rn.f32 %r2850, %r1475, %r2433, %r2850;
fma.rn.f32 %r2851, %r1474, %r2434, %r2851;
fma.rn.f32 %r2852, %r1473, %r2435, %r2852;
fma.rn.f32 %r2853, %r1472, %r2436, %r2853;
add.f32 %r2854, %r2854, %r2373;
add.f32 %r2855, %r2855, %r2374;
add.f32 %r2856, %r2856, %r2375;
add.f32 %r2857, %r2857, %r2376;
add.f32 %r2858, %r2858, %r2377;
add.f32 %r2859, %r2859, %r2378;
add.f32 %r2860, %r2860, %r2379;
add.f32 %r2861, %r2861, %r2380;
add.f32 %r2862, %r2862, %r2381;
add.f32 %r2863, %r2863, %r2382;
add.f32 %r2864, %r2864, %r2383;
add.f32 %r2865, %r2865, %r2384;
add.f32 %r2866, %r2866, %r2385;
add.f32 %r2867, %r2867, %r2386;
add.f32 %r2868, %r2868, %r2387;
add.f32 %r2869, %r2869, %r2388;
add.f32 %r2870, %r2870, %r2389;
add.f32 %r2871, %r2871, %r2390;
add.f32 %r2872, %r2872, %r2391;
add.f32 %r2873, %r2873, %r2392;
add.f32 %r2874, %r2874, %r2393;
add.f32 %r2875, %r2875, %r2394;
add.f32 %r2876, %r2876, %r2395;
add.f32 %r2877, %r2877, %r2396;
add.f32 %r2878, %r2878, %r2397;
add.f32 %r2879, %r2879, %r2398;
add.f32 %r2880, %r2880, %r2399;
add.f32 %r2881, %r2881, %r2400;
add.f32 %r2882, %r2882, %r2401;
add.f32 %r2883, %r2883, %r2402;
add.f32 %r2884, %r2884, %r2403;
add.f32 %r2885, %r2885, %r2404;
add.f32 %r2886, %r2886, %r2405;
add.f32 %r2887, %r2887, %r2406;
add.f32 %r2888, %r2888, %r2407;
add.f32 %r2889, %r2889, %r2408;
add.f32 %r2890, %r2890, %r2409;
add.f32 %r2891, %r2891, %r2410;
add.f32 %r2892, %r2892, %r2411;
add.f32 %r2893, %r2893, %r2412;
add.f32 %r2894, %r2894, %r2413;
add.f32 %r2895, %r2895, %r2414;
add.f32 %r2896, %r2896, %r2415;
add.f32 %r2897, %r2897, %r2416;
add.f32 %r2898, %r2898, %r2417;
add.f32 %r2899, %r2899, %r2418;
add.f32 %r2900, %r2900, %r2419;
add.f32 %r2901, %r2901, %r2420;
add.f32 %r2902, %r2902, %r2421;
add.f32 %r2903, %r2903, %r2422;
add.f32 %r2904, %r2904, %r2423;
add.f32 %r2905, %r2905, %r2424;
add.f32 %r2906, %r2906, %r2425;
add.f32 %r2907, %r2907, %r2426;
add.f32 %r2908, %r2908, %r2427;
add.f32 %r2909, %r2909, %r2428;
add.f32 %r2910, %r2910, %r2429;
add.f32 %r2911, %r2911, %r2430;
add.f32 %r2912, %r2912, %r2431;
add.f32 %r2913, %r2913, %r2432;
add.f32 %r2914, %r2914, %r2433;
add.f32 %r2915, %r2915, %r2434;
add.f32 %r2916, %r2916, %r2435;
add.f32 %r2917, %r2917, %r2436;
.loc 1 133 20 // layer_norm.py:133:20
add.s64 %rd167, %rd167, 4;
.loc 1 134 20 // layer_norm.py:134:20
add.s64 %rd166, %rd166, 4;
.loc 1 115 30 // layer_norm.py:115:30
add.s64 %rd165, %rd165, %rd26;
add.s64 %rd164, %rd164, %rd29;
add.s64 %rd163, %rd163, %rd30;
add.s32 %r2661, %r2661, -1;
setp.ne.s32 %p72, %r2661, 0;
@%p72 bra $L__BB0_2;
$L__BB0_3: // %._crit_edge
.loc 1 104 18 // layer_norm.py:104:18
setp.lt.s32 %p88, %r2240, %r542;
setp.lt.s32 %p80, %r2241, %r542;
setp.lt.s32 %p73, %r2242, %r542;
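// Loop exit (layer_norm.py:138-139): store the accumulated per-program partial sums
// to the reduction buffers passed as params 5 and 6.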
.loc 1 138 37 // layer_norm.py:138:37
mul.lo.s32 %r2238, %r539, %r1;
.loc 1 138 22 // layer_norm.py:138:22
mul.wide.s32 %rd154, %r2238, 4;
add.s64 %rd155, %rd45, %rd154;
.loc 1 138 49 // layer_norm.py:138:49
shl.b64 %rd156, %rd1, 2;
add.s64 %rd122, %rd155, %rd156;
add.s64 %rd123, %rd122, 2048;
add.s64 %rd124, %rd122, 4096;
add.s64 %rd125, %rd122, 6144;
add.s64 %rd126, %rd122, 8192;
add.s64 %rd127, %rd122, 10240;
add.s64 %rd128, %rd122, 12288;
add.s64 %rd129, %rd155, %rd162;
add.s64 %rd130, %rd122, 16384;
add.s64 %rd131, %rd122, 18432;
add.s64 %rd132, %rd122, 20480;
add.s64 %rd133, %rd122, 22528;
add.s64 %rd134, %rd122, 24576;
add.s64 %rd135, %rd122, 26624;
add.s64 %rd136, %rd122, 28672;
add.s64 %rd137, %rd155, %rd161;
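// Predicated v4.b32 (4 x fp32) stores of the first accumulator set
// (likely the dW partials) into the buffer from param_5 (%rd45): one
// 16-byte store per 512-float column chunk, masked by %p73..%p88.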
.loc 1 138 55 // layer_norm.py:138:55
// begin inline asm
@%p73 st.global.v4.b32 [ %rd122 + 0 ], { %r2790, %r2791, %r2792, %r2793 };
// end inline asm
// begin inline asm
@%p74 st.global.v4.b32 [ %rd123 + 0 ], { %r2794, %r2795, %r2796, %r2797 };
// end inline asm
// begin inline asm
@%p75 st.global.v4.b32 [ %rd124 + 0 ], { %r2798, %r2799, %r2800, %r2801 };
// end inline asm
// begin inline asm
@%p76 st.global.v4.b32 [ %rd125 + 0 ], { %r2802, %r2803, %r2804, %r2805 };
// end inline asm
// begin inline asm
@%p77 st.global.v4.b32 [ %rd126 + 0 ], { %r2806, %r2807, %r2808, %r2809 };
// end inline asm
// begin inline asm
@%p78 st.global.v4.b32 [ %rd127 + 0 ], { %r2810, %r2811, %r2812, %r2813 };
// end inline asm
// begin inline asm
@%p79 st.global.v4.b32 [ %rd128 + 0 ], { %r2814, %r2815, %r2816, %r2817 };
// end inline asm
// begin inline asm
@%p80 st.global.v4.b32 [ %rd129 + 0 ], { %r2818, %r2819, %r2820, %r2821 };
// end inline asm
// begin inline asm
@%p81 st.global.v4.b32 [ %rd130 + 0 ], { %r2822, %r2823, %r2824, %r2825 };
// end inline asm
// begin inline asm
@%p82 st.global.v4.b32 [ %rd131 + 0 ], { %r2826, %r2827, %r2828, %r2829 };
// end inline asm
// begin inline asm
@%p83 st.global.v4.b32 [ %rd132 + 0 ], { %r2830, %r2831, %r2832, %r2833 };
// end inline asm
// begin inline asm
@%p84 st.global.v4.b32 [ %rd133 + 0 ], { %r2834, %r2835, %r2836, %r2837 };
// end inline asm
// begin inline asm
@%p85 st.global.v4.b32 [ %rd134 + 0 ], { %r2838, %r2839, %r2840, %r2841 };
// end inline asm
// begin inline asm
@%p86 st.global.v4.b32 [ %rd135 + 0 ], { %r2842, %r2843, %r2844, %r2845 };
// end inline asm
// begin inline asm
@%p87 st.global.v4.b32 [ %rd136 + 0 ], { %r2846, %r2847, %r2848, %r2849 };
// end inline asm
// begin inline asm
@%p88 st.global.v4.b32 [ %rd137 + 0 ], { %r2850, %r2851, %r2852, %r2853 };
// end inline asm
.loc 1 139 37 // layer_norm.py:139:37
mul.lo.s32 %r2239, %r540, %r1;
.loc 1 139 22 // layer_norm.py:139:22
mul.wide.s32 %rd159, %r2239, 4;
add.s64 %rd160, %rd46, %rd159;
.loc 1 139 49 // layer_norm.py:139:49
add.s64 %rd138, %rd160, %rd156;
add.s64 %rd139, %rd138, 2048;
add.s64 %rd140, %rd138, 4096;
add.s64 %rd141, %rd138, 6144;
add.s64 %rd142, %rd138, 8192;
add.s64 %rd143, %rd138, 10240;
add.s64 %rd144, %rd138, 12288;
add.s64 %rd145, %rd160, %rd162;
add.s64 %rd146, %rd138, 16384;
add.s64 %rd147, %rd138, 18432;
add.s64 %rd148, %rd138, 20480;
add.s64 %rd149, %rd138, 22528;
add.s64 %rd150, %rd138, 24576;
add.s64 %rd151, %rd138, 26624;
add.s64 %rd152, %rd138, 28672;
add.s64 %rd153, %rd160, %rd161;
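// Second set of predicated stores, same addressing pattern, into the
// buffer from param_6 (%rd46); likely the dB partials.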
.loc 1 139 55 // layer_norm.py:139:55
// begin inline asm
@%p73 st.global.v4.b32 [ %rd138 + 0 ], { %r2854, %r2855, %r2856, %r2857 };
// end inline asm
// begin inline asm
@%p74 st.global.v4.b32 [ %rd139 + 0 ], { %r2858, %r2859, %r2860, %r2861 };
// end inline asm
// begin inline asm
@%p75 st.global.v4.b32 [ %rd140 + 0 ], { %r2862, %r2863, %r2864, %r2865 };
// end inline asm
// begin inline asm
@%p76 st.global.v4.b32 [ %rd141 + 0 ], { %r2866, %r2867, %r2868, %r2869 };
// end inline asm
// begin inline asm
@%p77 st.global.v4.b32 [ %rd142 + 0 ], { %r2870, %r2871, %r2872, %r2873 };
// end inline asm
// begin inline asm
@%p78 st.global.v4.b32 [ %rd143 + 0 ], { %r2874, %r2875, %r2876, %r2877 };
// end inline asm
// begin inline asm
@%p79 st.global.v4.b32 [ %rd144 + 0 ], { %r2878, %r2879, %r2880, %r2881 };
// end inline asm
// begin inline asm
@%p80 st.global.v4.b32 [ %rd145 + 0 ], { %r2882, %r2883, %r2884, %r2885 };
// end inline asm
// begin inline asm
@%p81 st.global.v4.b32 [ %rd146 + 0 ], { %r2886, %r2887, %r2888, %r2889 };
// end inline asm
// begin inline asm
@%p82 st.global.v4.b32 [ %rd147 + 0 ], { %r2890, %r2891, %r2892, %r2893 };
// end inline asm
// begin inline asm
@%p83 st.global.v4.b32 [ %rd148 + 0 ], { %r2894, %r2895, %r2896, %r2897 };
// end inline asm
// begin inline asm
@%p84 st.global.v4.b32 [ %rd149 + 0 ], { %r2898, %r2899, %r2900, %r2901 };
// end inline asm
// begin inline asm
@%p85 st.global.v4.b32 [ %rd150 + 0 ], { %r2902, %r2903, %r2904, %r2905 };
// end inline asm
// begin inline asm
@%p86 st.global.v4.b32 [ %rd151 + 0 ], { %r2906, %r2907, %r2908, %r2909 };
// end inline asm
// begin inline asm
@%p87 st.global.v4.b32 [ %rd152 + 0 ], { %r2910, %r2911, %r2912, %r2913 };
// end inline asm
// begin inline asm
@%p88 st.global.v4.b32 [ %rd153 + 0 ], { %r2914, %r2915, %r2916, %r2917 };
// end inline asm
.loc 1 139 4 // layer_norm.py:139:4
ret;
$L__tmp7:
$L__func_end0:
// -- End function
}
.file 1 "/home/dberard/local/pytorch-env7/Liger-Kernel/src/liger_kernel/ops/layer_norm.py"
.file 2 "/home/dberard/local/pytorch-env7/triton/python/triton/language/standard.py"
.section .debug_abbrev
{
.b8 1 // Abbreviation Code
.b8 17 // DW_TAG_compile_unit
.b8 1 // DW_CHILDREN_yes
.b8 37 // DW_AT_producer
.b8 8 // DW_FORM_string
.b8 19 // DW_AT_language
.b8 5 // DW_FORM_data2
.b8 3 // DW_AT_name
.b8 8 // DW_FORM_string
.b8 16 // DW_AT_stmt_list
.b8 6 // DW_FORM_data4
.b8 27 // DW_AT_comp_dir
.b8 8 // DW_FORM_string
.b8 0 // EOM(1)
.b8 0 // EOM(2)
.b8 2 // Abbreviation Code
.b8 46 // DW_TAG_subprogram
.b8 0 // DW_CHILDREN_no
.b8 3 // DW_AT_name
.b8 8 // DW_FORM_string
.b8 32 // DW_AT_inline
.b8 11 // DW_FORM_data1
.b8 0 // EOM(1)
.b8 0 // EOM(2)
.b8 3 // Abbreviation Code
.b8 46 // DW_TAG_subprogram
.b8 1 // DW_CHILDREN_yes
.b8 17 // DW_AT_low_pc
.b8 1 // DW_FORM_addr
.b8 18 // DW_AT_high_pc
.b8 1 // DW_FORM_addr
.b8 49 // DW_AT_abstract_origin
.b8 19 // DW_FORM_ref4
.b8 0 // EOM(1)
.b8 0 // EOM(2)
.b8 4 // Abbreviation Code
.b8 29 // DW_TAG_inlined_subroutine
.b8 0 // DW_CHILDREN_no
.b8 49 // DW_AT_abstract_origin
.b8 19 // DW_FORM_ref4
.b8 17 // DW_AT_low_pc
.b8 1 // DW_FORM_addr
.b8 18 // DW_AT_high_pc
.b8 1 // DW_FORM_addr
.b8 88 // DW_AT_call_file
.b8 11 // DW_FORM_data1
.b8 89 // DW_AT_call_line
.b8 11 // DW_FORM_data1
.b8 87 // DW_AT_call_column
.b8 11 // DW_FORM_data1
.b8 0 // EOM(1)
.b8 0 // EOM(2)
.b8 0 // EOM(3)
}
.section .debug_info
{
.b32 203 // Length of Unit
.b8 2 // DWARF version number
.b8 0
.b32 .debug_abbrev // Offset Into Abbrev. Section
.b8 8 // Address Size (in bytes)
.b8 1 // Abbrev [1] 0xb:0xc4 DW_TAG_compile_unit
.b8 116 // DW_AT_producer
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 0
.b8 2 // DW_AT_language
.b8 0
.b8 108 // DW_AT_name
.b8 97
.b8 121
.b8 101
.b8 114
.b8 95
.b8 110
.b8 111
.b8 114
.b8 109
.b8 46
.b8 112
.b8 121
.b8 0
.b32 .debug_line // DW_AT_stmt_list
.b8 47 // DW_AT_comp_dir
.b8 104
.b8 111
.b8 109
.b8 101
.b8 47
.b8 100
.b8 98
.b8 101
.b8 114
.b8 97
.b8 114
.b8 100
.b8 47
.b8 108
.b8 111
.b8 99
.b8 97
.b8 108
.b8 47
.b8 112
.b8 121
.b8 116
.b8 111
.b8 114
.b8 99
.b8 104
.b8 45
.b8 101
.b8 110
.b8 118
.b8 55
.b8 47
.b8 76
.b8 105
.b8 103
.b8 101
.b8 114
.b8 45
.b8 75
.b8 101
.b8 114
.b8 110
.b8 101
.b8 108
.b8 47
.b8 115
.b8 114
.b8 99
.b8 47
.b8 108
.b8 105
.b8 103
.b8 101
.b8 114
.b8 95
.b8 107
.b8 101
.b8 114
.b8 110
.b8 101
.b8 108
.b8 47
.b8 111
.b8 112
.b8 115
.b8 0
.b8 2 // Abbrev [2] 0x6a:0x1e DW_TAG_subprogram
.b8 95 // DW_AT_name
.b8 108
.b8 97
.b8 121
.b8 101
.b8 114
.b8 95
.b8 110
.b8 111
.b8 114
.b8 109
.b8 95
.b8 98
.b8 97
.b8 99
.b8 107
.b8 119
.b8 97
.b8 114
.b8 100
.b8 95
.b8 107
.b8 101
.b8 114
.b8 110
.b8 101
.b8 108
.b8 0
.b8 1 // DW_AT_inline
.b8 3 // Abbrev [3] 0x88:0x46 DW_TAG_subprogram
.b64 $L__func_begin0 // DW_AT_low_pc
.b64 $L__func_end0 // DW_AT_high_pc
.b32 106 // DW_AT_abstract_origin
.b8 4 // Abbrev [4] 0x9d:0x18 DW_TAG_inlined_subroutine
.b32 106 // DW_AT_abstract_origin
.b64 $L__tmp1 // DW_AT_low_pc
.b64 $L__tmp4 // DW_AT_high_pc
.b8 1 // DW_AT_call_file
.b8 124 // DW_AT_call_line
.b8 20 // DW_AT_call_column
.b8 4 // Abbrev [4] 0xb5:0x18 DW_TAG_inlined_subroutine
.b32 106 // DW_AT_abstract_origin
.b64 $L__tmp5 // DW_AT_low_pc
.b64 $L__tmp6 // DW_AT_high_pc
.b8 1 // DW_AT_call_file
.b8 125 // DW_AT_call_line
.b8 20 // DW_AT_call_column
.b8 0 // End Of Children Mark
.b8 0 // End Of Children Mark
}
.section .debug_macinfo { }