Skip to content

Instantly share code, notes, and snippets.

@Groovounet
Created February 14, 2018 17:04
Show Gist options
  • Save Groovounet/01aa31a61ea81f9f3009a9e5ec7cddb1 to your computer and use it in GitHub Desktop.
Save Groovounet/01aa31a61ea81f9f3009a9e5ec7cddb1 to your computer and use it in GitHub Desktop.
Independent texture fetches with ASM
#version 430 core
// Interface of a fragment shader that reads FETCH_COUNT separately bound 2D textures.
// Number of samplers declared in the Texture array below.
#define FETCH_COUNT 16
// Location used for the fragment color output.
#define FRAG_COLOR 0
// Scale factor (1/2048) applied to window-space coordinates.
// NOTE(review): presumably the render target is 2048 pixels across so the product lands in [0, 1) - confirm.
#define NORMALIZE_COORD float(1.0 / 2048.0)
// Array of FETCH_COUNT samplers; with binding = 0 the elements occupy consecutive units starting at unit 0.
layout(binding = 0) uniform sampler2D Texture[FETCH_COUNT];
// Explicit redeclaration of the built-in window-space fragment position.
in vec4 gl_FragCoord;
// Fragment output bound to color location FRAG_COLOR, index 0.
layout(location = FRAG_COLOR, index = 0) out vec4 Color;
// Writes the average of the .xy channels of all FETCH_COUNT textures, sampled at
// the same coordinate, into Color.xy; Color.zw is fixed at (0.0, 1.0).
void main()
{
    // Scale the window-space fragment position to get the shared lookup coordinate.
    vec2 Uv = NORMALIZE_COORD * gl_FragCoord.xy;

    // Each fetch uses the same coordinate and none consumes another fetch's result.
    vec2 Sum = vec2(0);
    for(int Index = 0; Index < FETCH_COUNT; ++Index)
    {
        Sum += texture(Texture[Index], Uv).xy;
    }

    // Divide the accumulated value by the number of fetches.
    Color = vec4(Sum * (1.0 / float(FETCH_COUNT)), 0.0, 1.0);
}
1 | 3 | :: : | label_basic_block_1: s_mov_b64 s[56:57], exec
2 | 3 | :: : | s_wqm_b64 exec, exec
3 | 3 | :: : | s_mov_b32 s0, s1
4 | 3 | :: : | s_movk_i32 s1, 0x0000
5 | 3 | :: : | s_movk_i32 s3, 0x0000
6 | 3 | :: : | s_load_dwordx8 s[8:15], s[0:1], 0x00
7 | 3 | :: : | s_load_dwordx8 s[16:23], s[2:3], 0x00
8 | 3 | :: : | s_load_dwordx8 s[24:31], s[0:1], 0x20
9 | 3 | :: : | s_load_dwordx8 s[32:39], s[0:1], 0x40
10 | 3 | :: : | s_load_dwordx8 s[40:47], s[2:3], 0x20
11 | 3 | :: : | s_load_dwordx8 s[48:55], s[0:1], 0x60
12 | 3 | :: : | s_andn2_b32 s5, s5, 0x3fff0000
13 | 4 | ^ :: : | v_mov_b32 v0, 0
14 | 5 | :^:: : | v_mov_b32 v1, 1.0
15 | 5 | :::: : | s_buffer_load_dwordx4 s[4:7], s[4:7], 0x10
16 | 5 | :::: : | s_waitcnt lgkmcnt(0)
17 | 5 | ::x: : | v_add_f32 v2, s4, v2
18 | 6 | ::::^: | v_mov_b32 v4, s5
19 | 6 | :::xv: | v_mad_legacy_f32 v3, v3, s6, v4
20 | 6 | ::x::: | v_mul_f32 v2, 0x3a000000, v2
21 | 6 | :::x:: | v_mul_f32 v3, 0x3a000000, v3
22 | 6 | ::vvxx | image_sample v[4:5], v[2:5], s[8:15], s[16:19]
23 | 6 | :::::: | s_nop 0x0000
24 | 8 | ::vvvv^^ | image_sample v[6:7], v[2:5], s[24:31], s[20:23]
25 | 8 | :::::::: | s_nop 0x0000
26 | 10 | ::vvvv::^^ | image_sample v[8:9], v[2:5], s[32:39], s[40:43]
27 | 10 | :::::::::: | s_nop 0x0000
28 | 12 | ::vvvv::::^^ | image_sample v[10:11], v[2:5], s[48:55], s[44:47]
29 | 12 | :::::::::::: | s_load_dwordx8 s[4:11], s[0:1], 0x80
30 | 12 | :::::::::::: | s_load_dwordx8 s[12:19], s[2:3], 0x40
31 | 12 | :::::::::::: | s_load_dwordx8 s[20:27], s[0:1], 0xa0
32 | 12 | :::::::::::: | s_load_dwordx8 s[28:35], s[0:1], 0xc0
33 | 12 | :::::::::::: | s_load_dwordx8 s[36:43], s[2:3], 0x60
34 | 12 | :::::::::::: | s_load_dwordx8 s[44:51], s[0:1], 0xe0
35 | 12 | :::::::::::: | s_waitcnt lgkmcnt(0)
36 | 14 | ::vvvv::::::^^ | image_sample v[12:13], v[2:5], s[4:11], s[12:15]
37 | 14 | :::::::::::::: | s_nop 0x0000
38 | 16 | ::vvvv::::::::^^ | image_sample v[14:15], v[2:5], s[20:27], s[16:19]
39 | 16 | :::::::::::::::: | s_nop 0x0000
40 | 18 | ::vvvv::::::::::^^ | image_sample v[16:17], v[2:5], s[28:35], s[36:39]
41 | 18 | :::::::::::::::::: | s_nop 0x0000
42 | 20 | ::vvvv::::::::::::^^ | image_sample v[18:19], v[2:5], s[44:51], s[40:43]
43 | 20 | :::::::::::::::::::: | s_load_dwordx8 s[4:11], s[0:1], 0x100
44 | 20 | :::::::::::::::::::: | s_load_dwordx8 s[12:19], s[2:3], 0x80
45 | 20 | :::::::::::::::::::: | s_load_dwordx8 s[20:27], s[0:1], 0x120
46 | 20 | :::::::::::::::::::: | s_load_dwordx8 s[28:35], s[0:1], 0x140
47 | 20 | :::::::::::::::::::: | s_load_dwordx8 s[36:43], s[2:3], 0xa0
48 | 20 | :::::::::::::::::::: | s_load_dwordx8 s[44:51], s[0:1], 0x160
49 | 20 | :::::::::::::::::::: | s_waitcnt lgkmcnt(0)
50 | 22 | ::vvvv::::::::::::::^^ | image_sample v[20:21], v[2:5], s[4:11], s[12:15]
51 | 22 | :::::::::::::::::::::: | s_nop 0x0000
52 | 24 | ::vvvv::::::::::::::::^^ | image_sample v[22:23], v[2:5], s[20:27], s[16:19]
53 | 24 | :::::::::::::::::::::::: | s_nop 0x0000
54 | 26 | ::vvvv::::::::::::::::::^^ | image_sample v[24:25], v[2:5], s[28:35], s[36:39]
55 | 26 | :::::::::::::::::::::::::: | s_nop 0x0000
56 | 28 | ::vvvv::::::::::::::::::::^^ | image_sample v[26:27], v[2:5], s[44:51], s[40:43]
57 | 28 | :::::::::::::::::::::::::::: | s_load_dwordx8 s[4:11], s[0:1], 0x180
58 | 28 | :::::::::::::::::::::::::::: | s_load_dwordx8 s[12:19], s[2:3], 0xc0
59 | 28 | :::::::::::::::::::::::::::: | s_load_dwordx8 s[20:27], s[0:1], 0x1a0
60 | 28 | :::::::::::::::::::::::::::: | s_load_dwordx8 s[28:35], s[0:1], 0x1c0
61 | 28 | :::::::::::::::::::::::::::: | s_load_dwordx8 s[36:43], s[2:3], 0xe0
62 | 28 | :::::::::::::::::::::::::::: | s_load_dwordx8 s[44:51], s[0:1], 0x1e0
63 | 28 | :::::::::::::::::::::::::::: | s_waitcnt lgkmcnt(0)
64 | 30 | ::vvvv::::::::::::::::::::::^^ | image_sample v[28:29], v[2:5], s[4:11], s[12:15]
65 | 30 | :::::::::::::::::::::::::::::: | s_nop 0x0000
66 | 32 | ::vvvv::::::::::::::::::::::::^^ | image_sample v[30:31], v[2:5], s[20:27], s[16:19]
67 | 32 | :::::::::::::::::::::::::::::::: | s_nop 0x0000
68 | 34 | ::vvvv::::::::::::::::::::::::::^^ | image_sample v[32:33], v[2:5], s[28:35], s[36:39]
69 | 34 | :::::::::::::::::::::::::::::::::: | s_nop 0x0000
70 | 34 | ::xxvv:::::::::::::::::::::::::::: | image_sample v[2:3], v[2:5], s[44:51], s[40:43]
71 | 34 | :::::::::::::::::::::::::::::::::: | s_waitcnt vmcnt(14)
72 | 34 | ::::x:v::::::::::::::::::::::::::: | v_add_f32 v4, v4, v6
73 | 33 | :::::x v:::::::::::::::::::::::::: | v_add_f32 v5, v5, v7
74 | 32 | :::::: :::::::::::::::::::::::::: | s_waitcnt vmcnt(13)
75 | 32 | ::::x: v::::::::::::::::::::::::: | v_add_f32 v4, v4, v8
76 | 31 | :::::x v:::::::::::::::::::::::: | v_add_f32 v5, v5, v9
77 | 30 | :::::: :::::::::::::::::::::::: | s_waitcnt vmcnt(12)
78 | 30 | ::::x: v::::::::::::::::::::::: | v_add_f32 v4, v4, v10
79 | 29 | :::::x v:::::::::::::::::::::: | v_add_f32 v5, v5, v11
80 | 28 | :::::: :::::::::::::::::::::: | s_waitcnt vmcnt(11)
81 | 28 | ::::x: v::::::::::::::::::::: | v_add_f32 v4, v4, v12
82 | 27 | :::::x v:::::::::::::::::::: | v_add_f32 v5, v5, v13
83 | 26 | :::::: :::::::::::::::::::: | s_waitcnt vmcnt(10)
84 | 26 | ::::x: v::::::::::::::::::: | v_add_f32 v4, v4, v14
85 | 25 | :::::x v:::::::::::::::::: | v_add_f32 v5, v5, v15
86 | 24 | :::::: :::::::::::::::::: | s_waitcnt vmcnt(9)
87 | 24 | ::::x: v::::::::::::::::: | v_add_f32 v4, v4, v16
88 | 23 | :::::x v:::::::::::::::: | v_add_f32 v5, v5, v17
89 | 22 | :::::: :::::::::::::::: | s_waitcnt vmcnt(8)
90 | 22 | ::::x: v::::::::::::::: | v_add_f32 v4, v4, v18
91 | 21 | :::::x v:::::::::::::: | v_add_f32 v5, v5, v19
92 | 20 | :::::: :::::::::::::: | s_waitcnt vmcnt(7)
93 | 20 | ::::x: v::::::::::::: | v_add_f32 v4, v4, v20
94 | 19 | :::::x v:::::::::::: | v_add_f32 v5, v5, v21
95 | 18 | :::::: :::::::::::: | s_waitcnt vmcnt(6)
96 | 18 | ::::x: v::::::::::: | v_add_f32 v4, v4, v22
97 | 17 | :::::x v:::::::::: | v_add_f32 v5, v5, v23
98 | 16 | :::::: :::::::::: | s_waitcnt vmcnt(5)
99 | 16 | ::::x: v::::::::: | v_add_f32 v4, v4, v24
100 | 15 | :::::x v:::::::: | v_add_f32 v5, v5, v25
101 | 14 | :::::: :::::::: | s_waitcnt vmcnt(4)
102 | 14 | ::::x: v::::::: | v_add_f32 v4, v4, v26
103 | 13 | :::::x v:::::: | v_add_f32 v5, v5, v27
104 | 12 | :::::: :::::: | s_waitcnt vmcnt(3)
105 | 12 | ::::x: v::::: | v_add_f32 v4, v4, v28
106 | 11 | :::::x v:::: | v_add_f32 v5, v5, v29
107 | 10 | :::::: :::: | s_waitcnt vmcnt(2)
108 | 10 | ::::x: v::: | v_add_f32 v4, v4, v30
109 | 9 | :::::x v:: | v_add_f32 v5, v5, v31
110 | 8 | :::::: :: | s_waitcnt vmcnt(1)
111 | 8 | ::::x: v: | v_add_f32 v4, v4, v32
112 | 7 | :::::x v | v_add_f32 v5, v5, v33
113 | 6 | :::::: | s_waitcnt vmcnt(0)
114 | 6 | ::x:v: | v_add_f32 v2, v4, v2
115 | 5 | :::x v | v_add_f32 v3, v5, v3
116 | 4 | ::x: | v_mul_f32 v2, 0x3d800000, v2
117 | 4 | :::x | v_mul_f32 v3, 0x3d800000, v3
118 | 4 | :::: | s_mov_b64 exec, s[56:57]
119 | 4 | ::xv | v_cvt_pkrtz_f16_f32 v2, v2, v3
120 | 3 | xv: | v_cvt_pkrtz_f16_f32 v0, v0, v1
121 | 2 | v v | exp mrt0, v2, v2, v0, v0
122 | 0 | | s_endpgm
Maximum # VGPR used 34, # VGPR allocated: 34
// Disassembly of the pixel shader for the VI ASIC: 16 two-channel image samples
// are issued back to back and then summed as their results arrive.
shader main
asic(VI)
type(PS)
// s_ps_state in s0
// Save the incoming execution mask in s[56:57], then switch exec to whole-quad mode.
s_mov_b64 s[56:57], exec // 000000000000: BEB8017E
s_wqm_b64 exec, exec // 000000000004: BEFE077E
// Form two 64-bit base addresses with zeroed upper halves: s[0:1] (low dword taken from s1) and s[2:3].
s_mov_b32 s0, s1 // 000000000008: BE800001
s_movk_i32 s1, 0x0000 // 00000000000C: B0010000
s_movk_i32 s3, 0x0000 // 000000000010: B0030000
// Descriptor loads for samples 0-3. Loads from s[0:1] (stride 0x20) feed the 8-register
// resource operand of image_sample; each 8-dword load from s[2:3] feeds the 4-register
// sampler operand of two image_sample instructions.
s_load_dwordx8 s[8:15], s[0:1], 0x00 // 000000000014: C00E0200 00000000
s_load_dwordx8 s[16:23], s[2:3], 0x00 // 00000000001C: C00E0401 00000000
s_load_dwordx8 s[24:31], s[0:1], 0x20 // 000000000024: C00E0600 00000020
s_load_dwordx8 s[32:39], s[0:1], 0x40 // 00000000002C: C00E0800 00000040
s_load_dwordx8 s[40:47], s[2:3], 0x20 // 000000000034: C00E0A01 00000020
s_load_dwordx8 s[48:55], s[0:1], 0x60 // 00000000003C: C00E0C00 00000060
// Clear bits 16-29 of s5 before s[4:7] is used as the buffer descriptor below.
// NOTE(review): presumably this zeroes the descriptor's stride field - confirm against the ISA manual.
s_andn2_b32 s5, s5, 0x3fff0000 // 000000000044: 8905FF05 3FFF0000
// v0 = 0 and v1 = 1.0 are the constant third and fourth output components, packed just before the export.
v_mov_b32 v0, 0 // 00000000004C: 7E000280
v_mov_b32 v1, 1.0 // 000000000050: 7E0202F2
// Load four constant dwords into s[4:7], replacing the buffer descriptor that was held there.
s_buffer_load_dwordx4 s[4:7], s[4:7], 0x10 // 000000000054: C02A0102 00000010
// Wait for all scalar loads issued so far.
s_waitcnt lgkmcnt(0) // 00000000005C: BF8C007F
// Coordinate setup: v2 = s4 + v2, v3 = v3 * s6 + s5, then both scaled by 0x3a000000 (1/2048, NORMALIZE_COORD).
// NOTE(review): v2/v3 are read before being written, so they presumably arrive holding the pixel position - confirm.
v_add_f32 v2, s4, v2 // 000000000060: 02040404
v_mov_b32 v4, s5 // 000000000064: 7E080205
v_mad_legacy_f32 v3, v3, s6, v4 // 000000000068: D1C00003 04100D03
v_mul_f32 v2, 0x3a000000, v2 // 000000000070: 0A0404FF 3A000000
v_mul_f32 v3, 0x3a000000, v3 // 000000000078: 0A0606FF 3A000000
// Samples 0-3: dmask:0x3 returns two channels, each sample writes its own VGPR pair,
// and no vmcnt wait separates the samples.
image_sample v[4:5], v[2:5], s[8:15], s[16:19] dmask:0x3 // 000000000080: F0800300 00820402
s_nop 0x0000 // 000000000088: BF800000
image_sample v[6:7], v[2:5], s[24:31], s[20:23] dmask:0x3 // 00000000008C: F0800300 00A60602
s_nop 0x0000 // 000000000094: BF800000
image_sample v[8:9], v[2:5], s[32:39], s[40:43] dmask:0x3 // 000000000098: F0800300 01480802
s_nop 0x0000 // 0000000000A0: BF800000
image_sample v[10:11], v[2:5], s[48:55], s[44:47] dmask:0x3 // 0000000000A4: F0800300 016C0A02
// Descriptor loads for samples 4-7 (s[0:1] offsets 0x80-0xe0, s[2:3] offsets 0x40-0x60), reusing s[4:51].
s_load_dwordx8 s[4:11], s[0:1], 0x80 // 0000000000AC: C00E0100 00000080
s_load_dwordx8 s[12:19], s[2:3], 0x40 // 0000000000B4: C00E0301 00000040
s_load_dwordx8 s[20:27], s[0:1], 0xa0 // 0000000000BC: C00E0500 000000A0
s_load_dwordx8 s[28:35], s[0:1], 0xc0 // 0000000000C4: C00E0700 000000C0
s_load_dwordx8 s[36:43], s[2:3], 0x60 // 0000000000CC: C00E0901 00000060
s_load_dwordx8 s[44:51], s[0:1], 0xe0 // 0000000000D4: C00E0B00 000000E0
// Only the scalar loads are waited on here; the earlier samples are not.
s_waitcnt lgkmcnt(0) // 0000000000DC: BF8C007F
// Samples 4-7.
image_sample v[12:13], v[2:5], s[4:11], s[12:15] dmask:0x3 // 0000000000E0: F0800300 00610C02
s_nop 0x0000 // 0000000000E8: BF800000
image_sample v[14:15], v[2:5], s[20:27], s[16:19] dmask:0x3 // 0000000000EC: F0800300 00850E02
s_nop 0x0000 // 0000000000F4: BF800000
image_sample v[16:17], v[2:5], s[28:35], s[36:39] dmask:0x3 // 0000000000F8: F0800300 01271002
s_nop 0x0000 // 000000000100: BF800000
image_sample v[18:19], v[2:5], s[44:51], s[40:43] dmask:0x3 // 000000000104: F0800300 014B1202
// Descriptor loads for samples 8-11 (s[0:1] offsets 0x100-0x160, s[2:3] offsets 0x80-0xa0).
s_load_dwordx8 s[4:11], s[0:1], 0x100 // 00000000010C: C00E0100 00000100
s_load_dwordx8 s[12:19], s[2:3], 0x80 // 000000000114: C00E0301 00000080
s_load_dwordx8 s[20:27], s[0:1], 0x120 // 00000000011C: C00E0500 00000120
s_load_dwordx8 s[28:35], s[0:1], 0x140 // 000000000124: C00E0700 00000140
s_load_dwordx8 s[36:43], s[2:3], 0xa0 // 00000000012C: C00E0901 000000A0
s_load_dwordx8 s[44:51], s[0:1], 0x160 // 000000000134: C00E0B00 00000160
s_waitcnt lgkmcnt(0) // 00000000013C: BF8C007F
// Samples 8-11.
image_sample v[20:21], v[2:5], s[4:11], s[12:15] dmask:0x3 // 000000000140: F0800300 00611402
s_nop 0x0000 // 000000000148: BF800000
image_sample v[22:23], v[2:5], s[20:27], s[16:19] dmask:0x3 // 00000000014C: F0800300 00851602
s_nop 0x0000 // 000000000154: BF800000
image_sample v[24:25], v[2:5], s[28:35], s[36:39] dmask:0x3 // 000000000158: F0800300 01271802
s_nop 0x0000 // 000000000160: BF800000
image_sample v[26:27], v[2:5], s[44:51], s[40:43] dmask:0x3 // 000000000164: F0800300 014B1A02
// Descriptor loads for samples 12-15 (s[0:1] offsets 0x180-0x1e0, s[2:3] offsets 0xc0-0xe0).
s_load_dwordx8 s[4:11], s[0:1], 0x180 // 00000000016C: C00E0100 00000180
s_load_dwordx8 s[12:19], s[2:3], 0xc0 // 000000000174: C00E0301 000000C0
s_load_dwordx8 s[20:27], s[0:1], 0x1a0 // 00000000017C: C00E0500 000001A0
s_load_dwordx8 s[28:35], s[0:1], 0x1c0 // 000000000184: C00E0700 000001C0
s_load_dwordx8 s[36:43], s[2:3], 0xe0 // 00000000018C: C00E0901 000000E0
s_load_dwordx8 s[44:51], s[0:1], 0x1e0 // 000000000194: C00E0B00 000001E0
s_waitcnt lgkmcnt(0) // 00000000019C: BF8C007F
// Samples 12-15. The last one writes v[2:3], reusing the coordinate registers as its destination.
image_sample v[28:29], v[2:5], s[4:11], s[12:15] dmask:0x3 // 0000000001A0: F0800300 00611C02
s_nop 0x0000 // 0000000001A8: BF800000
image_sample v[30:31], v[2:5], s[20:27], s[16:19] dmask:0x3 // 0000000001AC: F0800300 00851E02
s_nop 0x0000 // 0000000001B4: BF800000
image_sample v[32:33], v[2:5], s[28:35], s[36:39] dmask:0x3 // 0000000001B8: F0800300 01272002
s_nop 0x0000 // 0000000001C0: BF800000
image_sample v[2:3], v[2:5], s[44:51], s[40:43] dmask:0x3 // 0000000001C4: F0800300 014B0202
// Accumulation: v[4:5] holds the running sum. Each s_waitcnt vmcnt(N) lets at most N samples
// remain outstanding, so the wait count drops by one before each further result is added.
s_waitcnt vmcnt(14) // 0000000001CC: BF8C0F7E
v_add_f32 v4, v4, v6 // 0000000001D0: 02080D04
v_add_f32 v5, v5, v7 // 0000000001D4: 020A0F05
s_waitcnt vmcnt(13) // 0000000001D8: BF8C0F7D
v_add_f32 v4, v4, v8 // 0000000001DC: 02081104
v_add_f32 v5, v5, v9 // 0000000001E0: 020A1305
s_waitcnt vmcnt(12) // 0000000001E4: BF8C0F7C
v_add_f32 v4, v4, v10 // 0000000001E8: 02081504
v_add_f32 v5, v5, v11 // 0000000001EC: 020A1705
s_waitcnt vmcnt(11) // 0000000001F0: BF8C0F7B
v_add_f32 v4, v4, v12 // 0000000001F4: 02081904
v_add_f32 v5, v5, v13 // 0000000001F8: 020A1B05
s_waitcnt vmcnt(10) // 0000000001FC: BF8C0F7A
v_add_f32 v4, v4, v14 // 000000000200: 02081D04
v_add_f32 v5, v5, v15 // 000000000204: 020A1F05
s_waitcnt vmcnt(9) // 000000000208: BF8C0F79
v_add_f32 v4, v4, v16 // 00000000020C: 02082104
v_add_f32 v5, v5, v17 // 000000000210: 020A2305
s_waitcnt vmcnt(8) // 000000000214: BF8C0F78
v_add_f32 v4, v4, v18 // 000000000218: 02082504
v_add_f32 v5, v5, v19 // 00000000021C: 020A2705
s_waitcnt vmcnt(7) // 000000000220: BF8C0F77
v_add_f32 v4, v4, v20 // 000000000224: 02082904
v_add_f32 v5, v5, v21 // 000000000228: 020A2B05
s_waitcnt vmcnt(6) // 00000000022C: BF8C0F76
v_add_f32 v4, v4, v22 // 000000000230: 02082D04
v_add_f32 v5, v5, v23 // 000000000234: 020A2F05
s_waitcnt vmcnt(5) // 000000000238: BF8C0F75
v_add_f32 v4, v4, v24 // 00000000023C: 02083104
v_add_f32 v5, v5, v25 // 000000000240: 020A3305
s_waitcnt vmcnt(4) // 000000000244: BF8C0F74
v_add_f32 v4, v4, v26 // 000000000248: 02083504
v_add_f32 v5, v5, v27 // 00000000024C: 020A3705
s_waitcnt vmcnt(3) // 000000000250: BF8C0F73
v_add_f32 v4, v4, v28 // 000000000254: 02083904
v_add_f32 v5, v5, v29 // 000000000258: 020A3B05
s_waitcnt vmcnt(2) // 00000000025C: BF8C0F72
v_add_f32 v4, v4, v30 // 000000000260: 02083D04
v_add_f32 v5, v5, v31 // 000000000264: 020A3F05
s_waitcnt vmcnt(1) // 000000000268: BF8C0F71
v_add_f32 v4, v4, v32 // 00000000026C: 02084104
v_add_f32 v5, v5, v33 // 000000000270: 020A4305
// Final sample (written to v[2:3]) is folded in last; the total ends up in v[2:3].
s_waitcnt vmcnt(0) // 000000000274: BF8C0F70
v_add_f32 v2, v4, v2 // 000000000278: 02040504
v_add_f32 v3, v5, v3 // 00000000027C: 02060705
// Scale the sum by 0x3d800000 (0.0625 = 1/16, i.e. 1 / FETCH_COUNT).
v_mul_f32 v2, 0x3d800000, v2 // 000000000280: 0A0404FF 3D800000
v_mul_f32 v3, 0x3d800000, v3 // 000000000288: 0A0606FF 3D800000
// Restore the execution mask saved at entry.
s_mov_b64 exec, s[56:57] // 000000000290: BEFE0138
// Pack (x, y) and (0, 1.0) into two half-float pairs and export them to render target 0 in compressed form.
v_cvt_pkrtz_f16_f32 v2, v2, v3 // 000000000294: D2960002 00020702
v_cvt_pkrtz_f16_f32 v0, v0, v1 // 00000000029C: D2960000 00020300
exp mrt0, v2, v2, v0, v0 done compr vm // 0000000002A4: C4001C0F 00000002
s_endpgm // 0000000002AC: BF810000
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment