Created
February 3, 2017 16:10
-
-
Save vedranmiletic/484d12f1170909ffbbf9d9c82ebb3f55 to your computer and use it in GitHub Desktop.
This file has been truncated, but you can view the full file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
:-) GROMACS - gmx mdrun, 2016.2-dev-20170105-4feb0be (-: | |
GROMACS is written by: | |
Emile Apol Rossen Apostolov Herman J.C. Berendsen Par Bjelkmar | |
Aldert van Buuren Rudi van Drunen Anton Feenstra Gerrit Groenhof | |
Christoph Junghans Anca Hamuraru Vincent Hindriksen Dimitrios Karkoulis | |
Peter Kasson Jiri Kraus Carsten Kutzner Per Larsson | |
Justin A. Lemkul Magnus Lundborg Pieter Meulenhoff Erik Marklund | |
Teemu Murtola Szilard Pall Sander Pronk Roland Schulz | |
Alexey Shvetsov Michael Shirts Alfons Sijbers Peter Tieleman | |
Teemu Virolainen Christian Wennberg Maarten Wolf | |
and the project leaders: | |
Mark Abraham, Berk Hess, Erik Lindahl, and David van der Spoel | |
Copyright (c) 1991-2000, University of Groningen, The Netherlands. | |
Copyright (c) 2001-2015, The GROMACS development team at | |
Uppsala University, Stockholm University and | |
the Royal Institute of Technology, Sweden. | |
check out http://www.gromacs.org for more information. | |
GROMACS is free software; you can redistribute it and/or modify it | |
under the terms of the GNU Lesser General Public License | |
as published by the Free Software Foundation; either version 2.1 | |
of the License, or (at your option) any later version. | |
GROMACS: gmx mdrun, version 2016.2-dev-20170105-4feb0be | |
Executable: /usr/local/gromacs/bin/gmx | |
Data prefix: /usr/local/gromacs | |
Working dir: /home/vedranm/0001.5 | |
Command line: | |
gmx mdrun -v | |
Back Off! I just backed up md.log to ./#md.log.93# | |
Running on 1 node with total 4 cores, 8 logical cores, 1 compatible GPU | |
Hardware detected: | |
CPU info: | |
Vendor: Intel | |
Brand: Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz | |
SIMD instructions most likely to fit this hardware: AVX_256 | |
SIMD instructions selected at GROMACS compile time: AVX_256 | |
Hardware topology: Basic | |
GPU info: | |
Number of GPUs detected: 1 | |
#0: name: AMD HAWAII (DRM 2.48.0 / 4.9.6-200.fc25.x86_64, LLVM 4.0.0), vendor: AMD, device version: OpenCL 1.1 Mesa 17.1.0-devel (git-af303ab), stat: compatible | |
Reading file topol.tpr, VERSION 2016.2-dev-20161115-0f1ce5d (single precision) | |
Changing nstlist from 10 to 40, rlist from 1 to 1.101 | |
Using 1 MPI thread | |
Using 8 OpenMP threads | |
1 compatible GPU is present, with ID 0 | |
1 GPU auto-selected for this run. | |
Mapping of GPU ID to the 1 PP rank in this node: 0 | |
;-----------------------------------------------------------------------
; Kernel "test" -- compiler-generated AMDGPU GCN assembly (clang 4.0.0,
; OpenCL, AMD Hawaii / gfx7 per amd_machine_version_major = 7 below).
; Visible behaviour: each work-item stores the 32-bit value 0 at
; out[v0], where out is the pointer in kernarg bytes 0-7 and v0 is the
; work-item id x (enable_vgpr_workitem_id = 0 => only id x in v0).
; There is no bounds check in this kernel.
; NOTE(review): every original line ends with " | |" -- scrape residue
; from the gist page this dump was captured from. Strip it before
; feeding the file to an assembler; it is preserved here byte-identical.
;-----------------------------------------------------------------------
.text | |
.section .AMDGPU.config | |
; Raw register-id / value pairs emitted by the compiler for the runtime
; (COMPUTE_PGM_RSRC* etc.); kept byte-identical.
.long 47176 | |
.long 11272192 | |
.long 47180 | |
.long 140 | |
.long 47200 | |
.long 0 | |
.long 4 | |
.long 0 | |
.long 8 | |
.long 0 | |
.text | |
.globl test | |
.p2align 8 | |
.type test,@function | |
.amdgpu_hsa_kernel test | |
test: ; @test | |
; HSA kernel code descriptor. Of note: s[0:3] = private segment buffer,
; s[4:5] = kernarg segment pointer (dispatch ptr disabled here),
; kernarg size 24 bytes, 6 SGPRs / 4 VGPRs used, wavefront_size = 6 is
; a log2 value (2^6 = 64 lanes) per the amd_kernel_code_t spec.
.amd_kernel_code_t | |
amd_code_version_major = 1 | |
amd_code_version_minor = 0 | |
amd_machine_kind = 1 | |
amd_machine_version_major = 7 | |
amd_machine_version_minor = 0 | |
amd_machine_version_stepping = 1 | |
kernel_code_entry_byte_offset = 256 | |
kernel_code_prefetch_byte_size = 0 | |
max_scratch_backing_memory_byte_size = 0 | |
granulated_workitem_vgpr_count = 0 | |
granulated_wavefront_sgpr_count = 0 | |
priority = 0 | |
float_mode = 192 | |
priv = 0 | |
enable_dx10_clamp = 1 | |
debug_mode = 0 | |
enable_ieee_mode = 1 | |
enable_sgpr_private_segment_wave_byte_offset = 0 | |
user_sgpr_count = 6 | |
enable_sgpr_workgroup_id_x = 1 | |
enable_sgpr_workgroup_id_y = 0 | |
enable_sgpr_workgroup_id_z = 0 | |
enable_sgpr_workgroup_info = 0 | |
enable_vgpr_workitem_id = 0 | |
enable_exception_msb = 0 | |
granulated_lds_size = 0 | |
enable_exception = 0 | |
enable_sgpr_private_segment_buffer = 1 | |
enable_sgpr_dispatch_ptr = 0 | |
enable_sgpr_queue_ptr = 0 | |
enable_sgpr_kernarg_segment_ptr = 1 | |
enable_sgpr_dispatch_id = 0 | |
enable_sgpr_flat_scratch_init = 0 | |
enable_sgpr_private_segment_size = 0 | |
enable_sgpr_grid_workgroup_count_x = 0 | |
enable_sgpr_grid_workgroup_count_y = 0 | |
enable_sgpr_grid_workgroup_count_z = 0 | |
enable_ordered_append_gds = 0 | |
private_element_size = 1 | |
is_ptr64 = 1 | |
is_dynamic_callstack = 0 | |
is_debug_enabled = 0 | |
is_xnack_enabled = 0 | |
workitem_private_segment_byte_size = 0 | |
workgroup_group_segment_byte_size = 0 | |
gds_segment_byte_size = 0 | |
kernarg_segment_byte_size = 24 | |
workgroup_fbarrier_count = 0 | |
wavefront_sgpr_count = 6 | |
workitem_vgpr_count = 4 | |
reserved_vgpr_first = 0 | |
reserved_vgpr_count = 0 | |
reserved_sgpr_first = 0 | |
reserved_sgpr_count = 0 | |
debug_wavefront_private_segment_offset_sgpr = 0 | |
debug_private_segment_buffer_sgpr = 0 | |
kernarg_segment_alignment = 4 | |
group_segment_alignment = 4 | |
private_segment_alignment = 4 | |
wavefront_size = 6 | |
call_convention = 0 | |
runtime_loader_kernel_symbol = 0 | |
.end_amd_kernel_code_t | |
; BB#0: | |
; s[0:1] = destination pointer, loaded from kernarg segment offset 0.
s_load_dwordx2 s[0:1], s[4:5], 0x0 | |
; v1 = 0: doubles as the stored value and as the zero high half of the
; 64-bit index pair v[0:1] shifted below.
v_mov_b32_e32 v1, 0 | |
; s2/s3 complete the 4-dword buffer resource descriptor around the
; pointer in s[0:1] (0xf000 = default data/num format dword).
s_mov_b32 s3, 0xf000 | |
s_mov_b32 s2, 0 | |
; v[2:3] = workitem_id_x << 2 -> byte offset of a 32-bit element.
v_lshl_b64 v[2:3], v[0:1], 2 | |
; Wait for the scalar (kernarg) load before using s[0:3].
s_waitcnt lgkmcnt(0) | |
; out[tid] = 0 (addr64: 64-bit per-lane address in v[2:3]).
buffer_store_dword v1, v[2:3], s[0:3], 0 addr64 | |
s_endpgm | |
.Lfunc_end0: | |
.size test, .Lfunc_end0-test | |
.section .AMDGPU.csdata | |
; Compiler resource-usage summary; matches the descriptor above.
; Kernel info: | |
; codeLenInByte = 44 | |
; NumSgprs: 6 | |
; NumVgprs: 4 | |
; FloatMode: 192 | |
; IeeeMode: 1 | |
; ScratchSize: 0 | |
; LDSByteSize: 0 bytes/workgroup (compile time only) | |
; SGPRBlocks: 0 | |
; VGPRBlocks: 0 | |
; NumSGPRsForWavesPerEU: 6 | |
; NumVGPRsForWavesPerEU: 4 | |
; ReservedVGPRFirst: 0 | |
; ReservedVGPRCount: 0 | |
; COMPUTE_PGM_RSRC2:USER_SGPR: 6 | |
; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 | |
; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 | |
; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 | |
; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 | |
.ident "clang version 4.0.0 (https://github.com/llvm-mirror/clang.git a6a1d3f3aa6c1d0018d84c3d6c26a552075982ab) (https://github.com/llvm-mirror/llvm.git 6a630d742074be775ca242c52071f5dcc5538b5b)" | |
.section ".note.GNU-stack" | |
;-----------------------------------------------------------------------
; Kernel "memset_f3" -- compiler-generated AMDGPU GCN assembly.
; Visible behaviour: computes a global index
;   gid = workgroup_size_x * workgroup_id_x + workitem_id_x + offset
; (offset from kernarg dword 5), and if gid < count (kernarg dword 3)
; stores four dwords at dst + gid*16 where the first three dwords are
; the fill value (kernarg dword 2) -- presumably a float3 memset with
; the fourth/padding lane left as whatever v3 holds (the address low
; word); TODO confirm against the OpenCL source.
; NOTE(review): trailing " | |" on each line is gist scrape residue,
; preserved byte-identical; strip before assembling.
;-----------------------------------------------------------------------
.text | |
.section .AMDGPU.config | |
.long 47176 | |
.long 11272257 | |
.long 47180 | |
.long 144 | |
.long 47200 | |
.long 0 | |
.long 4 | |
.long 0 | |
.long 8 | |
.long 0 | |
.text | |
.globl memset_f3 | |
.p2align 8 | |
.type memset_f3,@function | |
.amdgpu_hsa_kernel memset_f3 | |
memset_f3: ; @memset_f3 | |
; Descriptor: s[0:3] private segment buffer, s[4:5] dispatch packet
; pointer, s[6:7] kernarg pointer (8 user SGPRs), s8 = workgroup_id_x,
; v0 = workitem_id_x. wavefront_size = 6 is log2 (64 lanes).
.amd_kernel_code_t | |
amd_code_version_major = 1 | |
amd_code_version_minor = 0 | |
amd_machine_kind = 1 | |
amd_machine_version_major = 7 | |
amd_machine_version_minor = 0 | |
amd_machine_version_stepping = 1 | |
kernel_code_entry_byte_offset = 256 | |
kernel_code_prefetch_byte_size = 0 | |
max_scratch_backing_memory_byte_size = 0 | |
granulated_workitem_vgpr_count = 1 | |
granulated_wavefront_sgpr_count = 1 | |
priority = 0 | |
float_mode = 192 | |
priv = 0 | |
enable_dx10_clamp = 1 | |
debug_mode = 0 | |
enable_ieee_mode = 1 | |
enable_sgpr_private_segment_wave_byte_offset = 0 | |
user_sgpr_count = 8 | |
enable_sgpr_workgroup_id_x = 1 | |
enable_sgpr_workgroup_id_y = 0 | |
enable_sgpr_workgroup_id_z = 0 | |
enable_sgpr_workgroup_info = 0 | |
enable_vgpr_workitem_id = 0 | |
enable_exception_msb = 0 | |
granulated_lds_size = 0 | |
enable_exception = 0 | |
enable_sgpr_private_segment_buffer = 1 | |
enable_sgpr_dispatch_ptr = 1 | |
enable_sgpr_queue_ptr = 0 | |
enable_sgpr_kernarg_segment_ptr = 1 | |
enable_sgpr_dispatch_id = 0 | |
enable_sgpr_flat_scratch_init = 0 | |
enable_sgpr_private_segment_size = 0 | |
enable_sgpr_grid_workgroup_count_x = 0 | |
enable_sgpr_grid_workgroup_count_y = 0 | |
enable_sgpr_grid_workgroup_count_z = 0 | |
enable_ordered_append_gds = 0 | |
private_element_size = 1 | |
is_ptr64 = 1 | |
is_dynamic_callstack = 0 | |
is_debug_enabled = 0 | |
is_xnack_enabled = 0 | |
workitem_private_segment_byte_size = 0 | |
workgroup_group_segment_byte_size = 0 | |
gds_segment_byte_size = 0 | |
kernarg_segment_byte_size = 32 | |
workgroup_fbarrier_count = 0 | |
wavefront_sgpr_count = 11 | |
workitem_vgpr_count = 5 | |
reserved_vgpr_first = 0 | |
reserved_vgpr_count = 0 | |
reserved_sgpr_first = 0 | |
reserved_sgpr_count = 0 | |
debug_wavefront_private_segment_offset_sgpr = 0 | |
debug_private_segment_buffer_sgpr = 0 | |
kernarg_segment_alignment = 4 | |
group_segment_alignment = 4 | |
private_segment_alignment = 4 | |
wavefront_size = 6 | |
call_convention = 0 | |
runtime_loader_kernel_symbol = 0 | |
.end_amd_kernel_code_t | |
; BB#0: | |
; s0 = dispatch packet dword 1 (workgroup_size_x in its low 16 bits,
; masked below); s1 = kernarg dword 5 (index offset -- TODO confirm).
s_load_dword s0, s[4:5], 0x1 | |
s_load_dword s1, s[6:7], 0x5 | |
s_mov_b32 s2, 0 | |
s_waitcnt lgkmcnt(0) | |
s_and_b32 s0, s0, 0xffff | |
; s0 = workgroup_size_x * workgroup_id_x.
s_mul_i32 s0, s0, s8 | |
v_mov_b32_e32 v1, s0 | |
; s0 reloaded: element count (kernarg dword 3) for the bounds check.
s_load_dword s0, s[6:7], 0x3 | |
; v0 = gid = wgsize*wgid + workitem_id_x + offset.
v_add_i32_e32 v0, vcc, v0, v1 | |
v_add_i32_e32 v0, vcc, s1, v0 | |
s_waitcnt lgkmcnt(0) | |
; Predicate lanes with gid < count; exec mask saved for the rejoin.
v_cmp_gt_u32_e32 vcc, s0, v0 | |
s_and_saveexec_b64 s[0:1], vcc | |
s_xor_b64 s[4:5], exec, s[0:1] | |
; mask branch BB0_2 | |
BB0_1: | |
; In-bounds path: s[0:1] = dst pointer (kernarg dwords 0-1),
; s6 = 32-bit fill value (kernarg dword 2).
s_load_dwordx2 s[0:1], s[6:7], 0x0 | |
s_load_dword s6, s[6:7], 0x2 | |
v_mov_b32_e32 v1, 0 | |
; v[3:4] = gid * 16 (byte offset of a 16-byte / float3+pad element).
v_lshl_b64 v[3:4], v[0:1], 4 | |
s_mov_b32 s3, 0xf000 | |
s_waitcnt lgkmcnt(0) | |
; Splat fill value into v0..v2; v3 (4th store dword) is the address
; low word -- the padding lane's stored value is not the fill value.
v_mov_b32_e32 v0, s6 | |
v_mov_b32_e32 v1, v0 | |
v_mov_b32_e32 v2, v0 | |
buffer_store_dwordx4 v[0:3], v[3:4], s[0:3], 0 addr64 | |
s_waitcnt vmcnt(0) expcnt(0) | |
BB0_2: | |
; Restore full exec mask and finish.
s_or_b64 exec, exec, s[4:5] | |
s_endpgm | |
.Lfunc_end0: | |
.size memset_f3, .Lfunc_end0-memset_f3 | |
.section .AMDGPU.csdata | |
; Kernel info: | |
; codeLenInByte = 124 | |
; NumSgprs: 11 | |
; NumVgprs: 5 | |
; FloatMode: 192 | |
; IeeeMode: 1 | |
; ScratchSize: 0 | |
; LDSByteSize: 0 bytes/workgroup (compile time only) | |
; SGPRBlocks: 1 | |
; VGPRBlocks: 1 | |
; NumSGPRsForWavesPerEU: 11 | |
; NumVGPRsForWavesPerEU: 5 | |
; ReservedVGPRFirst: 0 | |
; ReservedVGPRCount: 0 | |
; COMPUTE_PGM_RSRC2:USER_SGPR: 8 | |
; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 | |
; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 | |
; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 | |
; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 | |
;-----------------------------------------------------------------------
; Kernel "memset_f2" -- compiler-generated AMDGPU GCN assembly.
; Visible behaviour: gid = wgsize_x*wgid_x + workitem_id_x + offset
; (offset from kernarg dword 5); if gid < count (kernarg dword 3),
; store the fill value (kernarg dword 2) duplicated into two dwords at
; dst + gid*8 -- presumably a float2 memset; TODO confirm vs. source.
; NOTE(review): trailing " | |" on each line is gist scrape residue,
; preserved byte-identical; strip before assembling.
;-----------------------------------------------------------------------
.section .AMDGPU.config | |
.long 47176 | |
.long 11272256 | |
.long 47180 | |
.long 144 | |
.long 47200 | |
.long 0 | |
.long 4 | |
.long 0 | |
.long 8 | |
.long 0 | |
.text | |
.globl memset_f2 | |
.p2align 8 | |
.type memset_f2,@function | |
.amdgpu_hsa_kernel memset_f2 | |
memset_f2: ; @memset_f2 | |
; Same user-SGPR layout as memset_f3: s[0:3] private segment buffer,
; s[4:5] dispatch ptr, s[6:7] kernarg ptr, s8 = workgroup_id_x,
; v0 = workitem_id_x. wavefront_size = 6 is log2 (64 lanes).
.amd_kernel_code_t | |
amd_code_version_major = 1 | |
amd_code_version_minor = 0 | |
amd_machine_kind = 1 | |
amd_machine_version_major = 7 | |
amd_machine_version_minor = 0 | |
amd_machine_version_stepping = 1 | |
kernel_code_entry_byte_offset = 256 | |
kernel_code_prefetch_byte_size = 0 | |
max_scratch_backing_memory_byte_size = 0 | |
granulated_workitem_vgpr_count = 0 | |
granulated_wavefront_sgpr_count = 1 | |
priority = 0 | |
float_mode = 192 | |
priv = 0 | |
enable_dx10_clamp = 1 | |
debug_mode = 0 | |
enable_ieee_mode = 1 | |
enable_sgpr_private_segment_wave_byte_offset = 0 | |
user_sgpr_count = 8 | |
enable_sgpr_workgroup_id_x = 1 | |
enable_sgpr_workgroup_id_y = 0 | |
enable_sgpr_workgroup_id_z = 0 | |
enable_sgpr_workgroup_info = 0 | |
enable_vgpr_workitem_id = 0 | |
enable_exception_msb = 0 | |
granulated_lds_size = 0 | |
enable_exception = 0 | |
enable_sgpr_private_segment_buffer = 1 | |
enable_sgpr_dispatch_ptr = 1 | |
enable_sgpr_queue_ptr = 0 | |
enable_sgpr_kernarg_segment_ptr = 1 | |
enable_sgpr_dispatch_id = 0 | |
enable_sgpr_flat_scratch_init = 0 | |
enable_sgpr_private_segment_size = 0 | |
enable_sgpr_grid_workgroup_count_x = 0 | |
enable_sgpr_grid_workgroup_count_y = 0 | |
enable_sgpr_grid_workgroup_count_z = 0 | |
enable_ordered_append_gds = 0 | |
private_element_size = 1 | |
is_ptr64 = 1 | |
is_dynamic_callstack = 0 | |
is_debug_enabled = 0 | |
is_xnack_enabled = 0 | |
workitem_private_segment_byte_size = 0 | |
workgroup_group_segment_byte_size = 0 | |
gds_segment_byte_size = 0 | |
kernarg_segment_byte_size = 32 | |
workgroup_fbarrier_count = 0 | |
wavefront_sgpr_count = 11 | |
workitem_vgpr_count = 4 | |
reserved_vgpr_first = 0 | |
reserved_vgpr_count = 0 | |
reserved_sgpr_first = 0 | |
reserved_sgpr_count = 0 | |
debug_wavefront_private_segment_offset_sgpr = 0 | |
debug_private_segment_buffer_sgpr = 0 | |
kernarg_segment_alignment = 4 | |
group_segment_alignment = 4 | |
private_segment_alignment = 4 | |
wavefront_size = 6 | |
call_convention = 0 | |
runtime_loader_kernel_symbol = 0 | |
.end_amd_kernel_code_t | |
; BB#0: | |
; gid computation -- identical pattern to memset_f3 (see that kernel).
s_load_dword s0, s[4:5], 0x1 | |
s_load_dword s1, s[6:7], 0x5 | |
s_mov_b32 s2, 0 | |
s_waitcnt lgkmcnt(0) | |
s_and_b32 s0, s0, 0xffff | |
s_mul_i32 s0, s0, s8 | |
v_mov_b32_e32 v1, s0 | |
s_load_dword s0, s[6:7], 0x3 | |
v_add_i32_e32 v0, vcc, v0, v1 | |
v_add_i32_e32 v0, vcc, s1, v0 | |
s_waitcnt lgkmcnt(0) | |
; Predicate on gid < count; lanes out of range skip the store.
v_cmp_gt_u32_e32 vcc, s0, v0 | |
s_and_saveexec_b64 s[0:1], vcc | |
s_xor_b64 s[4:5], exec, s[0:1] | |
; mask branch BB1_2 | |
BB1_1: | |
; s[0:1] = dst pointer, s6 = fill value; v[0:1] = gid*8 byte offset.
s_load_dwordx2 s[0:1], s[6:7], 0x0 | |
s_load_dword s6, s[6:7], 0x2 | |
v_mov_b32_e32 v1, 0 | |
s_mov_b32 s3, 0xf000 | |
v_lshl_b64 v[0:1], v[0:1], 3 | |
s_waitcnt lgkmcnt(0) | |
; v[2:3] = fill value duplicated into both dwords of the element.
v_mov_b32_e32 v2, s6 | |
v_mov_b32_e32 v3, v2 | |
buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 | |
s_waitcnt vmcnt(0) expcnt(0) | |
BB1_2: | |
s_or_b64 exec, exec, s[4:5] | |
s_endpgm | |
.Lfunc_end1: | |
.size memset_f2, .Lfunc_end1-memset_f2 | |
.section .AMDGPU.csdata | |
; Kernel info: | |
; codeLenInByte = 120 | |
; NumSgprs: 11 | |
; NumVgprs: 4 | |
; FloatMode: 192 | |
; IeeeMode: 1 | |
; ScratchSize: 0 | |
; LDSByteSize: 0 bytes/workgroup (compile time only) | |
; SGPRBlocks: 1 | |
; VGPRBlocks: 0 | |
; NumSGPRsForWavesPerEU: 11 | |
; NumVGPRsForWavesPerEU: 4 | |
; ReservedVGPRFirst: 0 | |
; ReservedVGPRCount: 0 | |
; COMPUTE_PGM_RSRC2:USER_SGPR: 8 | |
; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 | |
; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 | |
; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 | |
; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 | |
;-----------------------------------------------------------------------
; Kernel "memset_f" -- compiler-generated AMDGPU GCN assembly.
; Visible behaviour: gid = wgsize_x*wgid_x + workitem_id_x + offset
; (offset from kernarg dword 5); if gid < count (kernarg dword 3),
; store the single 32-bit fill value (kernarg dword 2) at dst + gid*4
; -- a scalar float memset.
; NOTE(review): trailing " | |" on each line is gist scrape residue,
; preserved byte-identical; strip before assembling.
;-----------------------------------------------------------------------
.section .AMDGPU.config | |
.long 47176 | |
.long 11272256 | |
.long 47180 | |
.long 144 | |
.long 47200 | |
.long 0 | |
.long 4 | |
.long 0 | |
.long 8 | |
.long 0 | |
.text | |
.globl memset_f | |
.p2align 8 | |
.type memset_f,@function | |
.amdgpu_hsa_kernel memset_f | |
memset_f: ; @memset_f | |
; Same user-SGPR layout as the other memset kernels: s[0:3] private
; segment buffer, s[4:5] dispatch ptr, s[6:7] kernarg ptr,
; s8 = workgroup_id_x, v0 = workitem_id_x.
.amd_kernel_code_t | |
amd_code_version_major = 1 | |
amd_code_version_minor = 0 | |
amd_machine_kind = 1 | |
amd_machine_version_major = 7 | |
amd_machine_version_minor = 0 | |
amd_machine_version_stepping = 1 | |
kernel_code_entry_byte_offset = 256 | |
kernel_code_prefetch_byte_size = 0 | |
max_scratch_backing_memory_byte_size = 0 | |
granulated_workitem_vgpr_count = 0 | |
granulated_wavefront_sgpr_count = 1 | |
priority = 0 | |
float_mode = 192 | |
priv = 0 | |
enable_dx10_clamp = 1 | |
debug_mode = 0 | |
enable_ieee_mode = 1 | |
enable_sgpr_private_segment_wave_byte_offset = 0 | |
user_sgpr_count = 8 | |
enable_sgpr_workgroup_id_x = 1 | |
enable_sgpr_workgroup_id_y = 0 | |
enable_sgpr_workgroup_id_z = 0 | |
enable_sgpr_workgroup_info = 0 | |
enable_vgpr_workitem_id = 0 | |
enable_exception_msb = 0 | |
granulated_lds_size = 0 | |
enable_exception = 0 | |
enable_sgpr_private_segment_buffer = 1 | |
enable_sgpr_dispatch_ptr = 1 | |
enable_sgpr_queue_ptr = 0 | |
enable_sgpr_kernarg_segment_ptr = 1 | |
enable_sgpr_dispatch_id = 0 | |
enable_sgpr_flat_scratch_init = 0 | |
enable_sgpr_private_segment_size = 0 | |
enable_sgpr_grid_workgroup_count_x = 0 | |
enable_sgpr_grid_workgroup_count_y = 0 | |
enable_sgpr_grid_workgroup_count_z = 0 | |
enable_ordered_append_gds = 0 | |
private_element_size = 1 | |
is_ptr64 = 1 | |
is_dynamic_callstack = 0 | |
is_debug_enabled = 0 | |
is_xnack_enabled = 0 | |
workitem_private_segment_byte_size = 0 | |
workgroup_group_segment_byte_size = 0 | |
gds_segment_byte_size = 0 | |
kernarg_segment_byte_size = 32 | |
workgroup_fbarrier_count = 0 | |
wavefront_sgpr_count = 11 | |
workitem_vgpr_count = 3 | |
reserved_vgpr_first = 0 | |
reserved_vgpr_count = 0 | |
reserved_sgpr_first = 0 | |
reserved_sgpr_count = 0 | |
debug_wavefront_private_segment_offset_sgpr = 0 | |
debug_private_segment_buffer_sgpr = 0 | |
kernarg_segment_alignment = 4 | |
group_segment_alignment = 4 | |
private_segment_alignment = 4 | |
wavefront_size = 6 | |
call_convention = 0 | |
runtime_loader_kernel_symbol = 0 | |
.end_amd_kernel_code_t | |
; BB#0: | |
; gid computation -- identical pattern to memset_f3 (see that kernel).
s_load_dword s0, s[4:5], 0x1 | |
s_load_dword s1, s[6:7], 0x5 | |
s_mov_b32 s2, 0 | |
s_waitcnt lgkmcnt(0) | |
s_and_b32 s0, s0, 0xffff | |
s_mul_i32 s0, s0, s8 | |
v_mov_b32_e32 v1, s0 | |
s_load_dword s0, s[6:7], 0x3 | |
v_add_i32_e32 v0, vcc, v0, v1 | |
v_add_i32_e32 v0, vcc, s1, v0 | |
s_waitcnt lgkmcnt(0) | |
; Predicate on gid < count; lanes out of range skip the store.
v_cmp_gt_u32_e32 vcc, s0, v0 | |
s_and_saveexec_b64 s[0:1], vcc | |
s_xor_b64 s[4:5], exec, s[0:1] | |
; mask branch BB2_2 | |
BB2_1: | |
; s[0:1] = dst pointer, s6 = fill value; v[0:1] = gid*4 byte offset.
s_load_dwordx2 s[0:1], s[6:7], 0x0 | |
s_load_dword s6, s[6:7], 0x2 | |
v_mov_b32_e32 v1, 0 | |
s_mov_b32 s3, 0xf000 | |
v_lshl_b64 v[0:1], v[0:1], 2 | |
s_waitcnt lgkmcnt(0) | |
v_mov_b32_e32 v2, s6 | |
buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 | |
s_waitcnt vmcnt(0) expcnt(0) | |
BB2_2: | |
s_or_b64 exec, exec, s[4:5] | |
s_endpgm | |
.Lfunc_end2: | |
.size memset_f, .Lfunc_end2-memset_f | |
.section .AMDGPU.csdata | |
; Kernel info: | |
; codeLenInByte = 116 | |
; NumSgprs: 11 | |
; NumVgprs: 3 | |
; FloatMode: 192 | |
; IeeeMode: 1 | |
; ScratchSize: 0 | |
; LDSByteSize: 0 bytes/workgroup (compile time only) | |
; SGPRBlocks: 1 | |
; VGPRBlocks: 0 | |
; NumSGPRsForWavesPerEU: 11 | |
; NumVGPRsForWavesPerEU: 3 | |
; ReservedVGPRFirst: 0 | |
; ReservedVGPRCount: 0 | |
; COMPUTE_PGM_RSRC2:USER_SGPR: 8 | |
; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 | |
; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 | |
; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 | |
; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 | |
;-----------------------------------------------------------------------
; Kernel "zero_e_fshift" -- compiler-generated AMDGPU GCN assembly.
; Visible behaviour:
;   gid = wgsize_x*wgid_x + workitem_id_x + offset (kernarg dword 8)
;   if gid < count (kernarg dword 6): store 0 at ptr0[gid] (32-bit,
;     ptr0 from kernarg dwords 0-1)
;   if gid == 0: additionally store a single 32-bit 0 at the start of
;     two more buffers (pointers at kernarg dwords 2-3 and 4-5) --
;     presumably zeroing energy and force-shift accumulators, per the
;     kernel name; TODO confirm against the GROMACS OpenCL source.
; NOTE(review): trailing " | |" on each line is gist scrape residue,
; preserved byte-identical; strip before assembling.
;-----------------------------------------------------------------------
.section .AMDGPU.config | |
.long 47176 | |
.long 11272256 | |
.long 47180 | |
.long 144 | |
.long 47200 | |
.long 0 | |
.long 4 | |
.long 0 | |
.long 8 | |
.long 0 | |
.text | |
.globl zero_e_fshift | |
.p2align 8 | |
.type zero_e_fshift,@function | |
.amdgpu_hsa_kernel zero_e_fshift | |
zero_e_fshift: ; @zero_e_fshift | |
; Same user-SGPR layout as the memset kernels: s[0:3] private segment
; buffer, s[4:5] dispatch ptr, s[6:7] kernarg ptr, s8 = workgroup_id_x,
; v0 = workitem_id_x. Kernarg segment is 44 bytes here (3 pointers +
; scalars).
.amd_kernel_code_t | |
amd_code_version_major = 1 | |
amd_code_version_minor = 0 | |
amd_machine_kind = 1 | |
amd_machine_version_major = 7 | |
amd_machine_version_minor = 0 | |
amd_machine_version_stepping = 1 | |
kernel_code_entry_byte_offset = 256 | |
kernel_code_prefetch_byte_size = 0 | |
max_scratch_backing_memory_byte_size = 0 | |
granulated_workitem_vgpr_count = 0 | |
granulated_wavefront_sgpr_count = 1 | |
priority = 0 | |
float_mode = 192 | |
priv = 0 | |
enable_dx10_clamp = 1 | |
debug_mode = 0 | |
enable_ieee_mode = 1 | |
enable_sgpr_private_segment_wave_byte_offset = 0 | |
user_sgpr_count = 8 | |
enable_sgpr_workgroup_id_x = 1 | |
enable_sgpr_workgroup_id_y = 0 | |
enable_sgpr_workgroup_id_z = 0 | |
enable_sgpr_workgroup_info = 0 | |
enable_vgpr_workitem_id = 0 | |
enable_exception_msb = 0 | |
granulated_lds_size = 0 | |
enable_exception = 0 | |
enable_sgpr_private_segment_buffer = 1 | |
enable_sgpr_dispatch_ptr = 1 | |
enable_sgpr_queue_ptr = 0 | |
enable_sgpr_kernarg_segment_ptr = 1 | |
enable_sgpr_dispatch_id = 0 | |
enable_sgpr_flat_scratch_init = 0 | |
enable_sgpr_private_segment_size = 0 | |
enable_sgpr_grid_workgroup_count_x = 0 | |
enable_sgpr_grid_workgroup_count_y = 0 | |
enable_sgpr_grid_workgroup_count_z = 0 | |
enable_ordered_append_gds = 0 | |
private_element_size = 1 | |
is_ptr64 = 1 | |
is_dynamic_callstack = 0 | |
is_debug_enabled = 0 | |
is_xnack_enabled = 0 | |
workitem_private_segment_byte_size = 0 | |
workgroup_group_segment_byte_size = 0 | |
gds_segment_byte_size = 0 | |
kernarg_segment_byte_size = 44 | |
workgroup_fbarrier_count = 0 | |
wavefront_sgpr_count = 14 | |
workitem_vgpr_count = 4 | |
reserved_vgpr_first = 0 | |
reserved_vgpr_count = 0 | |
reserved_sgpr_first = 0 | |
reserved_sgpr_count = 0 | |
debug_wavefront_private_segment_offset_sgpr = 0 | |
debug_private_segment_buffer_sgpr = 0 | |
kernarg_segment_alignment = 4 | |
group_segment_alignment = 4 | |
private_segment_alignment = 4 | |
wavefront_size = 6 | |
call_convention = 0 | |
runtime_loader_kernel_symbol = 0 | |
.end_amd_kernel_code_t | |
; BB#0: | |
; gid computation -- same pattern as the memset kernels, but the
; offset lives at kernarg dword 8 and the count at kernarg dword 6.
s_load_dword s0, s[4:5], 0x1 | |
s_load_dword s1, s[6:7], 0x8 | |
s_mov_b32 s2, 0 | |
s_waitcnt lgkmcnt(0) | |
s_and_b32 s0, s0, 0xffff | |
s_mul_i32 s0, s0, s8 | |
v_mov_b32_e32 v1, s0 | |
s_load_dword s0, s[6:7], 0x6 | |
v_add_i32_e32 v0, vcc, v0, v1 | |
v_add_i32_e32 v0, vcc, s1, v0 | |
s_waitcnt lgkmcnt(0) | |
; Predicate lanes with gid < count for the per-element zero store.
v_cmp_gt_u32_e32 vcc, s0, v0 | |
s_and_saveexec_b64 s[0:1], vcc | |
s_xor_b64 s[4:5], exec, s[0:1] | |
; mask branch BB3_2 | |
BB3_1: | |
; ptr0[gid] = 0 (s[0:1] = pointer from kernarg dwords 0-1).
s_load_dwordx2 s[0:1], s[6:7], 0x0 | |
v_mov_b32_e32 v1, 0 | |
s_mov_b32 s3, 0xf000 | |
v_lshl_b64 v[2:3], v[0:1], 2 | |
s_waitcnt lgkmcnt(0) | |
buffer_store_dword v1, v[2:3], s[0:3], 0 addr64 | |
s_waitcnt vmcnt(0) expcnt(0) | |
BB3_2: | |
; Rejoin, then re-predicate on gid == 0 for the one-time stores.
s_or_b64 exec, exec, s[4:5] | |
v_cmp_eq_u32_e32 vcc, 0, v0 | |
s_and_saveexec_b64 s[0:1], vcc | |
s_xor_b64 s[0:1], exec, s[0:1] | |
; mask branch BB3_4 | |
BB3_3: | |
; gid == 0 only: build two buffer descriptors from the pointers at
; kernarg dwords 2-3 and 4-5 (s10 = -1 => unlimited num_records) and
; store a single 32-bit 0 at offset 0 of each.
s_load_dwordx2 s[8:9], s[6:7], 0x2 | |
s_load_dwordx2 s[4:5], s[6:7], 0x4 | |
s_mov_b32 s11, 0xf000 | |
s_mov_b32 s10, -1 | |
v_mov_b32_e32 v0, 0 | |
s_mov_b32 s6, s10 | |
s_mov_b32 s7, s11 | |
s_waitcnt lgkmcnt(0) | |
buffer_store_dword v0, off, s[8:11], 0 | |
buffer_store_dword v0, off, s[4:7], 0 | |
s_waitcnt vmcnt(0) expcnt(0) | |
BB3_4: | |
s_or_b64 exec, exec, s[0:1] | |
s_endpgm | |
.Lfunc_end3: | |
.size zero_e_fshift, .Lfunc_end3-zero_e_fshift | |
.section .AMDGPU.csdata | |
; Kernel info: | |
; codeLenInByte = 180 | |
; NumSgprs: 14 | |
; NumVgprs: 4 | |
; FloatMode: 192 | |
; IeeeMode: 1 | |
; ScratchSize: 0 | |
; LDSByteSize: 0 bytes/workgroup (compile time only) | |
; SGPRBlocks: 1 | |
; VGPRBlocks: 0 | |
; NumSGPRsForWavesPerEU: 14 | |
; NumVGPRsForWavesPerEU: 4 | |
; ReservedVGPRFirst: 0 | |
; ReservedVGPRCount: 0 | |
; COMPUTE_PGM_RSRC2:USER_SGPR: 8 | |
; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 | |
; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 | |
; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 | |
; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 | |
.section .AMDGPU.config | |
.long 47176 | |
.long 11272534 | |
.long 47180 | |
.long 2192 | |
.long 47200 | |
.long 0 | |
.long 4 | |
.long 0 | |
.long 8 | |
.long 0 | |
.text | |
.globl nbnxn_kernel_ElecEw_VdwLJCombGeom_F_opencl | |
.p2align 8 | |
.type nbnxn_kernel_ElecEw_VdwLJCombGeom_F_opencl,@function | |
.amdgpu_hsa_kernel nbnxn_kernel_ElecEw_VdwLJCombGeom_F_opencl | |
nbnxn_kernel_ElecEw_VdwLJCombGeom_F_opencl: ; @nbnxn_kernel_ElecEw_VdwLJCombGeom_F_opencl | |
.amd_kernel_code_t | |
amd_code_version_major = 1 | |
amd_code_version_minor = 0 | |
amd_machine_kind = 1 | |
amd_machine_version_major = 7 | |
amd_machine_version_minor = 0 | |
amd_machine_version_stepping = 1 | |
kernel_code_entry_byte_offset = 256 | |
kernel_code_prefetch_byte_size = 0 | |
max_scratch_backing_memory_byte_size = 0 | |
granulated_workitem_vgpr_count = 22 | |
granulated_wavefront_sgpr_count = 5 | |
priority = 0 | |
float_mode = 192 | |
priv = 0 | |
enable_dx10_clamp = 1 | |
debug_mode = 0 | |
enable_ieee_mode = 1 | |
enable_sgpr_private_segment_wave_byte_offset = 0 | |
user_sgpr_count = 8 | |
enable_sgpr_workgroup_id_x = 1 | |
enable_sgpr_workgroup_id_y = 0 | |
enable_sgpr_workgroup_id_z = 0 | |
enable_sgpr_workgroup_info = 0 | |
enable_vgpr_workitem_id = 1 | |
enable_exception_msb = 0 | |
granulated_lds_size = 0 | |
enable_exception = 0 | |
enable_sgpr_private_segment_buffer = 1 | |
enable_sgpr_dispatch_ptr = 1 | |
enable_sgpr_queue_ptr = 0 | |
enable_sgpr_kernarg_segment_ptr = 1 | |
enable_sgpr_dispatch_id = 0 | |
enable_sgpr_flat_scratch_init = 0 | |
enable_sgpr_private_segment_size = 0 | |
enable_sgpr_grid_workgroup_count_x = 0 | |
enable_sgpr_grid_workgroup_count_y = 0 | |
enable_sgpr_grid_workgroup_count_z = 0 | |
enable_ordered_append_gds = 0 | |
private_element_size = 1 | |
is_ptr64 = 1 | |
is_dynamic_callstack = 0 | |
is_debug_enabled = 0 | |
is_xnack_enabled = 0 | |
workitem_private_segment_byte_size = 0 | |
workgroup_group_segment_byte_size = 0 | |
gds_segment_byte_size = 0 | |
kernarg_segment_byte_size = 232 | |
workgroup_fbarrier_count = 0 | |
wavefront_sgpr_count = 42 | |
workitem_vgpr_count = 92 | |
reserved_vgpr_first = 0 | |
reserved_vgpr_count = 0 | |
reserved_sgpr_first = 0 | |
reserved_sgpr_count = 0 | |
debug_wavefront_private_segment_offset_sgpr = 0 | |
debug_private_segment_buffer_sgpr = 0 | |
kernarg_segment_alignment = 4 | |
group_segment_alignment = 4 | |
private_segment_alignment = 4 | |
wavefront_size = 6 | |
call_convention = 0 | |
runtime_loader_kernel_symbol = 0 | |
.end_amd_kernel_code_t | |
; BB#0: | |
s_load_dwordx2 s[0:1], s[6:7], 0x2c | |
s_mov_b32 s9, 0 | |
s_lshl_b64 s[10:11], s[8:9], 4 | |
v_mov_b32_e32 v3, s10 | |
s_mov_b32 s2, s9 | |
s_mov_b32 s3, 0xf000 | |
v_mov_b32_e32 v4, s11 | |
s_waitcnt lgkmcnt(0) | |
buffer_load_dwordx4 v[31:34], v[3:4], s[0:3], 0 addr64 | |
v_mov_b32_e32 v2, v0 | |
s_load_dwordx2 s[8:9], s[6:7], 0x24 | |
s_load_dwordx2 s[16:17], s[6:7], 0x18 | |
s_mov_b64 s[18:19], s[2:3] | |
s_mov_b64 s[10:11], s[2:3] | |
s_load_dword s14, s[6:7], 0x33 | |
s_load_dword s0, s[6:7], 0x2 | |
s_load_dwordx2 s[20:21], s[6:7], 0x22 | |
s_mov_b32 m0, -1 | |
s_mov_b64 s[22:23], s[2:3] | |
s_load_dword s1, s[4:5], 0x1 | |
s_waitcnt lgkmcnt(0) | |
s_add_i32 s15, s14, 0x420 | |
s_waitcnt vmcnt(0) | |
v_lshlrev_b32_e32 v40, 3, v31 | |
v_mul_lo_i32 v4, v32, 3 | |
v_add_i32_e32 v0, vcc, v1, v40 | |
v_lshlrev_b32_e32 v0, 3, v0 | |
v_add_i32_e32 v9, vcc, v2, v0 | |
v_ashrrev_i32_e32 v10, 31, v9 | |
v_ashrrev_i32_e32 v5, 31, v4 | |
v_lshl_b64 v[11:12], v[4:5], 2 | |
v_lshl_b64 v[6:7], v[9:10], 4 | |
buffer_load_dwordx4 v[5:8], v[6:7], s[16:19], 0 addr64 | |
buffer_load_dwordx2 v[13:14], v[11:12], s[8:11], 0 addr64 | |
buffer_load_dword v0, v[11:12], s[8:11], 0 addr64 offset:8 | |
s_waitcnt vmcnt(1) | |
v_add_f32_e32 v11, v5, v13 | |
s_waitcnt vmcnt(0) | |
v_add_f32_e32 v5, v7, v0 | |
v_lshlrev_b32_e32 v0, 3, v1 | |
v_add_i32_e32 v39, vcc, v2, v0 | |
v_lshlrev_b32_e32 v3, 4, v39 | |
v_add_f32_e32 v12, v6, v14 | |
v_mul_f32_e32 v6, s0, v8 | |
v_add_i32_e32 v3, vcc, s14, v3 | |
ds_write2_b64 v3, v[11:12], v[5:6] offset1:1 | |
s_waitcnt lgkmcnt(0) | |
v_lshl_b64 v[5:6], v[9:10], 3 | |
buffer_load_dwordx2 v[5:6], v[5:6], s[20:23], 0 addr64 | |
s_and_b32 s0, s1, 0xffff | |
v_mad_u32_u24 v52, s0, v1, v2 | |
v_lshlrev_b32_e32 v7, 3, v39 | |
v_add_i32_e32 v7, vcc, s15, v7 | |
v_or_b32_e32 v3, 32, v52 | |
v_lshrrev_b32_e32 v41, 5, v52 | |
v_cmp_eq_u32_e32 vcc, 32, v3 | |
s_waitcnt vmcnt(0) | |
ds_write_b64 v7, v[5:6] | |
s_and_saveexec_b64 s[0:1], vcc | |
s_xor_b64 s[0:1], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
; mask branch BB4_2 | |
BB4_1: | |
v_lshlrev_b32_e32 v3, 2, v41 | |
v_add_i32_e32 v3, vcc, s14, v3 | |
v_mov_b32_e32 v5, 0 | |
s_mov_b32 m0, -1 | |
ds_write_b32 v3, v5 offset:2336 | |
s_waitcnt lgkmcnt(0) | |
BB4_2: ; %.preheader447587 | |
s_or_b64 exec, exec, s[0:1] | |
s_barrier | |
s_load_dwordx2 s[12:13], s[6:7], 0x1a | |
v_cmp_lt_i32_e32 vcc, v33, v34 | |
s_and_b64 vcc, exec, vcc | |
v_mov_b32_e32 v12, -1 | |
s_waitcnt lgkmcnt(0) | |
s_mov_b64 vcc, vcc | |
s_cbranch_vccnz BB4_4 | |
; BB#3: ; %.preheader447587.._crit_edge_crit_edge | |
v_mov_b32_e32 v43, 0 | |
v_lshlrev_b32_e32 v3, 2, v52 | |
v_mov_b32_e32 v44, v43 | |
v_mov_b32_e32 v45, v43 | |
v_add_i32_e32 v3, vcc, s14, v3 | |
v_mov_b32_e32 v8, v43 | |
v_add_i32_e32 v5, vcc, 0x620, v3 | |
v_add_i32_e32 v6, vcc, 0x720, v3 | |
v_add_i32_e32 v7, vcc, 0x820, v3 | |
v_mov_b32_e32 v12, 0 | |
v_mov_b32_e32 v9, v44 | |
v_mov_b32_e32 v10, v45 | |
v_mov_b32_e32 v11, v46 | |
s_branch BB4_5 | |
BB4_4: | |
; implicit-def: %VGPR43_VGPR44_VGPR45_VGPR46 | |
; implicit-def: %VGPR5 | |
; implicit-def: %VGPR6 | |
; implicit-def: %VGPR7 | |
; implicit-def: %VGPR8_VGPR9_VGPR10_VGPR11 | |
BB4_5: ; %Flow1190 | |
s_load_dwordx2 s[8:9], s[6:7], 0x20 | |
v_cmp_ne_u32_e32 vcc, 0, v12 | |
v_cndmask_b32_e64 v11, 0, 1, vcc | |
v_cmp_ne_u32_e32 vcc, 1, v11 | |
v_mov_b32_e32 v35, v43 | |
v_mov_b32_e32 v27, v43 | |
v_mov_b32_e32 v23, v43 | |
v_mov_b32_e32 v19, v43 | |
v_mov_b32_e32 v15, v43 | |
v_mov_b32_e32 v11, v43 | |
s_movk_i32 s18, 0x620 | |
v_mov_b32_e32 v3, 0 | |
s_add_i32 s4, s14, s18 | |
s_and_b64 vcc, exec, vcc | |
v_mov_b32_e32 v36, v44 | |
v_mov_b32_e32 v37, v45 | |
v_mov_b32_e32 v38, v46 | |
v_mov_b32_e32 v28, v44 | |
v_mov_b32_e32 v29, v45 | |
v_mov_b32_e32 v30, v46 | |
v_mov_b32_e32 v24, v44 | |
v_mov_b32_e32 v25, v45 | |
v_mov_b32_e32 v26, v46 | |
v_mov_b32_e32 v20, v44 | |
v_mov_b32_e32 v21, v45 | |
v_mov_b32_e32 v22, v46 | |
v_mov_b32_e32 v16, v44 | |
v_mov_b32_e32 v17, v45 | |
v_mov_b32_e32 v18, v46 | |
v_mov_b32_e32 v12, v44 | |
v_mov_b32_e32 v13, v45 | |
v_mov_b32_e32 v14, v46 | |
s_waitcnt lgkmcnt(0) | |
s_mov_b64 vcc, vcc | |
s_cbranch_vccnz BB4_172 | |
; BB#6: ; %.lr.ph | |
v_or_b32_e32 v5, 4, v1 | |
v_cmp_eq_u32_e32 vcc, 4, v5 | |
v_cmp_gt_u32_e64 s[0:1], 4, v2 | |
s_and_b64 s[26:27], s[0:1], vcc | |
v_add_i32_e32 v5, vcc, v1, v2 | |
v_and_b32_e32 v8, 4, v1 | |
s_add_i32 s22, s14, 0x400 | |
v_lshlrev_b32_e32 v5, 2, v5 | |
v_lshlrev_b32_e32 v8, 2, v8 | |
v_add_i32_e32 v50, vcc, s22, v5 | |
v_lshlrev_b32_e32 v5, 2, v52 | |
v_and_b32_e32 v48, 31, v52 | |
v_add_i32_e32 v52, vcc, s22, v8 | |
v_lshlrev_b32_e32 v8, 4, v2 | |
s_load_dword s19, s[6:7], 0x5 | |
v_add_i32_e32 v53, vcc, s14, v8 | |
v_lshlrev_b32_e32 v8, 3, v2 | |
v_add_i32_e32 v54, vcc, s15, v8 | |
v_mov_b32_e32 v8, 0 | |
s_load_dwordx2 s[10:11], s[6:7], 0x30 | |
s_load_dword s5, s[6:7], 0x9 | |
s_load_dwordx2 s[24:25], s[6:7], 0x2e | |
v_mov_b32_e32 v9, v8 | |
v_mov_b32_e32 v10, v8 | |
v_mov_b32_e32 v14, v11 | |
v_mov_b32_e32 v13, v10 | |
v_mov_b32_e32 v12, v9 | |
v_mov_b32_e32 v11, v8 | |
v_add_i32_e32 v7, vcc, s14, v5 | |
v_mov_b32_e32 v18, v11 | |
v_mov_b32_e32 v22, v11 | |
v_mov_b32_e32 v26, v11 | |
v_mov_b32_e32 v30, v11 | |
v_mov_b32_e32 v38, v11 | |
v_mov_b32_e32 v46, v11 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e64 v47, s19, s19 | |
v_mov_b32_e32 v42, 0 | |
s_mov_b32 s30, 0 | |
v_mov_b32_e32 v49, v42 | |
v_cmp_gt_u32_e64 s[0:1], v1, v2 | |
v_cmp_ne_u32_e64 s[2:3], 22, v32 | |
v_mul_f32_e32 v51, s19, v47 | |
v_add_i32_e32 v5, vcc, s18, v7 | |
v_add_i32_e32 v6, vcc, 0x720, v7 | |
v_add_i32_e32 v7, vcc, 0x820, v7 | |
s_mov_b32 s31, 0xf000 | |
s_mov_b64 s[28:29], 0 | |
v_ashrrev_i32_e32 v56, 31, v33 | |
v_mov_b32_e32 v55, v33 | |
v_or_b32_e32 v33, 7, v40 | |
v_or_b32_e32 v57, 6, v40 | |
v_or_b32_e32 v58, 5, v40 | |
v_or_b32_e32 v59, 4, v40 | |
v_or_b32_e32 v60, 3, v40 | |
v_or_b32_e32 v61, 2, v40 | |
v_or_b32_e32 v62, 1, v40 | |
v_mov_b32_e32 v17, v10 | |
v_mov_b32_e32 v16, v9 | |
v_mov_b32_e32 v15, v8 | |
v_mov_b32_e32 v21, v10 | |
v_mov_b32_e32 v20, v9 | |
v_mov_b32_e32 v19, v8 | |
v_mov_b32_e32 v25, v10 | |
v_mov_b32_e32 v24, v9 | |
v_mov_b32_e32 v23, v8 | |
v_mov_b32_e32 v29, v10 | |
v_mov_b32_e32 v28, v9 | |
v_mov_b32_e32 v27, v8 | |
v_mov_b32_e32 v37, v10 | |
v_mov_b32_e32 v36, v9 | |
v_mov_b32_e32 v35, v8 | |
v_mov_b32_e32 v45, v10 | |
v_mov_b32_e32 v44, v9 | |
v_mov_b32_e32 v43, v8 | |
; implicit-def: %VGPR63_VGPR64_VGPR65_VGPR66 | |
BB4_7: ; =>This Loop Header: Depth=1 | |
; Child Loop BB4_47 Depth 2 | |
; Child Loop BB4_87 Depth 2 | |
; Child Loop BB4_127 Depth 2 | |
; Child Loop BB4_167 Depth 2 | |
v_lshl_b64 v[63:64], v[55:56], 5 | |
v_add_i32_e32 v67, vcc, s24, v63 | |
v_mov_b32_e32 v63, s25 | |
v_addc_u32_e32 v64, vcc, v64, v63, vcc | |
v_lshl_b64 v[68:69], v[41:42], 3 | |
v_add_i32_e32 v67, vcc, v67, v68 | |
v_addc_u32_e32 v68, vcc, v64, v69, vcc | |
buffer_load_dwordx2 v[67:68], v[67:68], s[28:31], 0 addr64 offset:16 | |
s_waitcnt vmcnt(0) | |
v_cmp_ne_u32_e32 vcc, 0, v67 | |
s_and_saveexec_b64 s[14:15], vcc | |
s_xor_b64 s[32:33], exec, s[14:15] | |
; mask branch BB4_171 | |
s_cbranch_execz BB4_171 | |
BB4_8: ; in Loop: Header=BB4_7 Depth=1 | |
v_ashrrev_i32_e32 v69, 31, v68 | |
v_lshl_b64 v[63:64], v[68:69], 7 | |
v_add_i32_e32 v68, vcc, s10, v63 | |
v_mov_b32_e32 v63, s11 | |
v_addc_u32_e32 v64, vcc, v64, v63, vcc | |
v_lshl_b64 v[69:70], v[48:49], 2 | |
v_add_i32_e32 v68, vcc, v68, v69 | |
v_addc_u32_e32 v69, vcc, v64, v70, vcc | |
buffer_load_dword v63, v[68:69], s[28:31], 0 addr64 | |
s_and_saveexec_b64 s[14:15], s[26:27] | |
s_xor_b64 s[14:15], exec, s[14:15] | |
s_waitcnt vmcnt(0) | |
; mask branch BB4_10 | |
s_cbranch_execz BB4_10 | |
BB4_9: ; in Loop: Header=BB4_7 Depth=1 | |
v_lshl_b64 v[64:65], v[55:56], 5 | |
v_add_i32_e32 v68, vcc, s24, v64 | |
v_mov_b32_e32 v64, s25 | |
v_addc_u32_e32 v65, vcc, v65, v64, vcc | |
v_lshl_b64 v[69:70], v[2:3], 2 | |
v_add_i32_e32 v68, vcc, v68, v69 | |
v_addc_u32_e32 v69, vcc, v65, v70, vcc | |
buffer_load_dword v64, v[68:69], s[28:31], 0 addr64 | |
s_mov_b32 m0, -1 | |
s_waitcnt vmcnt(0) | |
ds_write_b32 v50, v64 | |
s_waitcnt lgkmcnt(0) | |
BB4_10: ; %.preheader.preheader | |
; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[14:15] | |
v_and_b32_e32 v64, 0xff, v67 | |
v_cmp_ne_u32_e32 vcc, 0, v64 | |
s_and_saveexec_b64 s[14:15], vcc | |
s_xor_b64 s[34:35], exec, s[14:15] | |
; mask branch BB4_50 | |
s_cbranch_execz BB4_50 | |
BB4_11: ; in Loop: Header=BB4_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v74, v52 | |
s_mov_b64 s[18:19], s[30:31] | |
s_mov_b64 s[22:23], s[30:31] | |
s_waitcnt lgkmcnt(0) | |
v_lshlrev_b32_e32 v64, 3, v74 | |
v_add_i32_e32 v64, vcc, v64, v1 | |
v_ashrrev_i32_e32 v65, 31, v64 | |
v_lshl_b64 v[68:69], v[64:65], 4 | |
v_lshl_b64 v[75:76], v[64:65], 3 | |
buffer_load_dwordx4 v[70:73], v[68:69], s[16:19], 0 addr64 | |
buffer_load_dwordx2 v[68:69], v[75:76], s[20:23], 0 addr64 | |
v_mov_b32_e32 v75, 0 | |
v_and_b32_e32 v65, 1, v67 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
v_mov_b32_e32 v77, v75 | |
v_mov_b32_e32 v80, v75 | |
s_and_saveexec_b64 s[14:15], vcc | |
s_xor_b64 s[14:15], exec, s[14:15] | |
; implicit-def: %VGPR81_VGPR82_VGPR83_VGPR84 | |
s_waitcnt vmcnt(0) | |
; mask branch BB4_15 | |
s_cbranch_execz BB4_15 | |
BB4_12: ; in Loop: Header=BB4_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset1:1 | |
v_cmp_ne_u32_e32 vcc, v40, v74 | |
s_and_b64 s[18:19], exec, s[2:3] | |
s_or_b64 s[18:19], s[18:19], vcc | |
s_and_b64 s[22:23], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v78, v71, v86 | |
s_or_b64 s[18:19], s[22:23], s[18:19] | |
v_subrev_f32_e32 v79, v70, v85 | |
v_mul_f32_e32 v81, v78, v78 | |
v_cndmask_b32_e64 v75, 0, 1.0, s[18:19] | |
v_subrev_f32_e32 v76, v72, v87 | |
v_mac_f32_e32 v81, v79, v79 | |
v_mac_f32_e32 v81, v76, v76 | |
v_mul_f32_e32 v75, s5, v75 | |
v_cmp_lt_f32_e32 vcc, v81, v75 | |
v_mov_b32_e32 v75, 0 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_mov_b32_e32 v77, v75 | |
v_mov_b32_e32 v80, v75 | |
s_and_saveexec_b64 s[18:19], vcc | |
s_xor_b64 s[18:19], exec, s[18:19] | |
; implicit-def: %VGPR81_VGPR82_VGPR83_VGPR84 | |
; mask branch BB4_14 | |
s_cbranch_execz BB4_14 | |
BB4_13: ; in Loop: Header=BB4_7 Depth=1 | |
v_max_f32_e32 v77, 0x34cd15ae, v81 | |
v_rsq_f32_e32 v81, v77 | |
v_mul_f32_e32 v77, v47, v77 | |
v_mul_f32_e32 v82, v77, v77 | |
v_mov_b32_e32 v83, 0x3a92b707 | |
v_madak_f32_e32 v83, v83, v82, 0x3ded3cb2 | |
v_mov_b32_e32 v84, 0x3c739487 | |
v_madak_f32_e32 v84, v84, v82, 0x3f01e2bc | |
v_mad_f32 v83, v83, v82, 1.0 | |
v_mac_f32_e32 v83, v77, v84 | |
v_mov_b32_e32 v84, 0xb2951928 | |
v_madak_f32_e32 v84, v84, v82, 0xb85ffb93 | |
v_mov_b32_e32 v85, 0x35c55945 | |
v_madak_f32_e32 v85, v85, v82, 0x3a83ca0c | |
v_madak_f32_e32 v84, v84, v82, 0xbc9ded90 | |
v_madak_f32_e32 v85, v85, v82, 0x3d8eaf3b | |
v_madak_f32_e32 v82, v84, v82, 0xbf409397 | |
v_mac_f32_e32 v82, v77, v85 | |
v_rcp_f32_e32 v77, v83 | |
v_and_b32_e32 v80, 1, v63 | |
v_cmp_eq_u32_e32 vcc, 1, v80 | |
v_cndmask_b32_e64 v80, 0, 1.0, vcc | |
v_mul_f32_e32 v77, v51, v77 | |
v_mul_f32_e32 v77, v82, v77 | |
v_mul_f32_e32 v82, v81, v81 | |
v_mul_f32_e32 v83, v80, v82 | |
v_mac_f32_e32 v77, v81, v83 | |
v_mul_f32_e32 v81, v82, v82 | |
s_mov_b32 m0, -1 | |
v_mul_f32_e32 v83, v80, v81 | |
ds_read_b64 v[80:81], v54 | |
v_mul_f32_e32 v83, v82, v83 | |
v_mul_f32_e32 v75, v73, v88 | |
v_mac_f32_e32 v46, v0, v65 | |
v_mul_f32_e64 v84, v65, -v0 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v81, v69, v81 | |
v_mul_f32_e64 v80, v68, -v80 | |
v_mac_f32_e32 v80, v83, v81 | |
v_mul_f32_e32 v81, v82, v83 | |
v_mul_f32_e32 v81, v80, v81 | |
v_mac_f32_e32 v81, v77, v75 | |
v_mad_f32 v45, v76, v81, v45 | |
v_mad_f32 v44, v78, v81, v44 | |
v_mad_f32 v43, v79, v81, v43 | |
v_mul_f32_e64 v80, v81, -v76 | |
v_mul_f32_e64 v77, v81, -v78 | |
v_mul_f32_e64 v75, v81, -v79 | |
BB4_14: ; %Flow1186 | |
; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
BB4_15: ; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[14:15] | |
v_lshrrev_b32_e32 v65, 1, v67 | |
v_and_b32_e32 v65, 1, v65 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
s_and_saveexec_b64 s[14:15], vcc | |
s_xor_b64 s[14:15], exec, s[14:15] | |
; mask branch BB4_19 | |
s_cbranch_execz BB4_19 | |
BB4_16: ; in Loop: Header=BB4_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset0:16 offset1:17 | |
v_cmp_ne_u32_e32 vcc, v62, v74 | |
s_and_b64 s[18:19], exec, s[2:3] | |
s_or_b64 s[18:19], s[18:19], vcc | |
s_and_b64 s[22:23], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v78, v71, v86 | |
s_or_b64 s[18:19], s[22:23], s[18:19] | |
v_subrev_f32_e32 v76, v70, v85 | |
v_mul_f32_e32 v81, v78, v78 | |
v_cndmask_b32_e64 v82, 0, 1.0, s[18:19] | |
v_subrev_f32_e32 v79, v72, v87 | |
v_mac_f32_e32 v81, v76, v76 | |
v_mac_f32_e32 v81, v79, v79 | |
v_mul_f32_e32 v82, s5, v82 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_cmp_lt_f32_e32 vcc, v81, v82 | |
s_and_saveexec_b64 s[18:19], vcc | |
s_xor_b64 s[18:19], exec, s[18:19] | |
; mask branch BB4_18 | |
s_cbranch_execz BB4_18 | |
BB4_17: ; in Loop: Header=BB4_7 Depth=1 | |
v_max_f32_e32 v81, 0x34cd15ae, v81 | |
v_mul_f32_e32 v86, v47, v81 | |
v_mul_f32_e32 v83, v73, v88 | |
v_mul_f32_e32 v88, v86, v86 | |
v_mov_b32_e32 v89, 0x3a92b707 | |
v_madak_f32_e32 v89, v89, v88, 0x3ded3cb2 | |
v_mov_b32_e32 v90, 0x3c739487 | |
v_madak_f32_e32 v90, v90, v88, 0x3f01e2bc | |
v_mad_f32 v89, v89, v88, 1.0 | |
v_mac_f32_e32 v89, v86, v90 | |
v_lshrrev_b32_e32 v82, 1, v63 | |
v_mov_b32_e32 v90, 0xb2951928 | |
v_rsq_f32_e32 v85, v81 | |
v_madak_f32_e32 v90, v90, v88, 0xb85ffb93 | |
v_mov_b32_e32 v91, 0x35c55945 | |
v_rcp_f32_e32 v89, v89 | |
v_and_b32_e32 v82, 1, v82 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e32 vcc, 1, v82 | |
ds_read_b64 v[81:82], v54 offset:64 | |
v_madak_f32_e32 v91, v91, v88, 0x3a83ca0c | |
v_madak_f32_e32 v90, v90, v88, 0xbc9ded90 | |
v_madak_f32_e32 v91, v91, v88, 0x3d8eaf3b | |
v_madak_f32_e32 v88, v90, v88, 0xbf409397 | |
v_mac_f32_e32 v88, v86, v91 | |
v_mul_f32_e32 v86, v85, v85 | |
v_mul_f32_e32 v89, v51, v89 | |
v_cndmask_b32_e64 v87, 0, 1.0, vcc | |
v_mul_f32_e32 v90, v86, v86 | |
v_mul_f32_e32 v90, v87, v90 | |
v_mul_f32_e32 v87, v87, v86 | |
v_mul_f32_e32 v88, v88, v89 | |
v_mac_f32_e32 v88, v85, v87 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v82, v69, v82 | |
v_mul_f32_e64 v81, v68, -v81 | |
v_mul_f32_e32 v85, v86, v90 | |
v_mac_f32_e32 v81, v85, v82 | |
v_mul_f32_e32 v82, v86, v85 | |
v_mul_f32_e32 v81, v81, v82 | |
v_mac_f32_e32 v81, v88, v83 | |
v_mad_f32 v80, -v81, v79, v80 | |
v_mad_f32 v37, v79, v81, v37 | |
v_mad_f32 v77, -v81, v78, v77 | |
v_mad_f32 v36, v78, v81, v36 | |
v_mad_f32 v75, -v81, v76, v75 | |
v_mad_f32 v35, v76, v81, v35 | |
v_mad_f32 v84, -v65, v0, v84 | |
v_mac_f32_e32 v38, v0, v65 | |
BB4_18: ; %Flow1185 | |
; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
BB4_19: ; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[14:15] | |
v_lshrrev_b32_e32 v65, 2, v67 | |
v_and_b32_e32 v65, 1, v65 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
s_and_saveexec_b64 s[14:15], vcc | |
s_xor_b64 s[14:15], exec, s[14:15] | |
; mask branch BB4_23 | |
s_cbranch_execz BB4_23 | |
BB4_20: ; in Loop: Header=BB4_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset0:32 offset1:33 | |
v_cmp_ne_u32_e32 vcc, v61, v74 | |
s_and_b64 s[18:19], exec, s[2:3] | |
s_or_b64 s[18:19], s[18:19], vcc | |
s_and_b64 s[22:23], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v78, v71, v86 | |
s_or_b64 s[18:19], s[22:23], s[18:19] | |
v_subrev_f32_e32 v76, v70, v85 | |
v_mul_f32_e32 v81, v78, v78 | |
v_cndmask_b32_e64 v82, 0, 1.0, s[18:19] | |
v_subrev_f32_e32 v79, v72, v87 | |
v_mac_f32_e32 v81, v76, v76 | |
v_mac_f32_e32 v81, v79, v79 | |
v_mul_f32_e32 v82, s5, v82 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_cmp_lt_f32_e32 vcc, v81, v82 | |
s_and_saveexec_b64 s[18:19], vcc | |
s_xor_b64 s[18:19], exec, s[18:19] | |
; mask branch BB4_22 | |
s_cbranch_execz BB4_22 | |
BB4_21: ; in Loop: Header=BB4_7 Depth=1 | |
v_max_f32_e32 v81, 0x34cd15ae, v81 | |
v_mul_f32_e32 v86, v47, v81 | |
v_mul_f32_e32 v83, v73, v88 | |
v_mul_f32_e32 v88, v86, v86 | |
v_mov_b32_e32 v89, 0x3a92b707 | |
v_madak_f32_e32 v89, v89, v88, 0x3ded3cb2 | |
v_mov_b32_e32 v90, 0x3c739487 | |
v_madak_f32_e32 v90, v90, v88, 0x3f01e2bc | |
v_mad_f32 v89, v89, v88, 1.0 | |
v_mac_f32_e32 v89, v86, v90 | |
v_lshrrev_b32_e32 v82, 2, v63 | |
v_mov_b32_e32 v90, 0xb2951928 | |
v_rsq_f32_e32 v85, v81 | |
v_madak_f32_e32 v90, v90, v88, 0xb85ffb93 | |
v_mov_b32_e32 v91, 0x35c55945 | |
v_rcp_f32_e32 v89, v89 | |
v_and_b32_e32 v82, 1, v82 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e32 vcc, 1, v82 | |
ds_read_b64 v[81:82], v54 offset:128 | |
v_madak_f32_e32 v91, v91, v88, 0x3a83ca0c | |
v_madak_f32_e32 v90, v90, v88, 0xbc9ded90 | |
v_madak_f32_e32 v91, v91, v88, 0x3d8eaf3b | |
v_madak_f32_e32 v88, v90, v88, 0xbf409397 | |
v_mac_f32_e32 v88, v86, v91 | |
v_mul_f32_e32 v86, v85, v85 | |
v_mul_f32_e32 v89, v51, v89 | |
v_cndmask_b32_e64 v87, 0, 1.0, vcc | |
v_mul_f32_e32 v90, v86, v86 | |
v_mul_f32_e32 v90, v87, v90 | |
v_mul_f32_e32 v87, v87, v86 | |
v_mul_f32_e32 v88, v88, v89 | |
v_mac_f32_e32 v88, v85, v87 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v82, v69, v82 | |
v_mul_f32_e64 v81, v68, -v81 | |
v_mul_f32_e32 v85, v86, v90 | |
v_mac_f32_e32 v81, v85, v82 | |
v_mul_f32_e32 v82, v86, v85 | |
v_mul_f32_e32 v81, v81, v82 | |
v_mac_f32_e32 v81, v88, v83 | |
v_mad_f32 v80, -v81, v79, v80 | |
v_mad_f32 v29, v79, v81, v29 | |
v_mad_f32 v77, -v81, v78, v77 | |
v_mad_f32 v28, v78, v81, v28 | |
v_mad_f32 v75, -v81, v76, v75 | |
v_mad_f32 v27, v76, v81, v27 | |
v_mad_f32 v84, -v65, v0, v84 | |
v_mac_f32_e32 v30, v0, v65 | |
BB4_22: ; %Flow1184 | |
; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
BB4_23: ; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[14:15] | |
v_lshrrev_b32_e32 v65, 3, v67 | |
v_and_b32_e32 v65, 1, v65 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
s_and_saveexec_b64 s[14:15], vcc | |
s_xor_b64 s[14:15], exec, s[14:15] | |
; mask branch BB4_27 | |
s_cbranch_execz BB4_27 | |
BB4_24: ; in Loop: Header=BB4_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset0:48 offset1:49 | |
v_cmp_ne_u32_e32 vcc, v60, v74 | |
s_and_b64 s[18:19], exec, s[2:3] | |
s_or_b64 s[18:19], s[18:19], vcc | |
s_and_b64 s[22:23], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v78, v71, v86 | |
s_or_b64 s[18:19], s[22:23], s[18:19] | |
v_subrev_f32_e32 v76, v70, v85 | |
v_mul_f32_e32 v81, v78, v78 | |
v_cndmask_b32_e64 v82, 0, 1.0, s[18:19] | |
v_subrev_f32_e32 v79, v72, v87 | |
v_mac_f32_e32 v81, v76, v76 | |
v_mac_f32_e32 v81, v79, v79 | |
v_mul_f32_e32 v82, s5, v82 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_cmp_lt_f32_e32 vcc, v81, v82 | |
s_and_saveexec_b64 s[18:19], vcc | |
s_xor_b64 s[18:19], exec, s[18:19] | |
; mask branch BB4_26 | |
s_cbranch_execz BB4_26 | |
BB4_25: ; in Loop: Header=BB4_7 Depth=1 | |
v_max_f32_e32 v81, 0x34cd15ae, v81 | |
v_mul_f32_e32 v86, v47, v81 | |
v_mul_f32_e32 v83, v73, v88 | |
v_mul_f32_e32 v88, v86, v86 | |
v_mov_b32_e32 v89, 0x3a92b707 | |
v_madak_f32_e32 v89, v89, v88, 0x3ded3cb2 | |
v_mov_b32_e32 v90, 0x3c739487 | |
v_madak_f32_e32 v90, v90, v88, 0x3f01e2bc | |
v_mad_f32 v89, v89, v88, 1.0 | |
v_mac_f32_e32 v89, v86, v90 | |
v_lshrrev_b32_e32 v82, 3, v63 | |
v_mov_b32_e32 v90, 0xb2951928 | |
v_rsq_f32_e32 v85, v81 | |
v_madak_f32_e32 v90, v90, v88, 0xb85ffb93 | |
v_mov_b32_e32 v91, 0x35c55945 | |
v_rcp_f32_e32 v89, v89 | |
v_and_b32_e32 v82, 1, v82 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e32 vcc, 1, v82 | |
ds_read_b64 v[81:82], v54 offset:192 | |
v_madak_f32_e32 v91, v91, v88, 0x3a83ca0c | |
v_madak_f32_e32 v90, v90, v88, 0xbc9ded90 | |
v_madak_f32_e32 v91, v91, v88, 0x3d8eaf3b | |
v_madak_f32_e32 v88, v90, v88, 0xbf409397 | |
v_mac_f32_e32 v88, v86, v91 | |
v_mul_f32_e32 v86, v85, v85 | |
v_mul_f32_e32 v89, v51, v89 | |
v_cndmask_b32_e64 v87, 0, 1.0, vcc | |
v_mul_f32_e32 v90, v86, v86 | |
v_mul_f32_e32 v90, v87, v90 | |
v_mul_f32_e32 v87, v87, v86 | |
v_mul_f32_e32 v88, v88, v89 | |
v_mac_f32_e32 v88, v85, v87 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v82, v69, v82 | |
v_mul_f32_e64 v81, v68, -v81 | |
v_mul_f32_e32 v85, v86, v90 | |
v_mac_f32_e32 v81, v85, v82 | |
v_mul_f32_e32 v82, v86, v85 | |
v_mul_f32_e32 v81, v81, v82 | |
v_mac_f32_e32 v81, v88, v83 | |
v_mad_f32 v80, -v81, v79, v80 | |
v_mad_f32 v25, v79, v81, v25 | |
v_mad_f32 v77, -v81, v78, v77 | |
v_mad_f32 v24, v78, v81, v24 | |
v_mad_f32 v75, -v81, v76, v75 | |
v_mad_f32 v23, v76, v81, v23 | |
v_mad_f32 v84, -v65, v0, v84 | |
v_mac_f32_e32 v26, v0, v65 | |
BB4_26: ; %Flow1183 | |
; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
BB4_27: ; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[14:15] | |
v_lshrrev_b32_e32 v65, 4, v67 | |
v_and_b32_e32 v65, 1, v65 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
s_and_saveexec_b64 s[14:15], vcc | |
s_xor_b64 s[14:15], exec, s[14:15] | |
; mask branch BB4_31 | |
s_cbranch_execz BB4_31 | |
BB4_28: ; in Loop: Header=BB4_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset0:64 offset1:65 | |
v_cmp_ne_u32_e32 vcc, v59, v74 | |
s_and_b64 s[18:19], exec, s[2:3] | |
s_or_b64 s[18:19], s[18:19], vcc | |
s_and_b64 s[22:23], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v78, v71, v86 | |
s_or_b64 s[18:19], s[22:23], s[18:19] | |
v_subrev_f32_e32 v76, v70, v85 | |
v_mul_f32_e32 v81, v78, v78 | |
v_cndmask_b32_e64 v82, 0, 1.0, s[18:19] | |
v_subrev_f32_e32 v79, v72, v87 | |
v_mac_f32_e32 v81, v76, v76 | |
v_mac_f32_e32 v81, v79, v79 | |
v_mul_f32_e32 v82, s5, v82 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_cmp_lt_f32_e32 vcc, v81, v82 | |
s_and_saveexec_b64 s[18:19], vcc | |
s_xor_b64 s[18:19], exec, s[18:19] | |
; mask branch BB4_30 | |
s_cbranch_execz BB4_30 | |
BB4_29: ; in Loop: Header=BB4_7 Depth=1 | |
v_max_f32_e32 v81, 0x34cd15ae, v81 | |
v_mul_f32_e32 v86, v47, v81 | |
v_mul_f32_e32 v83, v73, v88 | |
v_mul_f32_e32 v88, v86, v86 | |
v_mov_b32_e32 v89, 0x3a92b707 | |
v_madak_f32_e32 v89, v89, v88, 0x3ded3cb2 | |
v_mov_b32_e32 v90, 0x3c739487 | |
v_madak_f32_e32 v90, v90, v88, 0x3f01e2bc | |
v_mad_f32 v89, v89, v88, 1.0 | |
v_mac_f32_e32 v89, v86, v90 | |
v_lshrrev_b32_e32 v82, 4, v63 | |
v_mov_b32_e32 v90, 0xb2951928 | |
v_rsq_f32_e32 v85, v81 | |
v_madak_f32_e32 v90, v90, v88, 0xb85ffb93 | |
v_mov_b32_e32 v91, 0x35c55945 | |
v_rcp_f32_e32 v89, v89 | |
v_and_b32_e32 v82, 1, v82 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e32 vcc, 1, v82 | |
ds_read_b64 v[81:82], v54 offset:256 | |
v_madak_f32_e32 v91, v91, v88, 0x3a83ca0c | |
v_madak_f32_e32 v90, v90, v88, 0xbc9ded90 | |
v_madak_f32_e32 v91, v91, v88, 0x3d8eaf3b | |
v_madak_f32_e32 v88, v90, v88, 0xbf409397 | |
v_mac_f32_e32 v88, v86, v91 | |
v_mul_f32_e32 v86, v85, v85 | |
v_mul_f32_e32 v89, v51, v89 | |
v_cndmask_b32_e64 v87, 0, 1.0, vcc | |
v_mul_f32_e32 v90, v86, v86 | |
v_mul_f32_e32 v90, v87, v90 | |
v_mul_f32_e32 v87, v87, v86 | |
v_mul_f32_e32 v88, v88, v89 | |
v_mac_f32_e32 v88, v85, v87 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v82, v69, v82 | |
v_mul_f32_e64 v81, v68, -v81 | |
v_mul_f32_e32 v85, v86, v90 | |
v_mac_f32_e32 v81, v85, v82 | |
v_mul_f32_e32 v82, v86, v85 | |
v_mul_f32_e32 v81, v81, v82 | |
v_mac_f32_e32 v81, v88, v83 | |
v_mad_f32 v80, -v81, v79, v80 | |
v_mad_f32 v21, v79, v81, v21 | |
v_mad_f32 v77, -v81, v78, v77 | |
v_mad_f32 v20, v78, v81, v20 | |
v_mad_f32 v75, -v81, v76, v75 | |
v_mad_f32 v19, v76, v81, v19 | |
v_mad_f32 v84, -v65, v0, v84 | |
v_mac_f32_e32 v22, v0, v65 | |
BB4_30: ; %Flow1182 | |
; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
BB4_31: ; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[14:15] | |
v_lshrrev_b32_e32 v65, 5, v67 | |
v_and_b32_e32 v65, 1, v65 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
s_and_saveexec_b64 s[14:15], vcc | |
s_xor_b64 s[14:15], exec, s[14:15] | |
; mask branch BB4_35 | |
s_cbranch_execz BB4_35 | |
BB4_32: ; in Loop: Header=BB4_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset0:80 offset1:81 | |
v_cmp_ne_u32_e32 vcc, v58, v74 | |
s_and_b64 s[18:19], exec, s[2:3] | |
s_or_b64 s[18:19], s[18:19], vcc | |
s_and_b64 s[22:23], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v78, v71, v86 | |
s_or_b64 s[18:19], s[22:23], s[18:19] | |
v_subrev_f32_e32 v76, v70, v85 | |
v_mul_f32_e32 v81, v78, v78 | |
v_cndmask_b32_e64 v82, 0, 1.0, s[18:19] | |
v_subrev_f32_e32 v79, v72, v87 | |
v_mac_f32_e32 v81, v76, v76 | |
v_mac_f32_e32 v81, v79, v79 | |
v_mul_f32_e32 v82, s5, v82 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_cmp_lt_f32_e32 vcc, v81, v82 | |
s_and_saveexec_b64 s[18:19], vcc | |
s_xor_b64 s[18:19], exec, s[18:19] | |
; mask branch BB4_34 | |
s_cbranch_execz BB4_34 | |
BB4_33: ; in Loop: Header=BB4_7 Depth=1 | |
v_max_f32_e32 v81, 0x34cd15ae, v81 | |
v_mul_f32_e32 v86, v47, v81 | |
v_mul_f32_e32 v83, v73, v88 | |
v_mul_f32_e32 v88, v86, v86 | |
v_mov_b32_e32 v89, 0x3a92b707 | |
v_madak_f32_e32 v89, v89, v88, 0x3ded3cb2 | |
v_mov_b32_e32 v90, 0x3c739487 | |
v_madak_f32_e32 v90, v90, v88, 0x3f01e2bc | |
v_mad_f32 v89, v89, v88, 1.0 | |
v_mac_f32_e32 v89, v86, v90 | |
v_lshrrev_b32_e32 v82, 5, v63 | |
v_mov_b32_e32 v90, 0xb2951928 | |
v_rsq_f32_e32 v85, v81 | |
v_madak_f32_e32 v90, v90, v88, 0xb85ffb93 | |
v_mov_b32_e32 v91, 0x35c55945 | |
v_rcp_f32_e32 v89, v89 | |
v_and_b32_e32 v82, 1, v82 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e32 vcc, 1, v82 | |
ds_read_b64 v[81:82], v54 offset:320 | |
v_madak_f32_e32 v91, v91, v88, 0x3a83ca0c | |
v_madak_f32_e32 v90, v90, v88, 0xbc9ded90 | |
v_madak_f32_e32 v91, v91, v88, 0x3d8eaf3b | |
v_madak_f32_e32 v88, v90, v88, 0xbf409397 | |
v_mac_f32_e32 v88, v86, v91 | |
v_mul_f32_e32 v86, v85, v85 | |
v_mul_f32_e32 v89, v51, v89 | |
v_cndmask_b32_e64 v87, 0, 1.0, vcc | |
v_mul_f32_e32 v90, v86, v86 | |
v_mul_f32_e32 v90, v87, v90 | |
v_mul_f32_e32 v87, v87, v86 | |
v_mul_f32_e32 v88, v88, v89 | |
v_mac_f32_e32 v88, v85, v87 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v82, v69, v82 | |
v_mul_f32_e64 v81, v68, -v81 | |
v_mul_f32_e32 v85, v86, v90 | |
v_mac_f32_e32 v81, v85, v82 | |
v_mul_f32_e32 v82, v86, v85 | |
v_mul_f32_e32 v81, v81, v82 | |
v_mac_f32_e32 v81, v88, v83 | |
v_mad_f32 v80, -v81, v79, v80 | |
v_mad_f32 v17, v79, v81, v17 | |
v_mad_f32 v77, -v81, v78, v77 | |
v_mad_f32 v16, v78, v81, v16 | |
v_mad_f32 v75, -v81, v76, v75 | |
v_mad_f32 v15, v76, v81, v15 | |
v_mad_f32 v84, -v65, v0, v84 | |
v_mac_f32_e32 v18, v0, v65 | |
BB4_34: ; %Flow1181 | |
; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
BB4_35: ; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[14:15] | |
v_lshrrev_b32_e32 v65, 6, v67 | |
v_and_b32_e32 v65, 1, v65 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
s_and_saveexec_b64 s[14:15], vcc | |
s_xor_b64 s[14:15], exec, s[14:15] | |
; mask branch BB4_39 | |
s_cbranch_execz BB4_39 | |
BB4_36: ; in Loop: Header=BB4_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset0:96 offset1:97 | |
v_cmp_ne_u32_e32 vcc, v57, v74 | |
s_and_b64 s[18:19], exec, s[2:3] | |
s_or_b64 s[18:19], s[18:19], vcc | |
s_and_b64 s[22:23], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v78, v71, v86 | |
s_or_b64 s[18:19], s[22:23], s[18:19] | |
v_subrev_f32_e32 v76, v70, v85 | |
v_mul_f32_e32 v81, v78, v78 | |
v_cndmask_b32_e64 v82, 0, 1.0, s[18:19] | |
v_subrev_f32_e32 v79, v72, v87 | |
v_mac_f32_e32 v81, v76, v76 | |
v_mac_f32_e32 v81, v79, v79 | |
v_mul_f32_e32 v82, s5, v82 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_cmp_lt_f32_e32 vcc, v81, v82 | |
s_and_saveexec_b64 s[18:19], vcc | |
s_xor_b64 s[18:19], exec, s[18:19] | |
; mask branch BB4_38 | |
s_cbranch_execz BB4_38 | |
BB4_37: ; in Loop: Header=BB4_7 Depth=1 | |
v_max_f32_e32 v81, 0x34cd15ae, v81 | |
v_mul_f32_e32 v86, v47, v81 | |
v_mul_f32_e32 v83, v73, v88 | |
v_mul_f32_e32 v88, v86, v86 | |
v_mov_b32_e32 v89, 0x3a92b707 | |
v_madak_f32_e32 v89, v89, v88, 0x3ded3cb2 | |
v_mov_b32_e32 v90, 0x3c739487 | |
v_madak_f32_e32 v90, v90, v88, 0x3f01e2bc | |
v_mad_f32 v89, v89, v88, 1.0 | |
v_mac_f32_e32 v89, v86, v90 | |
v_lshrrev_b32_e32 v82, 6, v63 | |
v_mov_b32_e32 v90, 0xb2951928 | |
v_rsq_f32_e32 v85, v81 | |
v_madak_f32_e32 v90, v90, v88, 0xb85ffb93 | |
v_mov_b32_e32 v91, 0x35c55945 | |
v_rcp_f32_e32 v89, v89 | |
v_and_b32_e32 v82, 1, v82 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e32 vcc, 1, v82 | |
ds_read_b64 v[81:82], v54 offset:384 | |
v_madak_f32_e32 v91, v91, v88, 0x3a83ca0c | |
v_madak_f32_e32 v90, v90, v88, 0xbc9ded90 | |
v_madak_f32_e32 v91, v91, v88, 0x3d8eaf3b | |
v_madak_f32_e32 v88, v90, v88, 0xbf409397 | |
v_mac_f32_e32 v88, v86, v91 | |
v_mul_f32_e32 v86, v85, v85 | |
v_mul_f32_e32 v89, v51, v89 | |
v_cndmask_b32_e64 v87, 0, 1.0, vcc | |
v_mul_f32_e32 v90, v86, v86 | |
v_mul_f32_e32 v90, v87, v90 | |
v_mul_f32_e32 v87, v87, v86 | |
v_mul_f32_e32 v88, v88, v89 | |
v_mac_f32_e32 v88, v85, v87 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v82, v69, v82 | |
v_mul_f32_e64 v81, v68, -v81 | |
v_mul_f32_e32 v85, v86, v90 | |
v_mac_f32_e32 v81, v85, v82 | |
v_mul_f32_e32 v82, v86, v85 | |
v_mul_f32_e32 v81, v81, v82 | |
v_mac_f32_e32 v81, v88, v83 | |
v_mad_f32 v80, -v81, v79, v80 | |
v_mad_f32 v13, v79, v81, v13 | |
v_mad_f32 v77, -v81, v78, v77 | |
v_mad_f32 v12, v78, v81, v12 | |
v_mad_f32 v75, -v81, v76, v75 | |
v_mad_f32 v11, v76, v81, v11 | |
v_mad_f32 v84, -v65, v0, v84 | |
v_mac_f32_e32 v14, v0, v65 | |
BB4_38: ; %Flow1180 | |
; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
BB4_39: ; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[14:15] | |
v_lshrrev_b32_e32 v65, 7, v67 | |
v_and_b32_e32 v65, 1, v65 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
s_and_saveexec_b64 s[14:15], vcc | |
s_xor_b64 s[14:15], exec, s[14:15] | |
; mask branch BB4_43 | |
s_cbranch_execz BB4_43 | |
BB4_40: ; in Loop: Header=BB4_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset0:112 offset1:113 | |
v_cmp_ne_u32_e32 vcc, v33, v74 | |
s_and_b64 s[18:19], exec, s[2:3] | |
s_or_b64 s[18:19], s[18:19], vcc | |
s_and_b64 s[22:23], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v71, v71, v86 | |
s_or_b64 s[18:19], s[22:23], s[18:19] | |
v_subrev_f32_e32 v70, v70, v85 | |
v_mul_f32_e32 v74, v71, v71 | |
v_cndmask_b32_e64 v76, 0, 1.0, s[18:19] | |
v_subrev_f32_e32 v72, v72, v87 | |
v_mac_f32_e32 v74, v70, v70 | |
v_mac_f32_e32 v74, v72, v72 | |
v_mul_f32_e32 v76, s5, v76 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_cmp_lt_f32_e32 vcc, v74, v76 | |
s_and_saveexec_b64 s[18:19], vcc | |
s_xor_b64 s[18:19], exec, s[18:19] | |
; mask branch BB4_42 | |
s_cbranch_execz BB4_42 | |
BB4_41: ; in Loop: Header=BB4_7 Depth=1 | |
v_mul_f32_e32 v79, v73, v88 | |
s_mov_b32 m0, -1 | |
v_mad_f32 v73, -v65, v0, v84 | |
v_max_f32_e32 v78, 0x34cd15ae, v74 | |
ds_read_b64 v[73:74], v54 offset:448 | |
v_mul_f32_e32 v81, v47, v78 | |
v_mul_f32_e32 v82, v81, v81 | |
v_mov_b32_e32 v83, 0x3a92b707 | |
v_lshrrev_b32_e32 v76, 7, v63 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v69, v69, v74 | |
v_mov_b32_e32 v74, 0x3c739487 | |
v_mul_f32_e64 v68, v68, -v73 | |
v_madak_f32_e32 v73, v83, v82, 0x3ded3cb2 | |
v_madak_f32_e32 v74, v74, v82, 0x3f01e2bc | |
v_mad_f32 v73, v73, v82, 1.0 | |
v_mac_f32_e32 v73, v81, v74 | |
v_mov_b32_e32 v74, 0xb2951928 | |
v_rcp_f32_e32 v73, v73 | |
v_madak_f32_e32 v74, v74, v82, 0xb85ffb93 | |
v_mov_b32_e32 v83, 0x35c55945 | |
v_madak_f32_e32 v83, v83, v82, 0x3a83ca0c | |
v_madak_f32_e32 v74, v74, v82, 0xbc9ded90 | |
v_madak_f32_e32 v83, v83, v82, 0x3d8eaf3b | |
v_madak_f32_e32 v74, v74, v82, 0xbf409397 | |
v_mac_f32_e32 v74, v81, v83 | |
v_mul_f32_e32 v73, v51, v73 | |
v_mul_f32_e32 v73, v74, v73 | |
v_rsq_f32_e32 v74, v78 | |
v_and_b32_e32 v76, 1, v76 | |
v_cmp_eq_u32_e32 vcc, 1, v76 | |
v_cndmask_b32_e64 v76, 0, 1.0, vcc | |
v_mul_f32_e32 v78, v74, v74 | |
v_mul_f32_e32 v81, v76, v78 | |
v_mac_f32_e32 v73, v74, v81 | |
v_mul_f32_e32 v74, v78, v78 | |
v_mul_f32_e32 v74, v76, v74 | |
v_mul_f32_e32 v74, v78, v74 | |
v_mac_f32_e32 v68, v74, v69 | |
v_mul_f32_e32 v69, v78, v74 | |
v_mul_f32_e32 v68, v68, v69 | |
v_mac_f32_e32 v68, v73, v79 | |
v_mad_f32 v80, -v68, v72, v80 | |
v_mad_f32 v10, v72, v68, v10 | |
v_mad_f32 v77, -v68, v71, v77 | |
v_mad_f32 v9, v71, v68, v9 | |
v_mad_f32 v75, -v68, v70, v75 | |
v_mad_f32 v8, v70, v68, v8 | |
v_mac_f32_e32 v66, v0, v65 | |
BB4_42: ; %Flow1179 | |
; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
BB4_43: ; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[14:15] | |
s_mov_b32 m0, -1 | |
v_cmp_gt_i32_e32 vcc, 3, v2 | |
ds_write_b32 v5, v75 | |
ds_write_b32 v6, v77 | |
ds_write_b32 v7, v80 | |
s_and_saveexec_b64 s[14:15], vcc | |
s_xor_b64 s[18:19], exec, s[14:15] | |
s_waitcnt lgkmcnt(0) | |
; mask branch BB4_49 | |
s_cbranch_execz BB4_49 | |
BB4_44: ; in Loop: Header=BB4_7 Depth=1 | |
v_lshlrev_b32_e32 v68, 6, v2 | |
v_add_i32_e32 v65, vcc, v0, v68 | |
v_lshlrev_b32_e32 v65, 2, v65 | |
v_add_i32_e32 v69, vcc, s4, v65 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v65, v69 | |
v_add_i32_e32 v70, vcc, 8, v0 | |
v_or_b32_e32 v71, 1, v0 | |
v_cmp_lt_i32_e32 vcc, v71, v70 | |
s_and_saveexec_b64 s[14:15], vcc | |
s_xor_b64 s[14:15], exec, s[14:15] | |
s_waitcnt lgkmcnt(0) | |
; mask branch BB4_46 | |
s_cbranch_execz BB4_46 | |
BB4_45: ; in Loop: Header=BB4_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b32 v[70:71], v69 offset0:1 offset1:2 | |
v_or_b32_e32 v74, 3, v0 | |
v_add_i32_e32 v68, vcc, v74, v68 | |
v_lshlrev_b32_e32 v68, 2, v68 | |
ds_read2_b32 v[72:73], v69 offset0:3 offset1:4 | |
v_add_i32_e32 v68, vcc, s4, v68 | |
ds_read_b32 v75, v69 offset:28 | |
ds_read2_b32 v[68:69], v68 offset0:2 offset1:3 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v65, v65, v70 | |
v_add_f32_e32 v65, v71, v65 | |
v_add_f32_e32 v65, v72, v65 | |
v_add_f32_e32 v65, v73, v65 | |
v_add_f32_e32 v65, v68, v65 | |
v_add_f32_e32 v65, v69, v65 | |
v_add_f32_e32 v65, v75, v65 | |
BB4_46: ; %._crit_edge.i118 | |
; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[14:15] | |
v_mul_lo_i32 v64, v64, 3 | |
v_mov_b32_e32 v72, s13 | |
s_mov_b64 s[14:15], s[30:31] | |
v_add_i32_e32 v68, vcc, v64, v2 | |
v_ashrrev_i32_e32 v69, 31, v68 | |
v_lshl_b64 v[70:71], v[68:69], 2 | |
v_add_i32_e32 v68, vcc, s12, v70 | |
v_addc_u32_e32 v69, vcc, v71, v72, vcc | |
buffer_load_dword v71, v[70:71], s[12:15], 0 addr64 | |
s_mov_b64 s[14:15], 0 | |
s_waitcnt vmcnt(0) | |
BB4_47: ; Parent Loop BB4_7 Depth=1 | |
; => This Inner Loop Header: Depth=2 | |
v_add_f32_e32 v70, v65, v71 | |
v_mov_b32_e32 v73, v71 | |
v_mov_b32_e32 v72, v70 | |
buffer_atomic_cmpswap v[72:73], v[68:69], s[28:31], 0 addr64 glc | |
v_mov_b32_e32 v64, -1 | |
v_mov_b32_e32 v64, 0xf000 | |
s_waitcnt vmcnt(0) expcnt(0) | |
v_cmp_eq_u32_e32 vcc, v72, v71 | |
s_or_b64 s[14:15], vcc, s[14:15] | |
v_mov_b32_e32 v71, v72 | |
s_andn2_b64 exec, exec, s[14:15] | |
s_cbranch_execnz BB4_47 | |
; BB#48: ; %Flow1177 | |
; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[14:15] | |
BB4_49: ; %Flow1178 | |
; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
BB4_50: ; %Flow1187 | |
; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[34:35] | |
v_and_b32_e32 v64, 0xff00, v67 | |
v_cmp_ne_u32_e32 vcc, 0, v64 | |
s_and_saveexec_b64 s[14:15], vcc | |
s_xor_b64 s[14:15], exec, s[14:15] | |
; mask branch BB4_90 | |
s_cbranch_execz BB4_90 | |
BB4_51: ; in Loop: Header=BB4_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v74, v52 offset:4 | |
s_mov_b64 s[36:37], s[16:17] | |
s_mov_b64 s[38:39], s[30:31] | |
s_waitcnt lgkmcnt(0) | |
v_lshlrev_b32_e32 v64, 3, v74 | |
v_add_i32_e32 v64, vcc, v64, v1 | |
v_ashrrev_i32_e32 v65, 31, v64 | |
v_lshl_b64 v[68:69], v[64:65], 4 | |
v_lshl_b64 v[75:76], v[64:65], 3 | |
buffer_load_dwordx4 v[70:73], v[68:69], s[36:39], 0 addr64 | |
s_mov_b64 s[36:37], s[20:21] | |
buffer_load_dwordx2 v[68:69], v[75:76], s[36:39], 0 addr64 | |
v_lshrrev_b32_e32 v65, 8, v67 | |
v_mov_b32_e32 v75, 0 | |
v_and_b32_e32 v65, 1, v65 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
v_mov_b32_e32 v77, v75 | |
v_mov_b32_e32 v80, v75 | |
s_and_saveexec_b64 s[18:19], vcc | |
s_xor_b64 s[18:19], exec, s[18:19] | |
; implicit-def: %VGPR81_VGPR82_VGPR83_VGPR84 | |
s_waitcnt vmcnt(0) | |
; mask branch BB4_55 | |
s_cbranch_execz BB4_55 | |
BB4_52: ; in Loop: Header=BB4_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset1:1 | |
v_cmp_ne_u32_e32 vcc, v40, v74 | |
s_and_b64 s[22:23], exec, s[2:3] | |
s_or_b64 s[22:23], s[22:23], vcc | |
s_and_b64 s[34:35], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v78, v71, v86 | |
s_or_b64 s[22:23], s[34:35], s[22:23] | |
v_subrev_f32_e32 v79, v70, v85 | |
v_mul_f32_e32 v81, v78, v78 | |
v_cndmask_b32_e64 v75, 0, 1.0, s[22:23] | |
v_subrev_f32_e32 v76, v72, v87 | |
v_mac_f32_e32 v81, v79, v79 | |
v_mac_f32_e32 v81, v76, v76 | |
v_mul_f32_e32 v75, s5, v75 | |
v_cmp_lt_f32_e32 vcc, v81, v75 | |
v_mov_b32_e32 v75, 0 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_mov_b32_e32 v77, v75 | |
v_mov_b32_e32 v80, v75 | |
s_and_saveexec_b64 s[22:23], vcc | |
s_xor_b64 s[22:23], exec, s[22:23] | |
; implicit-def: %VGPR81_VGPR82_VGPR83_VGPR84 | |
; mask branch BB4_54 | |
s_cbranch_execz BB4_54 | |
BB4_53: ; in Loop: Header=BB4_7 Depth=1 | |
v_max_f32_e32 v77, 0x34cd15ae, v81 | |
v_rsq_f32_e32 v81, v77 | |
v_mul_f32_e32 v77, v47, v77 | |
v_mul_f32_e32 v82, v77, v77 | |
v_mov_b32_e32 v83, 0x3a92b707 | |
v_madak_f32_e32 v83, v83, v82, 0x3ded3cb2 | |
v_mov_b32_e32 v84, 0x3c739487 | |
v_madak_f32_e32 v84, v84, v82, 0x3f01e2bc | |
v_mad_f32 v83, v83, v82, 1.0 | |
v_mac_f32_e32 v83, v77, v84 | |
v_mov_b32_e32 v84, 0xb2951928 | |
v_madak_f32_e32 v84, v84, v82, 0xb85ffb93 | |
v_mov_b32_e32 v85, 0x35c55945 | |
v_madak_f32_e32 v85, v85, v82, 0x3a83ca0c | |
v_madak_f32_e32 v84, v84, v82, 0xbc9ded90 | |
v_madak_f32_e32 v85, v85, v82, 0x3d8eaf3b | |
v_madak_f32_e32 v82, v84, v82, 0xbf409397 | |
v_mac_f32_e32 v82, v77, v85 | |
v_rcp_f32_e32 v77, v83 | |
v_lshrrev_b32_e32 v80, 8, v63 | |
v_and_b32_e32 v80, 1, v80 | |
v_cmp_eq_u32_e32 vcc, 1, v80 | |
v_mul_f32_e32 v77, v51, v77 | |
v_mul_f32_e32 v77, v82, v77 | |
v_cndmask_b32_e64 v80, 0, 1.0, vcc | |
v_mul_f32_e32 v82, v81, v81 | |
v_mul_f32_e32 v83, v80, v82 | |
v_mac_f32_e32 v77, v81, v83 | |
v_mul_f32_e32 v81, v82, v82 | |
s_mov_b32 m0, -1 | |
v_mul_f32_e32 v83, v80, v81 | |
ds_read_b64 v[80:81], v54 | |
v_mul_f32_e32 v83, v82, v83 | |
v_mul_f32_e32 v75, v73, v88 | |
v_mac_f32_e32 v46, v0, v65 | |
v_mul_f32_e64 v84, v65, -v0 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v81, v69, v81 | |
v_mul_f32_e64 v80, v68, -v80 | |
v_mac_f32_e32 v80, v83, v81 | |
v_mul_f32_e32 v81, v82, v83 | |
v_mul_f32_e32 v81, v80, v81 | |
v_mac_f32_e32 v81, v77, v75 | |
v_mad_f32 v45, v76, v81, v45 | |
v_mad_f32 v44, v78, v81, v44 | |
v_mad_f32 v43, v79, v81, v43 | |
v_mul_f32_e64 v80, v81, -v76 | |
v_mul_f32_e64 v77, v81, -v78 | |
v_mul_f32_e64 v75, v81, -v79 | |
BB4_54: ; %Flow1175 | |
; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[22:23] | |
BB4_55: ; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
v_lshrrev_b32_e32 v65, 9, v67 | |
v_and_b32_e32 v65, 1, v65 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
s_and_saveexec_b64 s[18:19], vcc | |
s_xor_b64 s[18:19], exec, s[18:19] | |
; mask branch BB4_59 | |
s_cbranch_execz BB4_59 | |
BB4_56: ; in Loop: Header=BB4_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset0:16 offset1:17 | |
v_cmp_ne_u32_e32 vcc, v62, v74 | |
s_and_b64 s[22:23], exec, s[2:3] | |
s_or_b64 s[22:23], s[22:23], vcc | |
s_and_b64 s[34:35], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v78, v71, v86 | |
s_or_b64 s[22:23], s[34:35], s[22:23] | |
v_subrev_f32_e32 v76, v70, v85 | |
v_mul_f32_e32 v81, v78, v78 | |
v_cndmask_b32_e64 v82, 0, 1.0, s[22:23] | |
v_subrev_f32_e32 v79, v72, v87 | |
v_mac_f32_e32 v81, v76, v76 | |
v_mac_f32_e32 v81, v79, v79 | |
v_mul_f32_e32 v82, s5, v82 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_cmp_lt_f32_e32 vcc, v81, v82 | |
s_and_saveexec_b64 s[22:23], vcc | |
s_xor_b64 s[22:23], exec, s[22:23] | |
; mask branch BB4_58 | |
s_cbranch_execz BB4_58 | |
BB4_57: ; in Loop: Header=BB4_7 Depth=1 | |
v_max_f32_e32 v81, 0x34cd15ae, v81 | |
v_mul_f32_e32 v86, v47, v81 | |
v_mul_f32_e32 v83, v73, v88 | |
v_mul_f32_e32 v88, v86, v86 | |
v_mov_b32_e32 v89, 0x3a92b707 | |
v_madak_f32_e32 v89, v89, v88, 0x3ded3cb2 | |
v_mov_b32_e32 v90, 0x3c739487 | |
v_madak_f32_e32 v90, v90, v88, 0x3f01e2bc | |
v_mad_f32 v89, v89, v88, 1.0 | |
v_mac_f32_e32 v89, v86, v90 | |
v_lshrrev_b32_e32 v82, 9, v63 | |
v_mov_b32_e32 v90, 0xb2951928 | |
v_rsq_f32_e32 v85, v81 | |
v_madak_f32_e32 v90, v90, v88, 0xb85ffb93 | |
v_mov_b32_e32 v91, 0x35c55945 | |
v_rcp_f32_e32 v89, v89 | |
v_and_b32_e32 v82, 1, v82 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e32 vcc, 1, v82 | |
ds_read_b64 v[81:82], v54 offset:64 | |
v_madak_f32_e32 v91, v91, v88, 0x3a83ca0c | |
v_madak_f32_e32 v90, v90, v88, 0xbc9ded90 | |
v_madak_f32_e32 v91, v91, v88, 0x3d8eaf3b | |
v_madak_f32_e32 v88, v90, v88, 0xbf409397 | |
v_mac_f32_e32 v88, v86, v91 | |
v_mul_f32_e32 v86, v85, v85 | |
v_mul_f32_e32 v89, v51, v89 | |
v_cndmask_b32_e64 v87, 0, 1.0, vcc | |
v_mul_f32_e32 v90, v86, v86 | |
v_mul_f32_e32 v90, v87, v90 | |
v_mul_f32_e32 v87, v87, v86 | |
v_mul_f32_e32 v88, v88, v89 | |
v_mac_f32_e32 v88, v85, v87 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v82, v69, v82 | |
v_mul_f32_e64 v81, v68, -v81 | |
v_mul_f32_e32 v85, v86, v90 | |
v_mac_f32_e32 v81, v85, v82 | |
v_mul_f32_e32 v82, v86, v85 | |
v_mul_f32_e32 v81, v81, v82 | |
v_mac_f32_e32 v81, v88, v83 | |
v_mad_f32 v80, -v81, v79, v80 | |
v_mad_f32 v37, v79, v81, v37 | |
v_mad_f32 v77, -v81, v78, v77 | |
v_mad_f32 v36, v78, v81, v36 | |
v_mad_f32 v75, -v81, v76, v75 | |
v_mad_f32 v35, v76, v81, v35 | |
v_mad_f32 v84, -v65, v0, v84 | |
v_mac_f32_e32 v38, v0, v65 | |
BB4_58: ; %Flow1174 | |
; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[22:23] | |
BB4_59: ; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
v_lshrrev_b32_e32 v65, 10, v67 | |
v_and_b32_e32 v65, 1, v65 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
s_and_saveexec_b64 s[18:19], vcc | |
s_xor_b64 s[18:19], exec, s[18:19] | |
; mask branch BB4_63 | |
s_cbranch_execz BB4_63 | |
BB4_60: ; in Loop: Header=BB4_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset0:32 offset1:33 | |
v_cmp_ne_u32_e32 vcc, v61, v74 | |
s_and_b64 s[22:23], exec, s[2:3] | |
s_or_b64 s[22:23], s[22:23], vcc | |
s_and_b64 s[34:35], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v78, v71, v86 | |
s_or_b64 s[22:23], s[34:35], s[22:23] | |
v_subrev_f32_e32 v76, v70, v85 | |
v_mul_f32_e32 v81, v78, v78 | |
v_cndmask_b32_e64 v82, 0, 1.0, s[22:23] | |
v_subrev_f32_e32 v79, v72, v87 | |
v_mac_f32_e32 v81, v76, v76 | |
v_mac_f32_e32 v81, v79, v79 | |
v_mul_f32_e32 v82, s5, v82 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_cmp_lt_f32_e32 vcc, v81, v82 | |
s_and_saveexec_b64 s[22:23], vcc | |
s_xor_b64 s[22:23], exec, s[22:23] | |
; mask branch BB4_62 | |
s_cbranch_execz BB4_62 | |
BB4_61: ; in Loop: Header=BB4_7 Depth=1 | |
v_max_f32_e32 v81, 0x34cd15ae, v81 | |
v_mul_f32_e32 v86, v47, v81 | |
v_mul_f32_e32 v83, v73, v88 | |
v_mul_f32_e32 v88, v86, v86 | |
v_mov_b32_e32 v89, 0x3a92b707 | |
v_madak_f32_e32 v89, v89, v88, 0x3ded3cb2 | |
v_mov_b32_e32 v90, 0x3c739487 | |
v_madak_f32_e32 v90, v90, v88, 0x3f01e2bc | |
v_mad_f32 v89, v89, v88, 1.0 | |
v_mac_f32_e32 v89, v86, v90 | |
v_lshrrev_b32_e32 v82, 10, v63 | |
v_mov_b32_e32 v90, 0xb2951928 | |
v_rsq_f32_e32 v85, v81 | |
v_madak_f32_e32 v90, v90, v88, 0xb85ffb93 | |
v_mov_b32_e32 v91, 0x35c55945 | |
v_rcp_f32_e32 v89, v89 | |
v_and_b32_e32 v82, 1, v82 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e32 vcc, 1, v82 | |
ds_read_b64 v[81:82], v54 offset:128 | |
v_madak_f32_e32 v91, v91, v88, 0x3a83ca0c | |
v_madak_f32_e32 v90, v90, v88, 0xbc9ded90 | |
v_madak_f32_e32 v91, v91, v88, 0x3d8eaf3b | |
v_madak_f32_e32 v88, v90, v88, 0xbf409397 | |
v_mac_f32_e32 v88, v86, v91 | |
v_mul_f32_e32 v86, v85, v85 | |
v_mul_f32_e32 v89, v51, v89 | |
v_cndmask_b32_e64 v87, 0, 1.0, vcc | |
v_mul_f32_e32 v90, v86, v86 | |
v_mul_f32_e32 v90, v87, v90 | |
v_mul_f32_e32 v87, v87, v86 | |
v_mul_f32_e32 v88, v88, v89 | |
v_mac_f32_e32 v88, v85, v87 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v82, v69, v82 | |
v_mul_f32_e64 v81, v68, -v81 | |
v_mul_f32_e32 v85, v86, v90 | |
v_mac_f32_e32 v81, v85, v82 | |
v_mul_f32_e32 v82, v86, v85 | |
v_mul_f32_e32 v81, v81, v82 | |
v_mac_f32_e32 v81, v88, v83 | |
v_mad_f32 v80, -v81, v79, v80 | |
v_mad_f32 v29, v79, v81, v29 | |
v_mad_f32 v77, -v81, v78, v77 | |
v_mad_f32 v28, v78, v81, v28 | |
v_mad_f32 v75, -v81, v76, v75 | |
v_mad_f32 v27, v76, v81, v27 | |
v_mad_f32 v84, -v65, v0, v84 | |
v_mac_f32_e32 v30, v0, v65 | |
BB4_62: ; %Flow1173 | |
; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[22:23] | |
BB4_63: ; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
v_lshrrev_b32_e32 v65, 11, v67 | |
v_and_b32_e32 v65, 1, v65 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
s_and_saveexec_b64 s[18:19], vcc | |
s_xor_b64 s[18:19], exec, s[18:19] | |
; mask branch BB4_67 | |
s_cbranch_execz BB4_67 | |
BB4_64: ; in Loop: Header=BB4_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset0:48 offset1:49 | |
v_cmp_ne_u32_e32 vcc, v60, v74 | |
s_and_b64 s[22:23], exec, s[2:3] | |
s_or_b64 s[22:23], s[22:23], vcc | |
s_and_b64 s[34:35], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v78, v71, v86 | |
s_or_b64 s[22:23], s[34:35], s[22:23] | |
v_subrev_f32_e32 v76, v70, v85 | |
v_mul_f32_e32 v81, v78, v78 | |
v_cndmask_b32_e64 v82, 0, 1.0, s[22:23] | |
v_subrev_f32_e32 v79, v72, v87 | |
v_mac_f32_e32 v81, v76, v76 | |
v_mac_f32_e32 v81, v79, v79 | |
v_mul_f32_e32 v82, s5, v82 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_cmp_lt_f32_e32 vcc, v81, v82 | |
s_and_saveexec_b64 s[22:23], vcc | |
s_xor_b64 s[22:23], exec, s[22:23] | |
; mask branch BB4_66 | |
s_cbranch_execz BB4_66 | |
BB4_65: ; in Loop: Header=BB4_7 Depth=1 | |
v_max_f32_e32 v81, 0x34cd15ae, v81 | |
v_mul_f32_e32 v86, v47, v81 | |
v_mul_f32_e32 v83, v73, v88 | |
v_mul_f32_e32 v88, v86, v86 | |
v_mov_b32_e32 v89, 0x3a92b707 | |
v_madak_f32_e32 v89, v89, v88, 0x3ded3cb2 | |
v_mov_b32_e32 v90, 0x3c739487 | |
v_madak_f32_e32 v90, v90, v88, 0x3f01e2bc | |
v_mad_f32 v89, v89, v88, 1.0 | |
v_mac_f32_e32 v89, v86, v90 | |
v_lshrrev_b32_e32 v82, 11, v63 | |
v_mov_b32_e32 v90, 0xb2951928 | |
v_rsq_f32_e32 v85, v81 | |
v_madak_f32_e32 v90, v90, v88, 0xb85ffb93 | |
v_mov_b32_e32 v91, 0x35c55945 | |
v_rcp_f32_e32 v89, v89 | |
v_and_b32_e32 v82, 1, v82 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e32 vcc, 1, v82 | |
ds_read_b64 v[81:82], v54 offset:192 | |
v_madak_f32_e32 v91, v91, v88, 0x3a83ca0c | |
v_madak_f32_e32 v90, v90, v88, 0xbc9ded90 | |
v_madak_f32_e32 v91, v91, v88, 0x3d8eaf3b | |
v_madak_f32_e32 v88, v90, v88, 0xbf409397 | |
v_mac_f32_e32 v88, v86, v91 | |
v_mul_f32_e32 v86, v85, v85 | |
v_mul_f32_e32 v89, v51, v89 | |
v_cndmask_b32_e64 v87, 0, 1.0, vcc | |
v_mul_f32_e32 v90, v86, v86 | |
v_mul_f32_e32 v90, v87, v90 | |
v_mul_f32_e32 v87, v87, v86 | |
v_mul_f32_e32 v88, v88, v89 | |
v_mac_f32_e32 v88, v85, v87 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v82, v69, v82 | |
v_mul_f32_e64 v81, v68, -v81 | |
v_mul_f32_e32 v85, v86, v90 | |
v_mac_f32_e32 v81, v85, v82 | |
v_mul_f32_e32 v82, v86, v85 | |
v_mul_f32_e32 v81, v81, v82 | |
v_mac_f32_e32 v81, v88, v83 | |
v_mad_f32 v80, -v81, v79, v80 | |
v_mad_f32 v25, v79, v81, v25 | |
v_mad_f32 v77, -v81, v78, v77 | |
v_mad_f32 v24, v78, v81, v24 | |
v_mad_f32 v75, -v81, v76, v75 | |
v_mad_f32 v23, v76, v81, v23 | |
v_mad_f32 v84, -v65, v0, v84 | |
v_mac_f32_e32 v26, v0, v65 | |
BB4_66: ; %Flow1172 | |
; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[22:23] | |
BB4_67: ; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
v_lshrrev_b32_e32 v65, 12, v67 | |
v_and_b32_e32 v65, 1, v65 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
s_and_saveexec_b64 s[18:19], vcc | |
s_xor_b64 s[18:19], exec, s[18:19] | |
; mask branch BB4_71 | |
s_cbranch_execz BB4_71 | |
BB4_68: ; in Loop: Header=BB4_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset0:64 offset1:65 | |
v_cmp_ne_u32_e32 vcc, v59, v74 | |
s_and_b64 s[22:23], exec, s[2:3] | |
s_or_b64 s[22:23], s[22:23], vcc | |
s_and_b64 s[34:35], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v78, v71, v86 | |
s_or_b64 s[22:23], s[34:35], s[22:23] | |
v_subrev_f32_e32 v76, v70, v85 | |
v_mul_f32_e32 v81, v78, v78 | |
v_cndmask_b32_e64 v82, 0, 1.0, s[22:23] | |
v_subrev_f32_e32 v79, v72, v87 | |
v_mac_f32_e32 v81, v76, v76 | |
v_mac_f32_e32 v81, v79, v79 | |
v_mul_f32_e32 v82, s5, v82 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_cmp_lt_f32_e32 vcc, v81, v82 | |
s_and_saveexec_b64 s[22:23], vcc | |
s_xor_b64 s[22:23], exec, s[22:23] | |
; mask branch BB4_70 | |
s_cbranch_execz BB4_70 | |
BB4_69: ; in Loop: Header=BB4_7 Depth=1 | |
v_max_f32_e32 v81, 0x34cd15ae, v81 | |
v_mul_f32_e32 v86, v47, v81 | |
v_mul_f32_e32 v83, v73, v88 | |
v_mul_f32_e32 v88, v86, v86 | |
v_mov_b32_e32 v89, 0x3a92b707 | |
v_madak_f32_e32 v89, v89, v88, 0x3ded3cb2 | |
v_mov_b32_e32 v90, 0x3c739487 | |
v_madak_f32_e32 v90, v90, v88, 0x3f01e2bc | |
v_mad_f32 v89, v89, v88, 1.0 | |
v_mac_f32_e32 v89, v86, v90 | |
v_lshrrev_b32_e32 v82, 12, v63 | |
v_mov_b32_e32 v90, 0xb2951928 | |
v_rsq_f32_e32 v85, v81 | |
v_madak_f32_e32 v90, v90, v88, 0xb85ffb93 | |
v_mov_b32_e32 v91, 0x35c55945 | |
v_rcp_f32_e32 v89, v89 | |
v_and_b32_e32 v82, 1, v82 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e32 vcc, 1, v82 | |
ds_read_b64 v[81:82], v54 offset:256 | |
v_madak_f32_e32 v91, v91, v88, 0x3a83ca0c | |
v_madak_f32_e32 v90, v90, v88, 0xbc9ded90 | |
v_madak_f32_e32 v91, v91, v88, 0x3d8eaf3b | |
v_madak_f32_e32 v88, v90, v88, 0xbf409397 | |
v_mac_f32_e32 v88, v86, v91 | |
v_mul_f32_e32 v86, v85, v85 | |
v_mul_f32_e32 v89, v51, v89 | |
v_cndmask_b32_e64 v87, 0, 1.0, vcc | |
v_mul_f32_e32 v90, v86, v86 | |
v_mul_f32_e32 v90, v87, v90 | |
v_mul_f32_e32 v87, v87, v86 | |
v_mul_f32_e32 v88, v88, v89 | |
v_mac_f32_e32 v88, v85, v87 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v82, v69, v82 | |
v_mul_f32_e64 v81, v68, -v81 | |
v_mul_f32_e32 v85, v86, v90 | |
v_mac_f32_e32 v81, v85, v82 | |
v_mul_f32_e32 v82, v86, v85 | |
v_mul_f32_e32 v81, v81, v82 | |
v_mac_f32_e32 v81, v88, v83 | |
v_mad_f32 v80, -v81, v79, v80 | |
v_mad_f32 v21, v79, v81, v21 | |
v_mad_f32 v77, -v81, v78, v77 | |
v_mad_f32 v20, v78, v81, v20 | |
v_mad_f32 v75, -v81, v76, v75 | |
v_mad_f32 v19, v76, v81, v19 | |
v_mad_f32 v84, -v65, v0, v84 | |
v_mac_f32_e32 v22, v0, v65 | |
BB4_70: ; %Flow1171 | |
; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[22:23] | |
BB4_71: ; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
v_lshrrev_b32_e32 v65, 13, v67 | |
v_and_b32_e32 v65, 1, v65 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
s_and_saveexec_b64 s[18:19], vcc | |
s_xor_b64 s[18:19], exec, s[18:19] | |
; mask branch BB4_75 | |
s_cbranch_execz BB4_75 | |
BB4_72: ; in Loop: Header=BB4_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset0:80 offset1:81 | |
v_cmp_ne_u32_e32 vcc, v58, v74 | |
s_and_b64 s[22:23], exec, s[2:3] | |
s_or_b64 s[22:23], s[22:23], vcc | |
s_and_b64 s[34:35], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v78, v71, v86 | |
s_or_b64 s[22:23], s[34:35], s[22:23] | |
v_subrev_f32_e32 v76, v70, v85 | |
v_mul_f32_e32 v81, v78, v78 | |
v_cndmask_b32_e64 v82, 0, 1.0, s[22:23] | |
v_subrev_f32_e32 v79, v72, v87 | |
v_mac_f32_e32 v81, v76, v76 | |
v_mac_f32_e32 v81, v79, v79 | |
v_mul_f32_e32 v82, s5, v82 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_cmp_lt_f32_e32 vcc, v81, v82 | |
s_and_saveexec_b64 s[22:23], vcc | |
s_xor_b64 s[22:23], exec, s[22:23] | |
; mask branch BB4_74 | |
s_cbranch_execz BB4_74 | |
BB4_73: ; in Loop: Header=BB4_7 Depth=1 | |
v_max_f32_e32 v81, 0x34cd15ae, v81 | |
v_mul_f32_e32 v86, v47, v81 | |
v_mul_f32_e32 v83, v73, v88 | |
v_mul_f32_e32 v88, v86, v86 | |
v_mov_b32_e32 v89, 0x3a92b707 | |
v_madak_f32_e32 v89, v89, v88, 0x3ded3cb2 | |
v_mov_b32_e32 v90, 0x3c739487 | |
v_madak_f32_e32 v90, v90, v88, 0x3f01e2bc | |
v_mad_f32 v89, v89, v88, 1.0 | |
v_mac_f32_e32 v89, v86, v90 | |
v_lshrrev_b32_e32 v82, 13, v63 | |
v_mov_b32_e32 v90, 0xb2951928 | |
v_rsq_f32_e32 v85, v81 | |
v_madak_f32_e32 v90, v90, v88, 0xb85ffb93 | |
v_mov_b32_e32 v91, 0x35c55945 | |
v_rcp_f32_e32 v89, v89 | |
v_and_b32_e32 v82, 1, v82 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e32 vcc, 1, v82 | |
ds_read_b64 v[81:82], v54 offset:320 | |
v_madak_f32_e32 v91, v91, v88, 0x3a83ca0c | |
v_madak_f32_e32 v90, v90, v88, 0xbc9ded90 | |
v_madak_f32_e32 v91, v91, v88, 0x3d8eaf3b | |
v_madak_f32_e32 v88, v90, v88, 0xbf409397 | |
v_mac_f32_e32 v88, v86, v91 | |
v_mul_f32_e32 v86, v85, v85 | |
v_mul_f32_e32 v89, v51, v89 | |
v_cndmask_b32_e64 v87, 0, 1.0, vcc | |
v_mul_f32_e32 v90, v86, v86 | |
v_mul_f32_e32 v90, v87, v90 | |
v_mul_f32_e32 v87, v87, v86 | |
v_mul_f32_e32 v88, v88, v89 | |
v_mac_f32_e32 v88, v85, v87 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v82, v69, v82 | |
v_mul_f32_e64 v81, v68, -v81 | |
v_mul_f32_e32 v85, v86, v90 | |
v_mac_f32_e32 v81, v85, v82 | |
v_mul_f32_e32 v82, v86, v85 | |
v_mul_f32_e32 v81, v81, v82 | |
v_mac_f32_e32 v81, v88, v83 | |
v_mad_f32 v80, -v81, v79, v80 | |
v_mad_f32 v17, v79, v81, v17 | |
v_mad_f32 v77, -v81, v78, v77 | |
v_mad_f32 v16, v78, v81, v16 | |
v_mad_f32 v75, -v81, v76, v75 | |
v_mad_f32 v15, v76, v81, v15 | |
v_mad_f32 v84, -v65, v0, v84 | |
v_mac_f32_e32 v18, v0, v65 | |
BB4_74: ; %Flow1170 | |
; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[22:23] | |
BB4_75: ; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
v_lshrrev_b32_e32 v65, 14, v67 | |
v_and_b32_e32 v65, 1, v65 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
s_and_saveexec_b64 s[18:19], vcc | |
s_xor_b64 s[18:19], exec, s[18:19] | |
; mask branch BB4_79 | |
s_cbranch_execz BB4_79 | |
BB4_76: ; in Loop: Header=BB4_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset0:96 offset1:97 | |
v_cmp_ne_u32_e32 vcc, v57, v74 | |
s_and_b64 s[22:23], exec, s[2:3] | |
s_or_b64 s[22:23], s[22:23], vcc | |
s_and_b64 s[34:35], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v78, v71, v86 | |
s_or_b64 s[22:23], s[34:35], s[22:23] | |
v_subrev_f32_e32 v76, v70, v85 | |
v_mul_f32_e32 v81, v78, v78 | |
v_cndmask_b32_e64 v82, 0, 1.0, s[22:23] | |
v_subrev_f32_e32 v79, v72, v87 | |
v_mac_f32_e32 v81, v76, v76 | |
v_mac_f32_e32 v81, v79, v79 | |
v_mul_f32_e32 v82, s5, v82 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_cmp_lt_f32_e32 vcc, v81, v82 | |
s_and_saveexec_b64 s[22:23], vcc | |
s_xor_b64 s[22:23], exec, s[22:23] | |
; mask branch BB4_78 | |
s_cbranch_execz BB4_78 | |
BB4_77: ; in Loop: Header=BB4_7 Depth=1 | |
v_max_f32_e32 v81, 0x34cd15ae, v81 | |
v_mul_f32_e32 v86, v47, v81 | |
v_mul_f32_e32 v83, v73, v88 | |
v_mul_f32_e32 v88, v86, v86 | |
v_mov_b32_e32 v89, 0x3a92b707 | |
v_madak_f32_e32 v89, v89, v88, 0x3ded3cb2 | |
v_mov_b32_e32 v90, 0x3c739487 | |
v_madak_f32_e32 v90, v90, v88, 0x3f01e2bc | |
v_mad_f32 v89, v89, v88, 1.0 | |
v_mac_f32_e32 v89, v86, v90 | |
v_lshrrev_b32_e32 v82, 14, v63 | |
v_mov_b32_e32 v90, 0xb2951928 | |
v_rsq_f32_e32 v85, v81 | |
v_madak_f32_e32 v90, v90, v88, 0xb85ffb93 | |
v_mov_b32_e32 v91, 0x35c55945 | |
v_rcp_f32_e32 v89, v89 | |
v_and_b32_e32 v82, 1, v82 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e32 vcc, 1, v82 | |
ds_read_b64 v[81:82], v54 offset:384 | |
v_madak_f32_e32 v91, v91, v88, 0x3a83ca0c | |
v_madak_f32_e32 v90, v90, v88, 0xbc9ded90 | |
v_madak_f32_e32 v91, v91, v88, 0x3d8eaf3b | |
v_madak_f32_e32 v88, v90, v88, 0xbf409397 | |
v_mac_f32_e32 v88, v86, v91 | |
v_mul_f32_e32 v86, v85, v85 | |
v_mul_f32_e32 v89, v51, v89 | |
v_cndmask_b32_e64 v87, 0, 1.0, vcc | |
v_mul_f32_e32 v90, v86, v86 | |
v_mul_f32_e32 v90, v87, v90 | |
v_mul_f32_e32 v87, v87, v86 | |
v_mul_f32_e32 v88, v88, v89 | |
v_mac_f32_e32 v88, v85, v87 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v82, v69, v82 | |
v_mul_f32_e64 v81, v68, -v81 | |
v_mul_f32_e32 v85, v86, v90 | |
v_mac_f32_e32 v81, v85, v82 | |
v_mul_f32_e32 v82, v86, v85 | |
v_mul_f32_e32 v81, v81, v82 | |
v_mac_f32_e32 v81, v88, v83 | |
v_mad_f32 v80, -v81, v79, v80 | |
v_mad_f32 v13, v79, v81, v13 | |
v_mad_f32 v77, -v81, v78, v77 | |
v_mad_f32 v12, v78, v81, v12 | |
v_mad_f32 v75, -v81, v76, v75 | |
v_mad_f32 v11, v76, v81, v11 | |
v_mad_f32 v84, -v65, v0, v84 | |
v_mac_f32_e32 v14, v0, v65 | |
BB4_78: ; %Flow1169 | |
; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[22:23] | |
BB4_79: ; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
v_lshrrev_b32_e32 v65, 15, v67 | |
v_and_b32_e32 v65, 1, v65 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
s_and_saveexec_b64 s[18:19], vcc | |
s_xor_b64 s[18:19], exec, s[18:19] | |
; mask branch BB4_83 | |
s_cbranch_execz BB4_83 | |
BB4_80: ; in Loop: Header=BB4_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset0:112 offset1:113 | |
v_cmp_ne_u32_e32 vcc, v33, v74 | |
s_and_b64 s[22:23], exec, s[2:3] | |
s_or_b64 s[22:23], s[22:23], vcc | |
s_and_b64 s[34:35], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v71, v71, v86 | |
s_or_b64 s[22:23], s[34:35], s[22:23] | |
v_subrev_f32_e32 v70, v70, v85 | |
v_mul_f32_e32 v74, v71, v71 | |
v_cndmask_b32_e64 v76, 0, 1.0, s[22:23] | |
v_subrev_f32_e32 v72, v72, v87 | |
v_mac_f32_e32 v74, v70, v70 | |
v_mac_f32_e32 v74, v72, v72 | |
v_mul_f32_e32 v76, s5, v76 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_cmp_lt_f32_e32 vcc, v74, v76 | |
s_and_saveexec_b64 s[22:23], vcc | |
s_xor_b64 s[22:23], exec, s[22:23] | |
; mask branch BB4_82 | |
s_cbranch_execz BB4_82 | |
BB4_81: ; in Loop: Header=BB4_7 Depth=1 | |
v_mul_f32_e32 v79, v73, v88 | |
s_mov_b32 m0, -1 | |
v_mad_f32 v73, -v65, v0, v84 | |
v_max_f32_e32 v78, 0x34cd15ae, v74 | |
ds_read_b64 v[73:74], v54 offset:448 | |
v_mul_f32_e32 v81, v47, v78 | |
v_mul_f32_e32 v82, v81, v81 | |
v_mov_b32_e32 v83, 0x3a92b707 | |
v_lshrrev_b32_e32 v76, 15, v63 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v69, v69, v74 | |
v_mov_b32_e32 v74, 0x3c739487 | |
v_mul_f32_e64 v68, v68, -v73 | |
v_madak_f32_e32 v73, v83, v82, 0x3ded3cb2 | |
v_madak_f32_e32 v74, v74, v82, 0x3f01e2bc | |
v_mad_f32 v73, v73, v82, 1.0 | |
v_mac_f32_e32 v73, v81, v74 | |
v_mov_b32_e32 v74, 0xb2951928 | |
v_rcp_f32_e32 v73, v73 | |
v_madak_f32_e32 v74, v74, v82, 0xb85ffb93 | |
v_mov_b32_e32 v83, 0x35c55945 | |
v_madak_f32_e32 v83, v83, v82, 0x3a83ca0c | |
v_madak_f32_e32 v74, v74, v82, 0xbc9ded90 | |
v_madak_f32_e32 v83, v83, v82, 0x3d8eaf3b | |
v_madak_f32_e32 v74, v74, v82, 0xbf409397 | |
v_mac_f32_e32 v74, v81, v83 | |
v_mul_f32_e32 v73, v51, v73 | |
v_mul_f32_e32 v73, v74, v73 | |
v_rsq_f32_e32 v74, v78 | |
v_and_b32_e32 v76, 1, v76 | |
v_cmp_eq_u32_e32 vcc, 1, v76 | |
v_cndmask_b32_e64 v76, 0, 1.0, vcc | |
v_mul_f32_e32 v78, v74, v74 | |
v_mul_f32_e32 v81, v76, v78 | |
v_mac_f32_e32 v73, v74, v81 | |
v_mul_f32_e32 v74, v78, v78 | |
v_mul_f32_e32 v74, v76, v74 | |
v_mul_f32_e32 v74, v78, v74 | |
v_mac_f32_e32 v68, v74, v69 | |
v_mul_f32_e32 v69, v78, v74 | |
v_mul_f32_e32 v68, v68, v69 | |
v_mac_f32_e32 v68, v73, v79 | |
v_mad_f32 v80, -v68, v72, v80 | |
v_mad_f32 v10, v72, v68, v10 | |
v_mad_f32 v77, -v68, v71, v77 | |
v_mad_f32 v9, v71, v68, v9 | |
v_mad_f32 v75, -v68, v70, v75 | |
v_mad_f32 v8, v70, v68, v8 | |
v_mac_f32_e32 v66, v0, v65 | |
BB4_82: ; %Flow1168 | |
; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[22:23] | |
BB4_83: ; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
s_mov_b32 m0, -1 | |
v_cmp_gt_i32_e32 vcc, 3, v2 | |
ds_write_b32 v5, v75 | |
ds_write_b32 v6, v77 | |
ds_write_b32 v7, v80 | |
s_and_saveexec_b64 s[18:19], vcc | |
s_xor_b64 s[18:19], exec, s[18:19] | |
s_waitcnt lgkmcnt(0) | |
; mask branch BB4_89 | |
s_cbranch_execz BB4_89 | |
BB4_84: ; in Loop: Header=BB4_7 Depth=1 | |
v_lshlrev_b32_e32 v68, 6, v2 | |
v_add_i32_e32 v65, vcc, v0, v68 | |
v_lshlrev_b32_e32 v65, 2, v65 | |
v_add_i32_e32 v69, vcc, s4, v65 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v65, v69 | |
v_add_i32_e32 v70, vcc, 8, v0 | |
v_or_b32_e32 v71, 1, v0 | |
v_cmp_lt_i32_e32 vcc, v71, v70 | |
s_and_saveexec_b64 s[22:23], vcc | |
s_xor_b64 s[22:23], exec, s[22:23] | |
s_waitcnt lgkmcnt(0) | |
; mask branch BB4_86 | |
s_cbranch_execz BB4_86 | |
BB4_85: ; in Loop: Header=BB4_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b32 v[70:71], v69 offset0:1 offset1:2 | |
v_or_b32_e32 v74, 3, v0 | |
v_add_i32_e32 v68, vcc, v74, v68 | |
v_lshlrev_b32_e32 v68, 2, v68 | |
ds_read2_b32 v[72:73], v69 offset0:3 offset1:4 | |
v_add_i32_e32 v68, vcc, s4, v68 | |
ds_read_b32 v75, v69 offset:28 | |
ds_read2_b32 v[68:69], v68 offset0:2 offset1:3 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v65, v65, v70 | |
v_add_f32_e32 v65, v71, v65 | |
v_add_f32_e32 v65, v72, v65 | |
v_add_f32_e32 v65, v73, v65 | |
v_add_f32_e32 v65, v68, v65 | |
v_add_f32_e32 v65, v69, v65 | |
v_add_f32_e32 v65, v75, v65 | |
BB4_86: ; %._crit_edge.i72 | |
; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[22:23] | |
v_mul_lo_i32 v64, v64, 3 | |
v_mov_b32_e32 v72, s13 | |
s_mov_b64 s[36:37], s[12:13] | |
s_mov_b64 s[38:39], s[30:31] | |
v_add_i32_e32 v68, vcc, v64, v2 | |
v_ashrrev_i32_e32 v69, 31, v68 | |
v_lshl_b64 v[70:71], v[68:69], 2 | |
v_add_i32_e32 v68, vcc, s12, v70 | |
v_addc_u32_e32 v69, vcc, v71, v72, vcc | |
buffer_load_dword v71, v[70:71], s[36:39], 0 addr64 | |
s_mov_b64 s[22:23], 0 | |
s_waitcnt vmcnt(0) | |
BB4_87: ; Parent Loop BB4_7 Depth=1 | |
; => This Inner Loop Header: Depth=2 | |
v_add_f32_e32 v70, v65, v71 | |
v_mov_b32_e32 v73, v71 | |
v_mov_b32_e32 v72, v70 | |
buffer_atomic_cmpswap v[72:73], v[68:69], s[28:31], 0 addr64 glc | |
v_mov_b32_e32 v64, -1 | |
v_mov_b32_e32 v64, 0xf000 | |
s_waitcnt vmcnt(0) expcnt(0) | |
v_cmp_eq_u32_e32 vcc, v72, v71 | |
s_or_b64 s[22:23], vcc, s[22:23] | |
v_mov_b32_e32 v71, v72 | |
s_andn2_b64 exec, exec, s[22:23] | |
s_cbranch_execnz BB4_87 | |
; BB#88: ; %Flow1166 | |
; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[22:23] | |
BB4_89: ; %Flow1167 | |
; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
BB4_90: ; %Flow1176 | |
; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[14:15] | |
v_and_b32_e32 v64, 0xff0000, v67 | |
v_cmp_ne_u32_e32 vcc, 0, v64 | |
s_and_saveexec_b64 s[14:15], vcc | |
s_xor_b64 s[14:15], exec, s[14:15] | |
; mask branch BB4_130 | |
s_cbranch_execz BB4_130 | |
BB4_91: ; in Loop: Header=BB4_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v74, v52 offset:8 | |
s_mov_b64 s[36:37], s[16:17] | |
s_mov_b64 s[38:39], s[30:31] | |
s_waitcnt lgkmcnt(0) | |
v_lshlrev_b32_e32 v64, 3, v74 | |
v_add_i32_e32 v64, vcc, v64, v1 | |
v_ashrrev_i32_e32 v65, 31, v64 | |
v_lshl_b64 v[68:69], v[64:65], 4 | |
v_lshl_b64 v[75:76], v[64:65], 3 | |
buffer_load_dwordx4 v[70:73], v[68:69], s[36:39], 0 addr64 | |
s_mov_b64 s[36:37], s[20:21] | |
buffer_load_dwordx2 v[68:69], v[75:76], s[36:39], 0 addr64 | |
v_lshrrev_b32_e32 v65, 16, v67 | |
v_mov_b32_e32 v75, 0 | |
v_and_b32_e32 v65, 1, v65 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
v_mov_b32_e32 v77, v75 | |
v_mov_b32_e32 v80, v75 | |
s_and_saveexec_b64 s[18:19], vcc | |
s_xor_b64 s[18:19], exec, s[18:19] | |
; implicit-def: %VGPR81_VGPR82_VGPR83_VGPR84 | |
s_waitcnt vmcnt(0) | |
; mask branch BB4_95 | |
s_cbranch_execz BB4_95 | |
BB4_92: ; in Loop: Header=BB4_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset1:1 | |
v_cmp_ne_u32_e32 vcc, v40, v74 | |
s_and_b64 s[22:23], exec, s[2:3] | |
s_or_b64 s[22:23], s[22:23], vcc | |
s_and_b64 s[34:35], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v78, v71, v86 | |
s_or_b64 s[22:23], s[34:35], s[22:23] | |
v_subrev_f32_e32 v79, v70, v85 | |
v_mul_f32_e32 v81, v78, v78 | |
v_cndmask_b32_e64 v75, 0, 1.0, s[22:23] | |
v_subrev_f32_e32 v76, v72, v87 | |
v_mac_f32_e32 v81, v79, v79 | |
v_mac_f32_e32 v81, v76, v76 | |
v_mul_f32_e32 v75, s5, v75 | |
v_cmp_lt_f32_e32 vcc, v81, v75 | |
v_mov_b32_e32 v75, 0 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_mov_b32_e32 v77, v75 | |
v_mov_b32_e32 v80, v75 | |
s_and_saveexec_b64 s[22:23], vcc | |
s_xor_b64 s[22:23], exec, s[22:23] | |
; implicit-def: %VGPR81_VGPR82_VGPR83_VGPR84 | |
; mask branch BB4_94 | |
s_cbranch_execz BB4_94 | |
BB4_93: ; in Loop: Header=BB4_7 Depth=1 | |
v_max_f32_e32 v77, 0x34cd15ae, v81 | |
v_rsq_f32_e32 v81, v77 | |
v_mul_f32_e32 v77, v47, v77 | |
v_mul_f32_e32 v82, v77, v77 | |
v_mov_b32_e32 v83, 0x3a92b707 | |
v_madak_f32_e32 v83, v83, v82, 0x3ded3cb2 | |
v_mov_b32_e32 v84, 0x3c739487 | |
v_madak_f32_e32 v84, v84, v82, 0x3f01e2bc | |
v_mad_f32 v83, v83, v82, 1.0 | |
v_mac_f32_e32 v83, v77, v84 | |
v_mov_b32_e32 v84, 0xb2951928 | |
v_madak_f32_e32 v84, v84, v82, 0xb85ffb93 | |
v_mov_b32_e32 v85, 0x35c55945 | |
v_madak_f32_e32 v85, v85, v82, 0x3a83ca0c | |
v_madak_f32_e32 v84, v84, v82, 0xbc9ded90 | |
v_madak_f32_e32 v85, v85, v82, 0x3d8eaf3b | |
v_madak_f32_e32 v82, v84, v82, 0xbf409397 | |
v_mac_f32_e32 v82, v77, v85 | |
v_rcp_f32_e32 v77, v83 | |
v_lshrrev_b32_e32 v80, 16, v63 | |
v_and_b32_e32 v80, 1, v80 | |
v_cmp_eq_u32_e32 vcc, 1, v80 | |
v_mul_f32_e32 v77, v51, v77 | |
v_mul_f32_e32 v77, v82, v77 | |
v_cndmask_b32_e64 v80, 0, 1.0, vcc | |
v_mul_f32_e32 v82, v81, v81 | |
v_mul_f32_e32 v83, v80, v82 | |
v_mac_f32_e32 v77, v81, v83 | |
v_mul_f32_e32 v81, v82, v82 | |
s_mov_b32 m0, -1 | |
v_mul_f32_e32 v83, v80, v81 | |
ds_read_b64 v[80:81], v54 | |
v_mul_f32_e32 v83, v82, v83 | |
v_mul_f32_e32 v75, v73, v88 | |
v_mac_f32_e32 v46, v0, v65 | |
v_mul_f32_e64 v84, v65, -v0 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v81, v69, v81 | |
v_mul_f32_e64 v80, v68, -v80 | |
v_mac_f32_e32 v80, v83, v81 | |
v_mul_f32_e32 v81, v82, v83 | |
v_mul_f32_e32 v81, v80, v81 | |
v_mac_f32_e32 v81, v77, v75 | |
v_mad_f32 v45, v76, v81, v45 | |
v_mad_f32 v44, v78, v81, v44 | |
v_mad_f32 v43, v79, v81, v43 | |
v_mul_f32_e64 v80, v81, -v76 | |
v_mul_f32_e64 v77, v81, -v78 | |
v_mul_f32_e64 v75, v81, -v79 | |
BB4_94: ; %Flow1164 | |
; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[22:23] | |
BB4_95: ; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
v_lshrrev_b32_e32 v65, 17, v67 | |
v_and_b32_e32 v65, 1, v65 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
s_and_saveexec_b64 s[18:19], vcc | |
s_xor_b64 s[18:19], exec, s[18:19] | |
; mask branch BB4_99 | |
s_cbranch_execz BB4_99 | |
BB4_96: ; in Loop: Header=BB4_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset0:16 offset1:17 | |
v_cmp_ne_u32_e32 vcc, v62, v74 | |
s_and_b64 s[22:23], exec, s[2:3] | |
s_or_b64 s[22:23], s[22:23], vcc | |
s_and_b64 s[34:35], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v78, v71, v86 | |
s_or_b64 s[22:23], s[34:35], s[22:23] | |
v_subrev_f32_e32 v76, v70, v85 | |
v_mul_f32_e32 v81, v78, v78 | |
v_cndmask_b32_e64 v82, 0, 1.0, s[22:23] | |
v_subrev_f32_e32 v79, v72, v87 | |
v_mac_f32_e32 v81, v76, v76 | |
v_mac_f32_e32 v81, v79, v79 | |
v_mul_f32_e32 v82, s5, v82 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_cmp_lt_f32_e32 vcc, v81, v82 | |
s_and_saveexec_b64 s[22:23], vcc | |
s_xor_b64 s[22:23], exec, s[22:23] | |
; mask branch BB4_98 | |
s_cbranch_execz BB4_98 | |
BB4_97: ; in Loop: Header=BB4_7 Depth=1 | |
v_max_f32_e32 v81, 0x34cd15ae, v81 | |
v_mul_f32_e32 v86, v47, v81 | |
v_mul_f32_e32 v83, v73, v88 | |
v_mul_f32_e32 v88, v86, v86 | |
v_mov_b32_e32 v89, 0x3a92b707 | |
v_madak_f32_e32 v89, v89, v88, 0x3ded3cb2 | |
v_mov_b32_e32 v90, 0x3c739487 | |
v_madak_f32_e32 v90, v90, v88, 0x3f01e2bc | |
v_mad_f32 v89, v89, v88, 1.0 | |
v_mac_f32_e32 v89, v86, v90 | |
v_lshrrev_b32_e32 v82, 17, v63 | |
v_mov_b32_e32 v90, 0xb2951928 | |
v_rsq_f32_e32 v85, v81 | |
v_madak_f32_e32 v90, v90, v88, 0xb85ffb93 | |
v_mov_b32_e32 v91, 0x35c55945 | |
v_rcp_f32_e32 v89, v89 | |
v_and_b32_e32 v82, 1, v82 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e32 vcc, 1, v82 | |
ds_read_b64 v[81:82], v54 offset:64 | |
v_madak_f32_e32 v91, v91, v88, 0x3a83ca0c | |
v_madak_f32_e32 v90, v90, v88, 0xbc9ded90 | |
v_madak_f32_e32 v91, v91, v88, 0x3d8eaf3b | |
v_madak_f32_e32 v88, v90, v88, 0xbf409397 | |
v_mac_f32_e32 v88, v86, v91 | |
v_mul_f32_e32 v86, v85, v85 | |
v_mul_f32_e32 v89, v51, v89 | |
v_cndmask_b32_e64 v87, 0, 1.0, vcc | |
v_mul_f32_e32 v90, v86, v86 | |
v_mul_f32_e32 v90, v87, v90 | |
v_mul_f32_e32 v87, v87, v86 | |
v_mul_f32_e32 v88, v88, v89 | |
v_mac_f32_e32 v88, v85, v87 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v82, v69, v82 | |
v_mul_f32_e64 v81, v68, -v81 | |
v_mul_f32_e32 v85, v86, v90 | |
v_mac_f32_e32 v81, v85, v82 | |
v_mul_f32_e32 v82, v86, v85 | |
v_mul_f32_e32 v81, v81, v82 | |
v_mac_f32_e32 v81, v88, v83 | |
v_mad_f32 v80, -v81, v79, v80 | |
v_mad_f32 v37, v79, v81, v37 | |
v_mad_f32 v77, -v81, v78, v77 | |
v_mad_f32 v36, v78, v81, v36 | |
v_mad_f32 v75, -v81, v76, v75 | |
v_mad_f32 v35, v76, v81, v35 | |
v_mad_f32 v84, -v65, v0, v84 | |
v_mac_f32_e32 v38, v0, v65 | |
BB4_98: ; %Flow1163 | |
; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[22:23] | |
BB4_99: ; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
v_lshrrev_b32_e32 v65, 18, v67 | |
v_and_b32_e32 v65, 1, v65 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
s_and_saveexec_b64 s[18:19], vcc | |
s_xor_b64 s[18:19], exec, s[18:19] | |
; mask branch BB4_103 | |
s_cbranch_execz BB4_103 | |
BB4_100: ; in Loop: Header=BB4_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset0:32 offset1:33 | |
v_cmp_ne_u32_e32 vcc, v61, v74 | |
s_and_b64 s[22:23], exec, s[2:3] | |
s_or_b64 s[22:23], s[22:23], vcc | |
s_and_b64 s[34:35], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v78, v71, v86 | |
s_or_b64 s[22:23], s[34:35], s[22:23] | |
v_subrev_f32_e32 v76, v70, v85 | |
v_mul_f32_e32 v81, v78, v78 | |
v_cndmask_b32_e64 v82, 0, 1.0, s[22:23] | |
v_subrev_f32_e32 v79, v72, v87 | |
v_mac_f32_e32 v81, v76, v76 | |
v_mac_f32_e32 v81, v79, v79 | |
v_mul_f32_e32 v82, s5, v82 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_cmp_lt_f32_e32 vcc, v81, v82 | |
s_and_saveexec_b64 s[22:23], vcc | |
s_xor_b64 s[22:23], exec, s[22:23] | |
; mask branch BB4_102 | |
s_cbranch_execz BB4_102 | |
BB4_101: ; in Loop: Header=BB4_7 Depth=1 | |
v_max_f32_e32 v81, 0x34cd15ae, v81 | |
v_mul_f32_e32 v86, v47, v81 | |
v_mul_f32_e32 v83, v73, v88 | |
v_mul_f32_e32 v88, v86, v86 | |
v_mov_b32_e32 v89, 0x3a92b707 | |
v_madak_f32_e32 v89, v89, v88, 0x3ded3cb2 | |
v_mov_b32_e32 v90, 0x3c739487 | |
v_madak_f32_e32 v90, v90, v88, 0x3f01e2bc | |
v_mad_f32 v89, v89, v88, 1.0 | |
v_mac_f32_e32 v89, v86, v90 | |
v_lshrrev_b32_e32 v82, 18, v63 | |
v_mov_b32_e32 v90, 0xb2951928 | |
v_rsq_f32_e32 v85, v81 | |
v_madak_f32_e32 v90, v90, v88, 0xb85ffb93 | |
v_mov_b32_e32 v91, 0x35c55945 | |
v_rcp_f32_e32 v89, v89 | |
v_and_b32_e32 v82, 1, v82 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e32 vcc, 1, v82 | |
ds_read_b64 v[81:82], v54 offset:128 | |
v_madak_f32_e32 v91, v91, v88, 0x3a83ca0c | |
v_madak_f32_e32 v90, v90, v88, 0xbc9ded90 | |
v_madak_f32_e32 v91, v91, v88, 0x3d8eaf3b | |
v_madak_f32_e32 v88, v90, v88, 0xbf409397 | |
v_mac_f32_e32 v88, v86, v91 | |
v_mul_f32_e32 v86, v85, v85 | |
v_mul_f32_e32 v89, v51, v89 | |
v_cndmask_b32_e64 v87, 0, 1.0, vcc | |
v_mul_f32_e32 v90, v86, v86 | |
v_mul_f32_e32 v90, v87, v90 | |
v_mul_f32_e32 v87, v87, v86 | |
v_mul_f32_e32 v88, v88, v89 | |
v_mac_f32_e32 v88, v85, v87 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v82, v69, v82 | |
v_mul_f32_e64 v81, v68, -v81 | |
v_mul_f32_e32 v85, v86, v90 | |
v_mac_f32_e32 v81, v85, v82 | |
v_mul_f32_e32 v82, v86, v85 | |
v_mul_f32_e32 v81, v81, v82 | |
v_mac_f32_e32 v81, v88, v83 | |
v_mad_f32 v80, -v81, v79, v80 | |
v_mad_f32 v29, v79, v81, v29 | |
v_mad_f32 v77, -v81, v78, v77 | |
v_mad_f32 v28, v78, v81, v28 | |
v_mad_f32 v75, -v81, v76, v75 | |
v_mad_f32 v27, v76, v81, v27 | |
v_mad_f32 v84, -v65, v0, v84 | |
v_mac_f32_e32 v30, v0, v65 | |
BB4_102: ; %Flow1162 | |
; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[22:23] | |
BB4_103: ; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
v_lshrrev_b32_e32 v65, 19, v67 | |
v_and_b32_e32 v65, 1, v65 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
s_and_saveexec_b64 s[18:19], vcc | |
s_xor_b64 s[18:19], exec, s[18:19] | |
; mask branch BB4_107 | |
s_cbranch_execz BB4_107 | |
BB4_104: ; in Loop: Header=BB4_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset0:48 offset1:49 | |
v_cmp_ne_u32_e32 vcc, v60, v74 | |
s_and_b64 s[22:23], exec, s[2:3] | |
s_or_b64 s[22:23], s[22:23], vcc | |
s_and_b64 s[34:35], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v78, v71, v86 | |
s_or_b64 s[22:23], s[34:35], s[22:23] | |
v_subrev_f32_e32 v76, v70, v85 | |
v_mul_f32_e32 v81, v78, v78 | |
v_cndmask_b32_e64 v82, 0, 1.0, s[22:23] | |
v_subrev_f32_e32 v79, v72, v87 | |
v_mac_f32_e32 v81, v76, v76 | |
v_mac_f32_e32 v81, v79, v79 | |
v_mul_f32_e32 v82, s5, v82 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_cmp_lt_f32_e32 vcc, v81, v82 | |
s_and_saveexec_b64 s[22:23], vcc | |
s_xor_b64 s[22:23], exec, s[22:23] | |
; mask branch BB4_106 | |
s_cbranch_execz BB4_106 | |
BB4_105: ; in Loop: Header=BB4_7 Depth=1 | |
v_max_f32_e32 v81, 0x34cd15ae, v81 | |
v_mul_f32_e32 v86, v47, v81 | |
v_mul_f32_e32 v83, v73, v88 | |
v_mul_f32_e32 v88, v86, v86 | |
v_mov_b32_e32 v89, 0x3a92b707 | |
v_madak_f32_e32 v89, v89, v88, 0x3ded3cb2 | |
v_mov_b32_e32 v90, 0x3c739487 | |
v_madak_f32_e32 v90, v90, v88, 0x3f01e2bc | |
v_mad_f32 v89, v89, v88, 1.0 | |
v_mac_f32_e32 v89, v86, v90 | |
v_lshrrev_b32_e32 v82, 19, v63 | |
v_mov_b32_e32 v90, 0xb2951928 | |
v_rsq_f32_e32 v85, v81 | |
v_madak_f32_e32 v90, v90, v88, 0xb85ffb93 | |
v_mov_b32_e32 v91, 0x35c55945 | |
v_rcp_f32_e32 v89, v89 | |
v_and_b32_e32 v82, 1, v82 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e32 vcc, 1, v82 | |
ds_read_b64 v[81:82], v54 offset:192 | |
v_madak_f32_e32 v91, v91, v88, 0x3a83ca0c | |
v_madak_f32_e32 v90, v90, v88, 0xbc9ded90 | |
v_madak_f32_e32 v91, v91, v88, 0x3d8eaf3b | |
v_madak_f32_e32 v88, v90, v88, 0xbf409397 | |
v_mac_f32_e32 v88, v86, v91 | |
v_mul_f32_e32 v86, v85, v85 | |
v_mul_f32_e32 v89, v51, v89 | |
v_cndmask_b32_e64 v87, 0, 1.0, vcc | |
v_mul_f32_e32 v90, v86, v86 | |
v_mul_f32_e32 v90, v87, v90 | |
v_mul_f32_e32 v87, v87, v86 | |
v_mul_f32_e32 v88, v88, v89 | |
v_mac_f32_e32 v88, v85, v87 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v82, v69, v82 | |
v_mul_f32_e64 v81, v68, -v81 | |
v_mul_f32_e32 v85, v86, v90 | |
v_mac_f32_e32 v81, v85, v82 | |
v_mul_f32_e32 v82, v86, v85 | |
v_mul_f32_e32 v81, v81, v82 | |
v_mac_f32_e32 v81, v88, v83 | |
v_mad_f32 v80, -v81, v79, v80 | |
v_mad_f32 v25, v79, v81, v25 | |
v_mad_f32 v77, -v81, v78, v77 | |
v_mad_f32 v24, v78, v81, v24 | |
v_mad_f32 v75, -v81, v76, v75 | |
v_mad_f32 v23, v76, v81, v23 | |
v_mad_f32 v84, -v65, v0, v84 | |
v_mac_f32_e32 v26, v0, v65 | |
BB4_106: ; %Flow1161 | |
; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[22:23] | |
BB4_107: ; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
v_lshrrev_b32_e32 v65, 20, v67 | |
v_and_b32_e32 v65, 1, v65 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
s_and_saveexec_b64 s[18:19], vcc | |
s_xor_b64 s[18:19], exec, s[18:19] | |
; mask branch BB4_111 | |
s_cbranch_execz BB4_111 | |
BB4_108: ; in Loop: Header=BB4_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset0:64 offset1:65 | |
v_cmp_ne_u32_e32 vcc, v59, v74 | |
s_and_b64 s[22:23], exec, s[2:3] | |
s_or_b64 s[22:23], s[22:23], vcc | |
s_and_b64 s[34:35], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v78, v71, v86 | |
s_or_b64 s[22:23], s[34:35], s[22:23] | |
v_subrev_f32_e32 v76, v70, v85 | |
v_mul_f32_e32 v81, v78, v78 | |
v_cndmask_b32_e64 v82, 0, 1.0, s[22:23] | |
v_subrev_f32_e32 v79, v72, v87 | |
v_mac_f32_e32 v81, v76, v76 | |
v_mac_f32_e32 v81, v79, v79 | |
v_mul_f32_e32 v82, s5, v82 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_cmp_lt_f32_e32 vcc, v81, v82 | |
s_and_saveexec_b64 s[22:23], vcc | |
s_xor_b64 s[22:23], exec, s[22:23] | |
; mask branch BB4_110 | |
s_cbranch_execz BB4_110 | |
BB4_109: ; in Loop: Header=BB4_7 Depth=1 | |
v_max_f32_e32 v81, 0x34cd15ae, v81 | |
v_mul_f32_e32 v86, v47, v81 | |
v_mul_f32_e32 v83, v73, v88 | |
v_mul_f32_e32 v88, v86, v86 | |
v_mov_b32_e32 v89, 0x3a92b707 | |
v_madak_f32_e32 v89, v89, v88, 0x3ded3cb2 | |
v_mov_b32_e32 v90, 0x3c739487 | |
v_madak_f32_e32 v90, v90, v88, 0x3f01e2bc | |
v_mad_f32 v89, v89, v88, 1.0 | |
v_mac_f32_e32 v89, v86, v90 | |
v_lshrrev_b32_e32 v82, 20, v63 | |
v_mov_b32_e32 v90, 0xb2951928 | |
v_rsq_f32_e32 v85, v81 | |
v_madak_f32_e32 v90, v90, v88, 0xb85ffb93 | |
v_mov_b32_e32 v91, 0x35c55945 | |
v_rcp_f32_e32 v89, v89 | |
v_and_b32_e32 v82, 1, v82 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e32 vcc, 1, v82 | |
ds_read_b64 v[81:82], v54 offset:256 | |
v_madak_f32_e32 v91, v91, v88, 0x3a83ca0c | |
v_madak_f32_e32 v90, v90, v88, 0xbc9ded90 | |
v_madak_f32_e32 v91, v91, v88, 0x3d8eaf3b | |
v_madak_f32_e32 v88, v90, v88, 0xbf409397 | |
v_mac_f32_e32 v88, v86, v91 | |
v_mul_f32_e32 v86, v85, v85 | |
v_mul_f32_e32 v89, v51, v89 | |
v_cndmask_b32_e64 v87, 0, 1.0, vcc | |
v_mul_f32_e32 v90, v86, v86 | |
v_mul_f32_e32 v90, v87, v90 | |
v_mul_f32_e32 v87, v87, v86 | |
v_mul_f32_e32 v88, v88, v89 | |
v_mac_f32_e32 v88, v85, v87 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v82, v69, v82 | |
v_mul_f32_e64 v81, v68, -v81 | |
v_mul_f32_e32 v85, v86, v90 | |
v_mac_f32_e32 v81, v85, v82 | |
v_mul_f32_e32 v82, v86, v85 | |
v_mul_f32_e32 v81, v81, v82 | |
v_mac_f32_e32 v81, v88, v83 | |
v_mad_f32 v80, -v81, v79, v80 | |
v_mad_f32 v21, v79, v81, v21 | |
v_mad_f32 v77, -v81, v78, v77 | |
v_mad_f32 v20, v78, v81, v20 | |
v_mad_f32 v75, -v81, v76, v75 | |
v_mad_f32 v19, v76, v81, v19 | |
v_mad_f32 v84, -v65, v0, v84 | |
v_mac_f32_e32 v22, v0, v65 | |
BB4_110: ; %Flow1160 | |
; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[22:23] | |
BB4_111: ; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
v_lshrrev_b32_e32 v65, 21, v67 | |
v_and_b32_e32 v65, 1, v65 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
s_and_saveexec_b64 s[18:19], vcc | |
s_xor_b64 s[18:19], exec, s[18:19] | |
; mask branch BB4_115 | |
s_cbranch_execz BB4_115 | |
BB4_112: ; in Loop: Header=BB4_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset0:80 offset1:81 | |
v_cmp_ne_u32_e32 vcc, v58, v74 | |
s_and_b64 s[22:23], exec, s[2:3] | |
s_or_b64 s[22:23], s[22:23], vcc | |
s_and_b64 s[34:35], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v78, v71, v86 | |
s_or_b64 s[22:23], s[34:35], s[22:23] | |
v_subrev_f32_e32 v76, v70, v85 | |
v_mul_f32_e32 v81, v78, v78 | |
v_cndmask_b32_e64 v82, 0, 1.0, s[22:23] | |
v_subrev_f32_e32 v79, v72, v87 | |
v_mac_f32_e32 v81, v76, v76 | |
v_mac_f32_e32 v81, v79, v79 | |
v_mul_f32_e32 v82, s5, v82 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_cmp_lt_f32_e32 vcc, v81, v82 | |
s_and_saveexec_b64 s[22:23], vcc | |
s_xor_b64 s[22:23], exec, s[22:23] | |
; mask branch BB4_114 | |
s_cbranch_execz BB4_114 | |
BB4_113: ; in Loop: Header=BB4_7 Depth=1 | |
v_max_f32_e32 v81, 0x34cd15ae, v81 | |
v_mul_f32_e32 v86, v47, v81 | |
v_mul_f32_e32 v83, v73, v88 | |
v_mul_f32_e32 v88, v86, v86 | |
v_mov_b32_e32 v89, 0x3a92b707 | |
v_madak_f32_e32 v89, v89, v88, 0x3ded3cb2 | |
v_mov_b32_e32 v90, 0x3c739487 | |
v_madak_f32_e32 v90, v90, v88, 0x3f01e2bc | |
v_mad_f32 v89, v89, v88, 1.0 | |
v_mac_f32_e32 v89, v86, v90 | |
v_lshrrev_b32_e32 v82, 21, v63 | |
v_mov_b32_e32 v90, 0xb2951928 | |
v_rsq_f32_e32 v85, v81 | |
v_madak_f32_e32 v90, v90, v88, 0xb85ffb93 | |
v_mov_b32_e32 v91, 0x35c55945 | |
v_rcp_f32_e32 v89, v89 | |
v_and_b32_e32 v82, 1, v82 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e32 vcc, 1, v82 | |
ds_read_b64 v[81:82], v54 offset:320 | |
v_madak_f32_e32 v91, v91, v88, 0x3a83ca0c | |
v_madak_f32_e32 v90, v90, v88, 0xbc9ded90 | |
v_madak_f32_e32 v91, v91, v88, 0x3d8eaf3b | |
v_madak_f32_e32 v88, v90, v88, 0xbf409397 | |
v_mac_f32_e32 v88, v86, v91 | |
v_mul_f32_e32 v86, v85, v85 | |
v_mul_f32_e32 v89, v51, v89 | |
v_cndmask_b32_e64 v87, 0, 1.0, vcc | |
v_mul_f32_e32 v90, v86, v86 | |
v_mul_f32_e32 v90, v87, v90 | |
v_mul_f32_e32 v87, v87, v86 | |
v_mul_f32_e32 v88, v88, v89 | |
v_mac_f32_e32 v88, v85, v87 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v82, v69, v82 | |
v_mul_f32_e64 v81, v68, -v81 | |
v_mul_f32_e32 v85, v86, v90 | |
v_mac_f32_e32 v81, v85, v82 | |
v_mul_f32_e32 v82, v86, v85 | |
v_mul_f32_e32 v81, v81, v82 | |
v_mac_f32_e32 v81, v88, v83 | |
v_mad_f32 v80, -v81, v79, v80 | |
v_mad_f32 v17, v79, v81, v17 | |
v_mad_f32 v77, -v81, v78, v77 | |
v_mad_f32 v16, v78, v81, v16 | |
v_mad_f32 v75, -v81, v76, v75 | |
v_mad_f32 v15, v76, v81, v15 | |
v_mad_f32 v84, -v65, v0, v84 | |
v_mac_f32_e32 v18, v0, v65 | |
BB4_114: ; %Flow1159 | |
; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[22:23] | |
BB4_115: ; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
v_lshrrev_b32_e32 v65, 22, v67 | |
v_and_b32_e32 v65, 1, v65 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
s_and_saveexec_b64 s[18:19], vcc | |
s_xor_b64 s[18:19], exec, s[18:19] | |
; mask branch BB4_119 | |
s_cbranch_execz BB4_119 | |
BB4_116: ; in Loop: Header=BB4_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset0:96 offset1:97 | |
v_cmp_ne_u32_e32 vcc, v57, v74 | |
s_and_b64 s[22:23], exec, s[2:3] | |
s_or_b64 s[22:23], s[22:23], vcc | |
s_and_b64 s[34:35], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v78, v71, v86 | |
s_or_b64 s[22:23], s[34:35], s[22:23] | |
v_subrev_f32_e32 v76, v70, v85 | |
v_mul_f32_e32 v81, v78, v78 | |
v_cndmask_b32_e64 v82, 0, 1.0, s[22:23] | |
v_subrev_f32_e32 v79, v72, v87 | |
v_mac_f32_e32 v81, v76, v76 | |
v_mac_f32_e32 v81, v79, v79 | |
v_mul_f32_e32 v82, s5, v82 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_cmp_lt_f32_e32 vcc, v81, v82 | |
s_and_saveexec_b64 s[22:23], vcc | |
s_xor_b64 s[22:23], exec, s[22:23] | |
; mask branch BB4_118 | |
s_cbranch_execz BB4_118 | |
BB4_117: ; in Loop: Header=BB4_7 Depth=1 | |
v_max_f32_e32 v81, 0x34cd15ae, v81 | |
v_mul_f32_e32 v86, v47, v81 | |
v_mul_f32_e32 v83, v73, v88 | |
v_mul_f32_e32 v88, v86, v86 | |
v_mov_b32_e32 v89, 0x3a92b707 | |
v_madak_f32_e32 v89, v89, v88, 0x3ded3cb2 | |
v_mov_b32_e32 v90, 0x3c739487 | |
v_madak_f32_e32 v90, v90, v88, 0x3f01e2bc | |
v_mad_f32 v89, v89, v88, 1.0 | |
v_mac_f32_e32 v89, v86, v90 | |
v_lshrrev_b32_e32 v82, 22, v63 | |
v_mov_b32_e32 v90, 0xb2951928 | |
v_rsq_f32_e32 v85, v81 | |
v_madak_f32_e32 v90, v90, v88, 0xb85ffb93 | |
v_mov_b32_e32 v91, 0x35c55945 | |
v_rcp_f32_e32 v89, v89 | |
v_and_b32_e32 v82, 1, v82 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e32 vcc, 1, v82 | |
ds_read_b64 v[81:82], v54 offset:384 | |
v_madak_f32_e32 v91, v91, v88, 0x3a83ca0c | |
v_madak_f32_e32 v90, v90, v88, 0xbc9ded90 | |
v_madak_f32_e32 v91, v91, v88, 0x3d8eaf3b | |
v_madak_f32_e32 v88, v90, v88, 0xbf409397 | |
v_mac_f32_e32 v88, v86, v91 | |
v_mul_f32_e32 v86, v85, v85 | |
v_mul_f32_e32 v89, v51, v89 | |
v_cndmask_b32_e64 v87, 0, 1.0, vcc | |
v_mul_f32_e32 v90, v86, v86 | |
v_mul_f32_e32 v90, v87, v90 | |
v_mul_f32_e32 v87, v87, v86 | |
v_mul_f32_e32 v88, v88, v89 | |
v_mac_f32_e32 v88, v85, v87 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v82, v69, v82 | |
v_mul_f32_e64 v81, v68, -v81 | |
v_mul_f32_e32 v85, v86, v90 | |
v_mac_f32_e32 v81, v85, v82 | |
v_mul_f32_e32 v82, v86, v85 | |
v_mul_f32_e32 v81, v81, v82 | |
v_mac_f32_e32 v81, v88, v83 | |
v_mad_f32 v80, -v81, v79, v80 | |
v_mad_f32 v13, v79, v81, v13 | |
v_mad_f32 v77, -v81, v78, v77 | |
v_mad_f32 v12, v78, v81, v12 | |
v_mad_f32 v75, -v81, v76, v75 | |
v_mad_f32 v11, v76, v81, v11 | |
v_mad_f32 v84, -v65, v0, v84 | |
v_mac_f32_e32 v14, v0, v65 | |
BB4_118: ; %Flow1158 | |
; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[22:23] | |
BB4_119: ; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
v_lshrrev_b32_e32 v65, 23, v67 | |
v_and_b32_e32 v65, 1, v65 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
s_and_saveexec_b64 s[18:19], vcc | |
s_xor_b64 s[18:19], exec, s[18:19] | |
; mask branch BB4_123 | |
s_cbranch_execz BB4_123 | |
BB4_120: ; in Loop: Header=BB4_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset0:112 offset1:113 | |
v_cmp_ne_u32_e32 vcc, v33, v74 | |
s_and_b64 s[22:23], exec, s[2:3] | |
s_or_b64 s[22:23], s[22:23], vcc | |
s_and_b64 s[34:35], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v71, v71, v86 | |
s_or_b64 s[22:23], s[34:35], s[22:23] | |
v_subrev_f32_e32 v70, v70, v85 | |
v_mul_f32_e32 v74, v71, v71 | |
v_cndmask_b32_e64 v76, 0, 1.0, s[22:23] | |
v_subrev_f32_e32 v72, v72, v87 | |
v_mac_f32_e32 v74, v70, v70 | |
v_mac_f32_e32 v74, v72, v72 | |
v_mul_f32_e32 v76, s5, v76 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_cmp_lt_f32_e32 vcc, v74, v76 | |
s_and_saveexec_b64 s[22:23], vcc | |
s_xor_b64 s[22:23], exec, s[22:23] | |
; mask branch BB4_122 | |
s_cbranch_execz BB4_122 | |
BB4_121: ; in Loop: Header=BB4_7 Depth=1 | |
v_mul_f32_e32 v79, v73, v88 | |
s_mov_b32 m0, -1 | |
v_mad_f32 v73, -v65, v0, v84 | |
v_max_f32_e32 v78, 0x34cd15ae, v74 | |
ds_read_b64 v[73:74], v54 offset:448 | |
v_mul_f32_e32 v81, v47, v78 | |
v_mul_f32_e32 v82, v81, v81 | |
v_mov_b32_e32 v83, 0x3a92b707 | |
v_lshrrev_b32_e32 v76, 23, v63 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v69, v69, v74 | |
v_mov_b32_e32 v74, 0x3c739487 | |
v_mul_f32_e64 v68, v68, -v73 | |
v_madak_f32_e32 v73, v83, v82, 0x3ded3cb2 | |
v_madak_f32_e32 v74, v74, v82, 0x3f01e2bc | |
v_mad_f32 v73, v73, v82, 1.0 | |
v_mac_f32_e32 v73, v81, v74 | |
v_mov_b32_e32 v74, 0xb2951928 | |
v_rcp_f32_e32 v73, v73 | |
v_madak_f32_e32 v74, v74, v82, 0xb85ffb93 | |
v_mov_b32_e32 v83, 0x35c55945 | |
v_madak_f32_e32 v83, v83, v82, 0x3a83ca0c | |
v_madak_f32_e32 v74, v74, v82, 0xbc9ded90 | |
v_madak_f32_e32 v83, v83, v82, 0x3d8eaf3b | |
v_madak_f32_e32 v74, v74, v82, 0xbf409397 | |
v_mac_f32_e32 v74, v81, v83 | |
v_mul_f32_e32 v73, v51, v73 | |
v_mul_f32_e32 v73, v74, v73 | |
v_rsq_f32_e32 v74, v78 | |
v_and_b32_e32 v76, 1, v76 | |
v_cmp_eq_u32_e32 vcc, 1, v76 | |
v_cndmask_b32_e64 v76, 0, 1.0, vcc | |
v_mul_f32_e32 v78, v74, v74 | |
v_mul_f32_e32 v81, v76, v78 | |
v_mac_f32_e32 v73, v74, v81 | |
v_mul_f32_e32 v74, v78, v78 | |
v_mul_f32_e32 v74, v76, v74 | |
v_mul_f32_e32 v74, v78, v74 | |
v_mac_f32_e32 v68, v74, v69 | |
v_mul_f32_e32 v69, v78, v74 | |
v_mul_f32_e32 v68, v68, v69 | |
v_mac_f32_e32 v68, v73, v79 | |
v_mad_f32 v80, -v68, v72, v80 | |
v_mad_f32 v10, v72, v68, v10 | |
v_mad_f32 v77, -v68, v71, v77 | |
v_mad_f32 v9, v71, v68, v9 | |
v_mad_f32 v75, -v68, v70, v75 | |
v_mad_f32 v8, v70, v68, v8 | |
v_mac_f32_e32 v66, v0, v65 | |
BB4_122: ; %Flow1157 | |
; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[22:23] | |
BB4_123: ; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
s_mov_b32 m0, -1 | |
v_cmp_gt_i32_e32 vcc, 3, v2 | |
ds_write_b32 v5, v75 | |
ds_write_b32 v6, v77 | |
ds_write_b32 v7, v80 | |
s_and_saveexec_b64 s[18:19], vcc | |
s_xor_b64 s[18:19], exec, s[18:19] | |
s_waitcnt lgkmcnt(0) | |
; mask branch BB4_129 | |
s_cbranch_execz BB4_129 | |
BB4_124: ; in Loop: Header=BB4_7 Depth=1 | |
v_lshlrev_b32_e32 v68, 6, v2 | |
v_add_i32_e32 v65, vcc, v0, v68 | |
v_lshlrev_b32_e32 v65, 2, v65 | |
v_add_i32_e32 v69, vcc, s4, v65 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v65, v69 | |
v_add_i32_e32 v70, vcc, 8, v0 | |
v_or_b32_e32 v71, 1, v0 | |
v_cmp_lt_i32_e32 vcc, v71, v70 | |
s_and_saveexec_b64 s[22:23], vcc | |
s_xor_b64 s[22:23], exec, s[22:23] | |
s_waitcnt lgkmcnt(0) | |
; mask branch BB4_126 | |
s_cbranch_execz BB4_126 | |
BB4_125: ; in Loop: Header=BB4_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b32 v[70:71], v69 offset0:1 offset1:2 | |
v_or_b32_e32 v74, 3, v0 | |
v_add_i32_e32 v68, vcc, v74, v68 | |
v_lshlrev_b32_e32 v68, 2, v68 | |
ds_read2_b32 v[72:73], v69 offset0:3 offset1:4 | |
v_add_i32_e32 v68, vcc, s4, v68 | |
ds_read_b32 v75, v69 offset:28 | |
ds_read2_b32 v[68:69], v68 offset0:2 offset1:3 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v65, v65, v70 | |
v_add_f32_e32 v65, v71, v65 | |
v_add_f32_e32 v65, v72, v65 | |
v_add_f32_e32 v65, v73, v65 | |
v_add_f32_e32 v65, v68, v65 | |
v_add_f32_e32 v65, v69, v65 | |
v_add_f32_e32 v65, v75, v65 | |
BB4_126: ; %._crit_edge.i26 | |
; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[22:23] | |
v_mul_lo_i32 v64, v64, 3 | |
v_mov_b32_e32 v72, s13 | |
s_mov_b64 s[36:37], s[12:13] | |
s_mov_b64 s[38:39], s[30:31] | |
v_add_i32_e32 v68, vcc, v64, v2 | |
v_ashrrev_i32_e32 v69, 31, v68 | |
v_lshl_b64 v[70:71], v[68:69], 2 | |
v_add_i32_e32 v68, vcc, s12, v70 | |
v_addc_u32_e32 v69, vcc, v71, v72, vcc | |
buffer_load_dword v71, v[70:71], s[36:39], 0 addr64 | |
s_mov_b64 s[22:23], 0 | |
s_waitcnt vmcnt(0) | |
BB4_127: ; Parent Loop BB4_7 Depth=1 | |
; => This Inner Loop Header: Depth=2 | |
v_add_f32_e32 v70, v65, v71 | |
v_mov_b32_e32 v73, v71 | |
v_mov_b32_e32 v72, v70 | |
buffer_atomic_cmpswap v[72:73], v[68:69], s[28:31], 0 addr64 glc | |
v_mov_b32_e32 v64, -1 | |
v_mov_b32_e32 v64, 0xf000 | |
s_waitcnt vmcnt(0) expcnt(0) | |
v_cmp_eq_u32_e32 vcc, v72, v71 | |
s_or_b64 s[22:23], vcc, s[22:23] | |
v_mov_b32_e32 v71, v72 | |
s_andn2_b64 exec, exec, s[22:23] | |
s_cbranch_execnz BB4_127 | |
; BB#128: ; %Flow1155 | |
; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[22:23] | |
BB4_129: ; %Flow1156 | |
; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
BB4_130: ; %Flow1165 | |
; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[14:15] | |
v_mov_b32_e32 v64, 0xffffff | |
v_cmp_lt_u32_e32 vcc, v64, v67 | |
s_and_saveexec_b64 s[14:15], vcc | |
s_xor_b64 s[14:15], exec, s[14:15] | |
; mask branch BB4_170 | |
s_cbranch_execz BB4_170 | |
BB4_131: ; in Loop: Header=BB4_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v74, v52 offset:12 | |
s_mov_b64 s[36:37], s[16:17] | |
s_mov_b64 s[38:39], s[30:31] | |
s_waitcnt lgkmcnt(0) | |
v_lshlrev_b32_e32 v64, 3, v74 | |
v_add_i32_e32 v64, vcc, v64, v1 | |
v_ashrrev_i32_e32 v65, 31, v64 | |
v_lshl_b64 v[68:69], v[64:65], 4 | |
v_lshl_b64 v[75:76], v[64:65], 3 | |
buffer_load_dwordx4 v[70:73], v[68:69], s[36:39], 0 addr64 | |
s_mov_b64 s[36:37], s[20:21] | |
buffer_load_dwordx2 v[68:69], v[75:76], s[36:39], 0 addr64 | |
v_lshrrev_b32_e32 v65, 24, v67 | |
v_mov_b32_e32 v75, 0 | |
v_and_b32_e32 v65, 1, v65 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
v_mov_b32_e32 v77, v75 | |
v_mov_b32_e32 v80, v75 | |
s_and_saveexec_b64 s[18:19], vcc | |
s_xor_b64 s[18:19], exec, s[18:19] | |
; implicit-def: %VGPR81_VGPR82_VGPR83_VGPR84 | |
s_waitcnt vmcnt(0) | |
; mask branch BB4_135 | |
s_cbranch_execz BB4_135 | |
BB4_132: ; in Loop: Header=BB4_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset1:1 | |
v_cmp_ne_u32_e32 vcc, v40, v74 | |
s_and_b64 s[22:23], exec, s[2:3] | |
s_or_b64 s[22:23], s[22:23], vcc | |
s_and_b64 s[34:35], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v78, v71, v86 | |
s_or_b64 s[22:23], s[34:35], s[22:23] | |
v_subrev_f32_e32 v79, v70, v85 | |
v_mul_f32_e32 v81, v78, v78 | |
v_cndmask_b32_e64 v75, 0, 1.0, s[22:23] | |
v_subrev_f32_e32 v76, v72, v87 | |
v_mac_f32_e32 v81, v79, v79 | |
v_mac_f32_e32 v81, v76, v76 | |
v_mul_f32_e32 v75, s5, v75 | |
v_cmp_lt_f32_e32 vcc, v81, v75 | |
v_mov_b32_e32 v75, 0 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_mov_b32_e32 v77, v75 | |
v_mov_b32_e32 v80, v75 | |
s_and_saveexec_b64 s[22:23], vcc | |
s_xor_b64 s[22:23], exec, s[22:23] | |
; implicit-def: %VGPR81_VGPR82_VGPR83_VGPR84 | |
; mask branch BB4_134 | |
s_cbranch_execz BB4_134 | |
BB4_133: ; in Loop: Header=BB4_7 Depth=1 | |
v_max_f32_e32 v77, 0x34cd15ae, v81 | |
v_rsq_f32_e32 v81, v77 | |
v_mul_f32_e32 v77, v47, v77 | |
v_mul_f32_e32 v82, v77, v77 | |
v_mov_b32_e32 v83, 0x3a92b707 | |
v_madak_f32_e32 v83, v83, v82, 0x3ded3cb2 | |
v_mov_b32_e32 v84, 0x3c739487 | |
v_madak_f32_e32 v84, v84, v82, 0x3f01e2bc | |
v_mad_f32 v83, v83, v82, 1.0 | |
v_mac_f32_e32 v83, v77, v84 | |
v_mov_b32_e32 v84, 0xb2951928 | |
v_madak_f32_e32 v84, v84, v82, 0xb85ffb93 | |
v_mov_b32_e32 v85, 0x35c55945 | |
v_madak_f32_e32 v85, v85, v82, 0x3a83ca0c | |
v_madak_f32_e32 v84, v84, v82, 0xbc9ded90 | |
v_madak_f32_e32 v85, v85, v82, 0x3d8eaf3b | |
v_madak_f32_e32 v82, v84, v82, 0xbf409397 | |
v_mac_f32_e32 v82, v77, v85 | |
v_rcp_f32_e32 v77, v83 | |
v_lshrrev_b32_e32 v80, 24, v63 | |
v_and_b32_e32 v80, 1, v80 | |
v_cmp_eq_u32_e32 vcc, 1, v80 | |
v_mul_f32_e32 v77, v51, v77 | |
v_mul_f32_e32 v77, v82, v77 | |
v_cndmask_b32_e64 v80, 0, 1.0, vcc | |
v_mul_f32_e32 v82, v81, v81 | |
v_mul_f32_e32 v83, v80, v82 | |
v_mac_f32_e32 v77, v81, v83 | |
v_mul_f32_e32 v81, v82, v82 | |
s_mov_b32 m0, -1 | |
v_mul_f32_e32 v83, v80, v81 | |
ds_read_b64 v[80:81], v54 | |
v_mul_f32_e32 v83, v82, v83 | |
v_mul_f32_e32 v75, v73, v88 | |
v_mac_f32_e32 v46, v0, v65 | |
v_mul_f32_e64 v84, v65, -v0 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v81, v69, v81 | |
v_mul_f32_e64 v80, v68, -v80 | |
v_mac_f32_e32 v80, v83, v81 | |
v_mul_f32_e32 v81, v82, v83 | |
v_mul_f32_e32 v81, v80, v81 | |
v_mac_f32_e32 v81, v77, v75 | |
v_mad_f32 v45, v76, v81, v45 | |
v_mad_f32 v44, v78, v81, v44 | |
v_mad_f32 v43, v79, v81, v43 | |
v_mul_f32_e64 v80, v81, -v76 | |
v_mul_f32_e64 v77, v81, -v78 | |
v_mul_f32_e64 v75, v81, -v79 | |
BB4_134: ; %Flow1153 | |
; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[22:23] | |
BB4_135: ; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
v_lshrrev_b32_e32 v65, 25, v67 | |
v_and_b32_e32 v65, 1, v65 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
s_and_saveexec_b64 s[18:19], vcc | |
s_xor_b64 s[18:19], exec, s[18:19] | |
; mask branch BB4_139 | |
s_cbranch_execz BB4_139 | |
BB4_136: ; in Loop: Header=BB4_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset0:16 offset1:17 | |
v_cmp_ne_u32_e32 vcc, v62, v74 | |
s_and_b64 s[22:23], exec, s[2:3] | |
s_or_b64 s[22:23], s[22:23], vcc | |
s_and_b64 s[34:35], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v78, v71, v86 | |
s_or_b64 s[22:23], s[34:35], s[22:23] | |
v_subrev_f32_e32 v76, v70, v85 | |
v_mul_f32_e32 v81, v78, v78 | |
v_cndmask_b32_e64 v82, 0, 1.0, s[22:23] | |
v_subrev_f32_e32 v79, v72, v87 | |
v_mac_f32_e32 v81, v76, v76 | |
v_mac_f32_e32 v81, v79, v79 | |
v_mul_f32_e32 v82, s5, v82 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_cmp_lt_f32_e32 vcc, v81, v82 | |
s_and_saveexec_b64 s[22:23], vcc | |
s_xor_b64 s[22:23], exec, s[22:23] | |
; mask branch BB4_138 | |
s_cbranch_execz BB4_138 | |
BB4_137: ; in Loop: Header=BB4_7 Depth=1 | |
v_max_f32_e32 v81, 0x34cd15ae, v81 | |
v_mul_f32_e32 v86, v47, v81 | |
v_mul_f32_e32 v83, v73, v88 | |
v_mul_f32_e32 v88, v86, v86 | |
v_mov_b32_e32 v89, 0x3a92b707 | |
v_madak_f32_e32 v89, v89, v88, 0x3ded3cb2 | |
v_mov_b32_e32 v90, 0x3c739487 | |
v_madak_f32_e32 v90, v90, v88, 0x3f01e2bc | |
v_mad_f32 v89, v89, v88, 1.0 | |
v_mac_f32_e32 v89, v86, v90 | |
v_lshrrev_b32_e32 v82, 25, v63 | |
v_mov_b32_e32 v90, 0xb2951928 | |
v_rsq_f32_e32 v85, v81 | |
v_madak_f32_e32 v90, v90, v88, 0xb85ffb93 | |
v_mov_b32_e32 v91, 0x35c55945 | |
v_rcp_f32_e32 v89, v89 | |
v_and_b32_e32 v82, 1, v82 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e32 vcc, 1, v82 | |
ds_read_b64 v[81:82], v54 offset:64 | |
v_madak_f32_e32 v91, v91, v88, 0x3a83ca0c | |
v_madak_f32_e32 v90, v90, v88, 0xbc9ded90 | |
v_madak_f32_e32 v91, v91, v88, 0x3d8eaf3b | |
v_madak_f32_e32 v88, v90, v88, 0xbf409397 | |
v_mac_f32_e32 v88, v86, v91 | |
v_mul_f32_e32 v86, v85, v85 | |
v_mul_f32_e32 v89, v51, v89 | |
v_cndmask_b32_e64 v87, 0, 1.0, vcc | |
v_mul_f32_e32 v90, v86, v86 | |
v_mul_f32_e32 v90, v87, v90 | |
v_mul_f32_e32 v87, v87, v86 | |
v_mul_f32_e32 v88, v88, v89 | |
v_mac_f32_e32 v88, v85, v87 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v82, v69, v82 | |
v_mul_f32_e64 v81, v68, -v81 | |
v_mul_f32_e32 v85, v86, v90 | |
v_mac_f32_e32 v81, v85, v82 | |
v_mul_f32_e32 v82, v86, v85 | |
v_mul_f32_e32 v81, v81, v82 | |
v_mac_f32_e32 v81, v88, v83 | |
v_mad_f32 v80, -v81, v79, v80 | |
v_mad_f32 v37, v79, v81, v37 | |
v_mad_f32 v77, -v81, v78, v77 | |
v_mad_f32 v36, v78, v81, v36 | |
v_mad_f32 v75, -v81, v76, v75 | |
v_mad_f32 v35, v76, v81, v35 | |
v_mad_f32 v84, -v65, v0, v84 | |
v_mac_f32_e32 v38, v0, v65 | |
BB4_138: ; %Flow1152 | |
; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[22:23] | |
BB4_139: ; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
v_lshrrev_b32_e32 v65, 26, v67 | |
v_and_b32_e32 v65, 1, v65 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
s_and_saveexec_b64 s[18:19], vcc | |
s_xor_b64 s[18:19], exec, s[18:19] | |
; mask branch BB4_143 | |
s_cbranch_execz BB4_143 | |
BB4_140: ; in Loop: Header=BB4_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset0:32 offset1:33 | |
v_cmp_ne_u32_e32 vcc, v61, v74 | |
s_and_b64 s[22:23], exec, s[2:3] | |
s_or_b64 s[22:23], s[22:23], vcc | |
s_and_b64 s[34:35], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v78, v71, v86 | |
s_or_b64 s[22:23], s[34:35], s[22:23] | |
v_subrev_f32_e32 v76, v70, v85 | |
v_mul_f32_e32 v81, v78, v78 | |
v_cndmask_b32_e64 v82, 0, 1.0, s[22:23] | |
v_subrev_f32_e32 v79, v72, v87 | |
v_mac_f32_e32 v81, v76, v76 | |
v_mac_f32_e32 v81, v79, v79 | |
v_mul_f32_e32 v82, s5, v82 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_cmp_lt_f32_e32 vcc, v81, v82 | |
s_and_saveexec_b64 s[22:23], vcc | |
s_xor_b64 s[22:23], exec, s[22:23] | |
; mask branch BB4_142 | |
s_cbranch_execz BB4_142 | |
BB4_141: ; in Loop: Header=BB4_7 Depth=1 | |
v_max_f32_e32 v81, 0x34cd15ae, v81 | |
v_mul_f32_e32 v86, v47, v81 | |
v_mul_f32_e32 v83, v73, v88 | |
v_mul_f32_e32 v88, v86, v86 | |
v_mov_b32_e32 v89, 0x3a92b707 | |
v_madak_f32_e32 v89, v89, v88, 0x3ded3cb2 | |
v_mov_b32_e32 v90, 0x3c739487 | |
v_madak_f32_e32 v90, v90, v88, 0x3f01e2bc | |
v_mad_f32 v89, v89, v88, 1.0 | |
v_mac_f32_e32 v89, v86, v90 | |
v_lshrrev_b32_e32 v82, 26, v63 | |
v_mov_b32_e32 v90, 0xb2951928 | |
v_rsq_f32_e32 v85, v81 | |
v_madak_f32_e32 v90, v90, v88, 0xb85ffb93 | |
v_mov_b32_e32 v91, 0x35c55945 | |
v_rcp_f32_e32 v89, v89 | |
v_and_b32_e32 v82, 1, v82 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e32 vcc, 1, v82 | |
ds_read_b64 v[81:82], v54 offset:128 | |
v_madak_f32_e32 v91, v91, v88, 0x3a83ca0c | |
v_madak_f32_e32 v90, v90, v88, 0xbc9ded90 | |
v_madak_f32_e32 v91, v91, v88, 0x3d8eaf3b | |
v_madak_f32_e32 v88, v90, v88, 0xbf409397 | |
v_mac_f32_e32 v88, v86, v91 | |
v_mul_f32_e32 v86, v85, v85 | |
v_mul_f32_e32 v89, v51, v89 | |
v_cndmask_b32_e64 v87, 0, 1.0, vcc | |
v_mul_f32_e32 v90, v86, v86 | |
v_mul_f32_e32 v90, v87, v90 | |
v_mul_f32_e32 v87, v87, v86 | |
v_mul_f32_e32 v88, v88, v89 | |
v_mac_f32_e32 v88, v85, v87 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v82, v69, v82 | |
v_mul_f32_e64 v81, v68, -v81 | |
v_mul_f32_e32 v85, v86, v90 | |
v_mac_f32_e32 v81, v85, v82 | |
v_mul_f32_e32 v82, v86, v85 | |
v_mul_f32_e32 v81, v81, v82 | |
v_mac_f32_e32 v81, v88, v83 | |
v_mad_f32 v80, -v81, v79, v80 | |
v_mad_f32 v29, v79, v81, v29 | |
v_mad_f32 v77, -v81, v78, v77 | |
v_mad_f32 v28, v78, v81, v28 | |
v_mad_f32 v75, -v81, v76, v75 | |
v_mad_f32 v27, v76, v81, v27 | |
v_mad_f32 v84, -v65, v0, v84 | |
v_mac_f32_e32 v30, v0, v65 | |
BB4_142: ; %Flow1151 | |
; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[22:23] | |
BB4_143: ; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
v_lshrrev_b32_e32 v65, 27, v67 | |
v_and_b32_e32 v65, 1, v65 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
s_and_saveexec_b64 s[18:19], vcc | |
s_xor_b64 s[18:19], exec, s[18:19] | |
; mask branch BB4_147 | |
s_cbranch_execz BB4_147 | |
BB4_144: ; in Loop: Header=BB4_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset0:48 offset1:49 | |
v_cmp_ne_u32_e32 vcc, v60, v74 | |
s_and_b64 s[22:23], exec, s[2:3] | |
s_or_b64 s[22:23], s[22:23], vcc | |
s_and_b64 s[34:35], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v78, v71, v86 | |
s_or_b64 s[22:23], s[34:35], s[22:23] | |
v_subrev_f32_e32 v76, v70, v85 | |
v_mul_f32_e32 v81, v78, v78 | |
v_cndmask_b32_e64 v82, 0, 1.0, s[22:23] | |
v_subrev_f32_e32 v79, v72, v87 | |
v_mac_f32_e32 v81, v76, v76 | |
v_mac_f32_e32 v81, v79, v79 | |
v_mul_f32_e32 v82, s5, v82 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_cmp_lt_f32_e32 vcc, v81, v82 | |
s_and_saveexec_b64 s[22:23], vcc | |
s_xor_b64 s[22:23], exec, s[22:23] | |
; mask branch BB4_146 | |
s_cbranch_execz BB4_146 | |
BB4_145: ; in Loop: Header=BB4_7 Depth=1 | |
v_max_f32_e32 v81, 0x34cd15ae, v81 | |
v_mul_f32_e32 v86, v47, v81 | |
v_mul_f32_e32 v83, v73, v88 | |
v_mul_f32_e32 v88, v86, v86 | |
v_mov_b32_e32 v89, 0x3a92b707 | |
v_madak_f32_e32 v89, v89, v88, 0x3ded3cb2 | |
v_mov_b32_e32 v90, 0x3c739487 | |
v_madak_f32_e32 v90, v90, v88, 0x3f01e2bc | |
v_mad_f32 v89, v89, v88, 1.0 | |
v_mac_f32_e32 v89, v86, v90 | |
v_lshrrev_b32_e32 v82, 27, v63 | |
v_mov_b32_e32 v90, 0xb2951928 | |
v_rsq_f32_e32 v85, v81 | |
v_madak_f32_e32 v90, v90, v88, 0xb85ffb93 | |
v_mov_b32_e32 v91, 0x35c55945 | |
v_rcp_f32_e32 v89, v89 | |
v_and_b32_e32 v82, 1, v82 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e32 vcc, 1, v82 | |
ds_read_b64 v[81:82], v54 offset:192 | |
v_madak_f32_e32 v91, v91, v88, 0x3a83ca0c | |
v_madak_f32_e32 v90, v90, v88, 0xbc9ded90 | |
v_madak_f32_e32 v91, v91, v88, 0x3d8eaf3b | |
v_madak_f32_e32 v88, v90, v88, 0xbf409397 | |
v_mac_f32_e32 v88, v86, v91 | |
v_mul_f32_e32 v86, v85, v85 | |
v_mul_f32_e32 v89, v51, v89 | |
v_cndmask_b32_e64 v87, 0, 1.0, vcc | |
v_mul_f32_e32 v90, v86, v86 | |
v_mul_f32_e32 v90, v87, v90 | |
v_mul_f32_e32 v87, v87, v86 | |
v_mul_f32_e32 v88, v88, v89 | |
v_mac_f32_e32 v88, v85, v87 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v82, v69, v82 | |
v_mul_f32_e64 v81, v68, -v81 | |
v_mul_f32_e32 v85, v86, v90 | |
v_mac_f32_e32 v81, v85, v82 | |
v_mul_f32_e32 v82, v86, v85 | |
v_mul_f32_e32 v81, v81, v82 | |
v_mac_f32_e32 v81, v88, v83 | |
v_mad_f32 v80, -v81, v79, v80 | |
v_mad_f32 v25, v79, v81, v25 | |
v_mad_f32 v77, -v81, v78, v77 | |
v_mad_f32 v24, v78, v81, v24 | |
v_mad_f32 v75, -v81, v76, v75 | |
v_mad_f32 v23, v76, v81, v23 | |
v_mad_f32 v84, -v65, v0, v84 | |
v_mac_f32_e32 v26, v0, v65 | |
BB4_146: ; %Flow1150 | |
; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[22:23] | |
BB4_147: ; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
v_lshrrev_b32_e32 v65, 28, v67 | |
v_and_b32_e32 v65, 1, v65 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
s_and_saveexec_b64 s[18:19], vcc | |
s_xor_b64 s[18:19], exec, s[18:19] | |
; mask branch BB4_151 | |
s_cbranch_execz BB4_151 | |
BB4_148: ; in Loop: Header=BB4_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset0:64 offset1:65 | |
v_cmp_ne_u32_e32 vcc, v59, v74 | |
s_and_b64 s[22:23], exec, s[2:3] | |
s_or_b64 s[22:23], s[22:23], vcc | |
s_and_b64 s[34:35], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v78, v71, v86 | |
s_or_b64 s[22:23], s[34:35], s[22:23] | |
v_subrev_f32_e32 v76, v70, v85 | |
v_mul_f32_e32 v81, v78, v78 | |
v_cndmask_b32_e64 v82, 0, 1.0, s[22:23] | |
v_subrev_f32_e32 v79, v72, v87 | |
v_mac_f32_e32 v81, v76, v76 | |
v_mac_f32_e32 v81, v79, v79 | |
v_mul_f32_e32 v82, s5, v82 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_cmp_lt_f32_e32 vcc, v81, v82 | |
s_and_saveexec_b64 s[22:23], vcc | |
s_xor_b64 s[22:23], exec, s[22:23] | |
; mask branch BB4_150 | |
s_cbranch_execz BB4_150 | |
BB4_149: ; in Loop: Header=BB4_7 Depth=1 | |
v_max_f32_e32 v81, 0x34cd15ae, v81 | |
v_mul_f32_e32 v86, v47, v81 | |
v_mul_f32_e32 v83, v73, v88 | |
v_mul_f32_e32 v88, v86, v86 | |
v_mov_b32_e32 v89, 0x3a92b707 | |
v_madak_f32_e32 v89, v89, v88, 0x3ded3cb2 | |
v_mov_b32_e32 v90, 0x3c739487 | |
v_madak_f32_e32 v90, v90, v88, 0x3f01e2bc | |
v_mad_f32 v89, v89, v88, 1.0 | |
v_mac_f32_e32 v89, v86, v90 | |
v_lshrrev_b32_e32 v82, 28, v63 | |
v_mov_b32_e32 v90, 0xb2951928 | |
v_rsq_f32_e32 v85, v81 | |
v_madak_f32_e32 v90, v90, v88, 0xb85ffb93 | |
v_mov_b32_e32 v91, 0x35c55945 | |
v_rcp_f32_e32 v89, v89 | |
v_and_b32_e32 v82, 1, v82 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e32 vcc, 1, v82 | |
ds_read_b64 v[81:82], v54 offset:256 | |
v_madak_f32_e32 v91, v91, v88, 0x3a83ca0c | |
v_madak_f32_e32 v90, v90, v88, 0xbc9ded90 | |
v_madak_f32_e32 v91, v91, v88, 0x3d8eaf3b | |
v_madak_f32_e32 v88, v90, v88, 0xbf409397 | |
v_mac_f32_e32 v88, v86, v91 | |
v_mul_f32_e32 v86, v85, v85 | |
v_mul_f32_e32 v89, v51, v89 | |
v_cndmask_b32_e64 v87, 0, 1.0, vcc | |
v_mul_f32_e32 v90, v86, v86 | |
v_mul_f32_e32 v90, v87, v90 | |
v_mul_f32_e32 v87, v87, v86 | |
v_mul_f32_e32 v88, v88, v89 | |
v_mac_f32_e32 v88, v85, v87 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v82, v69, v82 | |
v_mul_f32_e64 v81, v68, -v81 | |
v_mul_f32_e32 v85, v86, v90 | |
v_mac_f32_e32 v81, v85, v82 | |
v_mul_f32_e32 v82, v86, v85 | |
v_mul_f32_e32 v81, v81, v82 | |
v_mac_f32_e32 v81, v88, v83 | |
v_mad_f32 v80, -v81, v79, v80 | |
v_mad_f32 v21, v79, v81, v21 | |
v_mad_f32 v77, -v81, v78, v77 | |
v_mad_f32 v20, v78, v81, v20 | |
v_mad_f32 v75, -v81, v76, v75 | |
v_mad_f32 v19, v76, v81, v19 | |
v_mad_f32 v84, -v65, v0, v84 | |
v_mac_f32_e32 v22, v0, v65 | |
BB4_150: ; %Flow1149 | |
; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[22:23] | |
BB4_151: ; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
v_lshrrev_b32_e32 v65, 29, v67 | |
v_and_b32_e32 v65, 1, v65 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
s_and_saveexec_b64 s[18:19], vcc | |
s_xor_b64 s[18:19], exec, s[18:19] | |
; mask branch BB4_155 | |
s_cbranch_execz BB4_155 | |
BB4_152: ; in Loop: Header=BB4_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset0:80 offset1:81 | |
v_cmp_ne_u32_e32 vcc, v58, v74 | |
s_and_b64 s[22:23], exec, s[2:3] | |
s_or_b64 s[22:23], s[22:23], vcc | |
s_and_b64 s[34:35], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v78, v71, v86 | |
s_or_b64 s[22:23], s[34:35], s[22:23] | |
v_subrev_f32_e32 v76, v70, v85 | |
v_mul_f32_e32 v81, v78, v78 | |
v_cndmask_b32_e64 v82, 0, 1.0, s[22:23] | |
v_subrev_f32_e32 v79, v72, v87 | |
v_mac_f32_e32 v81, v76, v76 | |
v_mac_f32_e32 v81, v79, v79 | |
v_mul_f32_e32 v82, s5, v82 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_cmp_lt_f32_e32 vcc, v81, v82 | |
s_and_saveexec_b64 s[22:23], vcc | |
s_xor_b64 s[22:23], exec, s[22:23] | |
; mask branch BB4_154 | |
s_cbranch_execz BB4_154 | |
BB4_153: ; in Loop: Header=BB4_7 Depth=1 | |
v_max_f32_e32 v81, 0x34cd15ae, v81 | |
v_mul_f32_e32 v86, v47, v81 | |
v_mul_f32_e32 v83, v73, v88 | |
v_mul_f32_e32 v88, v86, v86 | |
v_mov_b32_e32 v89, 0x3a92b707 | |
v_madak_f32_e32 v89, v89, v88, 0x3ded3cb2 | |
v_mov_b32_e32 v90, 0x3c739487 | |
v_madak_f32_e32 v90, v90, v88, 0x3f01e2bc | |
v_mad_f32 v89, v89, v88, 1.0 | |
v_mac_f32_e32 v89, v86, v90 | |
v_lshrrev_b32_e32 v82, 29, v63 | |
v_mov_b32_e32 v90, 0xb2951928 | |
v_rsq_f32_e32 v85, v81 | |
v_madak_f32_e32 v90, v90, v88, 0xb85ffb93 | |
v_mov_b32_e32 v91, 0x35c55945 | |
v_rcp_f32_e32 v89, v89 | |
v_and_b32_e32 v82, 1, v82 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e32 vcc, 1, v82 | |
ds_read_b64 v[81:82], v54 offset:320 | |
v_madak_f32_e32 v91, v91, v88, 0x3a83ca0c | |
v_madak_f32_e32 v90, v90, v88, 0xbc9ded90 | |
v_madak_f32_e32 v91, v91, v88, 0x3d8eaf3b | |
v_madak_f32_e32 v88, v90, v88, 0xbf409397 | |
v_mac_f32_e32 v88, v86, v91 | |
v_mul_f32_e32 v86, v85, v85 | |
v_mul_f32_e32 v89, v51, v89 | |
v_cndmask_b32_e64 v87, 0, 1.0, vcc | |
v_mul_f32_e32 v90, v86, v86 | |
v_mul_f32_e32 v90, v87, v90 | |
v_mul_f32_e32 v87, v87, v86 | |
v_mul_f32_e32 v88, v88, v89 | |
v_mac_f32_e32 v88, v85, v87 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v82, v69, v82 | |
v_mul_f32_e64 v81, v68, -v81 | |
v_mul_f32_e32 v85, v86, v90 | |
v_mac_f32_e32 v81, v85, v82 | |
v_mul_f32_e32 v82, v86, v85 | |
v_mul_f32_e32 v81, v81, v82 | |
v_mac_f32_e32 v81, v88, v83 | |
v_mad_f32 v80, -v81, v79, v80 | |
v_mad_f32 v17, v79, v81, v17 | |
v_mad_f32 v77, -v81, v78, v77 | |
v_mad_f32 v16, v78, v81, v16 | |
v_mad_f32 v75, -v81, v76, v75 | |
v_mad_f32 v15, v76, v81, v15 | |
v_mad_f32 v84, -v65, v0, v84 | |
v_mac_f32_e32 v18, v0, v65 | |
BB4_154: ; %Flow1148 | |
; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[22:23] | |
BB4_155: ; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
v_lshrrev_b32_e32 v65, 30, v67 | |
v_and_b32_e32 v65, 1, v65 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
s_and_saveexec_b64 s[18:19], vcc | |
s_xor_b64 s[18:19], exec, s[18:19] | |
; mask branch BB4_159 | |
s_cbranch_execz BB4_159 | |
BB4_156: ; in Loop: Header=BB4_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset0:96 offset1:97 | |
v_cmp_ne_u32_e32 vcc, v57, v74 | |
s_and_b64 s[22:23], exec, s[2:3] | |
s_or_b64 s[22:23], s[22:23], vcc | |
s_and_b64 s[34:35], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v78, v71, v86 | |
s_or_b64 s[22:23], s[34:35], s[22:23] | |
v_subrev_f32_e32 v76, v70, v85 | |
v_mul_f32_e32 v81, v78, v78 | |
v_cndmask_b32_e64 v82, 0, 1.0, s[22:23] | |
v_subrev_f32_e32 v79, v72, v87 | |
v_mac_f32_e32 v81, v76, v76 | |
v_mac_f32_e32 v81, v79, v79 | |
v_mul_f32_e32 v82, s5, v82 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_cmp_lt_f32_e32 vcc, v81, v82 | |
s_and_saveexec_b64 s[22:23], vcc | |
s_xor_b64 s[22:23], exec, s[22:23] | |
; mask branch BB4_158 | |
s_cbranch_execz BB4_158 | |
BB4_157: ; in Loop: Header=BB4_7 Depth=1 | |
v_max_f32_e32 v81, 0x34cd15ae, v81 | |
v_mul_f32_e32 v86, v47, v81 | |
v_mul_f32_e32 v83, v73, v88 | |
v_mul_f32_e32 v88, v86, v86 | |
v_mov_b32_e32 v89, 0x3a92b707 | |
v_madak_f32_e32 v89, v89, v88, 0x3ded3cb2 | |
v_mov_b32_e32 v90, 0x3c739487 | |
v_madak_f32_e32 v90, v90, v88, 0x3f01e2bc | |
v_mad_f32 v89, v89, v88, 1.0 | |
v_mac_f32_e32 v89, v86, v90 | |
v_lshrrev_b32_e32 v82, 30, v63 | |
v_mov_b32_e32 v90, 0xb2951928 | |
v_rsq_f32_e32 v85, v81 | |
v_madak_f32_e32 v90, v90, v88, 0xb85ffb93 | |
v_mov_b32_e32 v91, 0x35c55945 | |
v_rcp_f32_e32 v89, v89 | |
v_and_b32_e32 v82, 1, v82 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e32 vcc, 1, v82 | |
ds_read_b64 v[81:82], v54 offset:384 | |
v_madak_f32_e32 v91, v91, v88, 0x3a83ca0c | |
v_madak_f32_e32 v90, v90, v88, 0xbc9ded90 | |
v_madak_f32_e32 v91, v91, v88, 0x3d8eaf3b | |
v_madak_f32_e32 v88, v90, v88, 0xbf409397 | |
v_mac_f32_e32 v88, v86, v91 | |
v_mul_f32_e32 v86, v85, v85 | |
v_mul_f32_e32 v89, v51, v89 | |
v_cndmask_b32_e64 v87, 0, 1.0, vcc | |
v_mul_f32_e32 v90, v86, v86 | |
v_mul_f32_e32 v90, v87, v90 | |
v_mul_f32_e32 v87, v87, v86 | |
v_mul_f32_e32 v88, v88, v89 | |
v_mac_f32_e32 v88, v85, v87 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v82, v69, v82 | |
v_mul_f32_e64 v81, v68, -v81 | |
v_mul_f32_e32 v85, v86, v90 | |
v_mac_f32_e32 v81, v85, v82 | |
v_mul_f32_e32 v82, v86, v85 | |
v_mul_f32_e32 v81, v81, v82 | |
v_mac_f32_e32 v81, v88, v83 | |
v_mad_f32 v80, -v81, v79, v80 | |
v_mad_f32 v13, v79, v81, v13 | |
v_mad_f32 v77, -v81, v78, v77 | |
v_mad_f32 v12, v78, v81, v12 | |
v_mad_f32 v75, -v81, v76, v75 | |
v_mad_f32 v11, v76, v81, v11 | |
v_mad_f32 v84, -v65, v0, v84 | |
v_mac_f32_e32 v14, v0, v65 | |
BB4_158: ; %Flow1147 | |
; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[22:23] | |
BB4_159: ; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
v_cmp_gt_i32_e32 vcc, 0, v67 | |
s_and_saveexec_b64 s[18:19], vcc | |
s_xor_b64 s[18:19], exec, s[18:19] | |
; mask branch BB4_163 | |
s_cbranch_execz BB4_163 | |
BB4_160: ; in Loop: Header=BB4_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset0:112 offset1:113 | |
v_cmp_ne_u32_e32 vcc, v33, v74 | |
s_and_b64 s[22:23], exec, s[2:3] | |
s_or_b64 s[22:23], s[22:23], vcc | |
s_and_b64 s[34:35], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v71, v71, v86 | |
v_subrev_f32_e32 v67, v70, v85 | |
v_mul_f32_e32 v70, v71, v71 | |
s_or_b64 s[22:23], s[34:35], s[22:23] | |
v_cndmask_b32_e64 v74, 0, 1.0, s[22:23] | |
v_subrev_f32_e32 v72, v72, v87 | |
v_mac_f32_e32 v70, v67, v67 | |
v_mac_f32_e32 v70, v72, v72 | |
v_mul_f32_e32 v74, s5, v74 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_cmp_lt_f32_e32 vcc, v70, v74 | |
s_and_saveexec_b64 s[22:23], vcc | |
s_xor_b64 s[22:23], exec, s[22:23] | |
; mask branch BB4_162 | |
s_cbranch_execz BB4_162 | |
BB4_161: ; in Loop: Header=BB4_7 Depth=1 | |
v_mul_f32_e32 v76, v73, v88 | |
s_mov_b32 m0, -1 | |
v_mad_f32 v73, -v65, v0, v84 | |
ds_read_b64 v[73:74], v54 offset:448 | |
v_max_f32_e32 v70, 0x34cd15ae, v70 | |
v_mov_b32_e32 v78, 0x3a92b707 | |
v_mov_b32_e32 v79, 0x3c739487 | |
v_mov_b32_e32 v81, 0x35c55945 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e64 v68, v68, -v73 | |
v_mul_f32_e32 v73, v47, v70 | |
v_mul_f32_e32 v69, v69, v74 | |
v_mul_f32_e32 v74, v73, v73 | |
v_madak_f32_e32 v78, v78, v74, 0x3ded3cb2 | |
v_madak_f32_e32 v79, v79, v74, 0x3f01e2bc | |
v_mad_f32 v78, v78, v74, 1.0 | |
v_mac_f32_e32 v78, v73, v79 | |
v_mov_b32_e32 v79, 0xb2951928 | |
v_madak_f32_e32 v79, v79, v74, 0xb85ffb93 | |
v_madak_f32_e32 v81, v81, v74, 0x3a83ca0c | |
v_madak_f32_e32 v79, v79, v74, 0xbc9ded90 | |
v_madak_f32_e32 v81, v81, v74, 0x3d8eaf3b | |
v_madak_f32_e32 v74, v79, v74, 0xbf409397 | |
v_mac_f32_e32 v74, v73, v81 | |
v_rcp_f32_e32 v73, v78 | |
v_rsq_f32_e32 v70, v70 | |
v_cmp_gt_i32_e32 vcc, 0, v63 | |
v_cndmask_b32_e64 v63, 0, 1.0, vcc | |
v_mul_f32_e32 v73, v51, v73 | |
v_mul_f32_e32 v73, v74, v73 | |
v_mul_f32_e32 v74, v70, v70 | |
v_mul_f32_e32 v78, v63, v74 | |
v_mac_f32_e32 v73, v70, v78 | |
v_mul_f32_e32 v70, v74, v74 | |
v_mul_f32_e32 v63, v63, v70 | |
v_mul_f32_e32 v63, v74, v63 | |
v_mac_f32_e32 v68, v63, v69 | |
v_mul_f32_e32 v63, v74, v63 | |
v_mul_f32_e32 v63, v68, v63 | |
v_mac_f32_e32 v63, v73, v76 | |
v_mad_f32 v80, -v63, v72, v80 | |
v_mad_f32 v10, v72, v63, v10 | |
v_mad_f32 v77, -v63, v71, v77 | |
v_mad_f32 v9, v71, v63, v9 | |
v_mad_f32 v75, -v63, v67, v75 | |
v_mad_f32 v8, v67, v63, v8 | |
v_mac_f32_e32 v66, v0, v65 | |
BB4_162: ; %Flow1146 | |
; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[22:23] | |
BB4_163: ; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
s_mov_b32 m0, -1 | |
v_cmp_gt_i32_e32 vcc, 3, v2 | |
ds_write_b32 v5, v75 | |
ds_write_b32 v6, v77 | |
ds_write_b32 v7, v80 | |
s_and_saveexec_b64 s[18:19], vcc | |
s_xor_b64 s[18:19], exec, s[18:19] | |
s_waitcnt lgkmcnt(0) | |
; mask branch BB4_169 | |
s_cbranch_execz BB4_169 | |
BB4_164: ; in Loop: Header=BB4_7 Depth=1 | |
v_lshlrev_b32_e32 v65, 6, v2 | |
v_add_i32_e32 v63, vcc, v0, v65 | |
v_lshlrev_b32_e32 v63, 2, v63 | |
v_add_i32_e32 v67, vcc, s4, v63 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v63, v67 | |
v_add_i32_e32 v68, vcc, 8, v0 | |
v_or_b32_e32 v69, 1, v0 | |
v_cmp_lt_i32_e32 vcc, v69, v68 | |
s_and_saveexec_b64 s[22:23], vcc | |
s_xor_b64 s[22:23], exec, s[22:23] | |
s_waitcnt lgkmcnt(0) | |
; mask branch BB4_166 | |
s_cbranch_execz BB4_166 | |
BB4_165: ; in Loop: Header=BB4_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b32 v[68:69], v67 offset0:1 offset1:2 | |
v_or_b32_e32 v72, 3, v0 | |
v_add_i32_e32 v65, vcc, v72, v65 | |
v_lshlrev_b32_e32 v65, 2, v65 | |
ds_read2_b32 v[70:71], v67 offset0:3 offset1:4 | |
v_add_i32_e32 v65, vcc, s4, v65 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v63, v63, v68 | |
ds_read_b32 v73, v67 offset:28 | |
ds_read2_b32 v[67:68], v65 offset0:2 offset1:3 | |
v_add_f32_e32 v63, v69, v63 | |
v_add_f32_e32 v63, v70, v63 | |
v_add_f32_e32 v63, v71, v63 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v63, v67, v63 | |
v_add_f32_e32 v63, v68, v63 | |
v_add_f32_e32 v63, v73, v63 | |
BB4_166: ; %._crit_edge.i | |
; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[22:23] | |
v_mul_lo_i32 v64, v64, 3 | |
v_mov_b32_e32 v68, s13 | |
s_mov_b64 s[36:37], s[12:13] | |
s_mov_b64 s[38:39], s[30:31] | |
v_add_i32_e32 v64, vcc, v64, v2 | |
v_ashrrev_i32_e32 v65, 31, v64 | |
v_lshl_b64 v[64:65], v[64:65], 2 | |
v_add_i32_e32 v67, vcc, s12, v64 | |
v_addc_u32_e32 v68, vcc, v65, v68, vcc | |
buffer_load_dword v65, v[64:65], s[36:39], 0 addr64 | |
s_mov_b64 s[22:23], 0 | |
s_waitcnt vmcnt(0) | |
BB4_167: ; Parent Loop BB4_7 Depth=1 | |
; => This Inner Loop Header: Depth=2 | |
v_mov_b32_e32 v64, -1 | |
v_add_f32_e32 v64, v63, v65 | |
v_mov_b32_e32 v70, v65 | |
v_mov_b32_e32 v69, v64 | |
buffer_atomic_cmpswap v[69:70], v[67:68], s[28:31], 0 addr64 glc | |
v_mov_b32_e32 v64, 0xf000 | |
s_waitcnt vmcnt(0) expcnt(0) | |
v_cmp_eq_u32_e32 vcc, v69, v65 | |
s_or_b64 s[22:23], vcc, s[22:23] | |
v_mov_b32_e32 v65, v69 | |
s_andn2_b64 exec, exec, s[22:23] | |
s_cbranch_execnz BB4_167 | |
; BB#168: ; %Flow1144 | |
; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[22:23] | |
BB4_169: ; %Flow1145 | |
; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
BB4_170: ; %Flow1154 | |
; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[14:15] | |
BB4_171: ; %Flow1188 | |
; in Loop: Header=BB4_7 Depth=1 | |
s_or_b64 exec, exec, s[32:33] | |
v_add_i32_e32 v55, vcc, 1, v55 | |
v_addc_u32_e32 v56, vcc, 0, v56, vcc | |
v_cmp_ne_u32_e32 vcc, v55, v34 | |
s_and_b64 vcc, exec, vcc | |
s_cbranch_vccnz BB4_7 | |
BB4_172: ; %Flow1191 | |
s_mov_b32 m0, -1 | |
ds_write_b32 v5, v43 | |
ds_write_b32 v6, v44 | |
ds_write_b32 v7, v45 | |
s_waitcnt lgkmcnt(0) | |
s_barrier | |
s_load_dword s0, s[6:7], 0x32 | |
v_cmp_ne_u32_e32 vcc, 22, v32 | |
v_lshlrev_b32_e32 v18, 2, v39 | |
v_mov_b32_e32 v3, 0 | |
v_lshlrev_b32_e32 v14, 6, v31 | |
s_waitcnt lgkmcnt(0) | |
v_cmp_ne_u32_e64 s[0:1], s0, 0 | |
s_and_b64 s[2:3], s[0:1], vcc | |
v_add_i32_e32 v18, vcc, s4, v18 | |
v_add_i32_e32 v26, vcc, 64, v2 | |
v_add_i32_e32 v22, vcc, 0x80, v2 | |
v_cmp_gt_i32_e64 s[0:1], 4, v1 | |
s_and_saveexec_b64 s[6:7], s[0:1] | |
s_xor_b64 s[6:7], exec, s[6:7] | |
; mask branch BB4_183 | |
s_cbranch_execz BB4_183 | |
BB4_173: | |
s_mov_b32 m0, -1 | |
ds_read_b32 v3, v18 offset:128 | |
ds_read_b32 v30, v18 | |
v_add_i32_e32 v31, vcc, v0, v26 | |
v_lshlrev_b32_e32 v31, 2, v31 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v3, v3, v30 | |
ds_write_b32 v18, v3 | |
v_add_i32_e32 v30, vcc, s4, v31 | |
s_waitcnt lgkmcnt(0) | |
ds_read_b32 v3, v30 offset:128 | |
ds_read_b32 v31, v18 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v3, v3, v31 | |
ds_write_b32 v18, v3 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_i32_e32 v3, vcc, v0, v22 | |
v_lshlrev_b32_e32 v3, 2, v3 | |
v_add_i32_e32 v31, vcc, s4, v3 | |
ds_read_b32 v3, v31 offset:128 | |
ds_read_b32 v32, v18 offset:512 | |
v_cmp_lt_i32_e32 vcc, 1, v1 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v3, v3, v32 | |
v_mov_b32_e32 v32, 0 | |
ds_write_b32 v18, v3 offset:512 | |
s_waitcnt lgkmcnt(0) | |
; implicit-def: %VGPR3 | |
s_and_saveexec_b64 s[10:11], vcc | |
s_xor_b64 s[10:11], exec, s[10:11] | |
; mask branch BB4_175 | |
BB4_174: | |
v_cmp_eq_u32_e32 vcc, 2, v1 | |
v_mov_b32_e32 v3, 0 | |
v_cndmask_b32_e64 v32, 0, -1, vcc | |
BB4_175: ; %Flow1141 | |
s_or_saveexec_b64 s[10:11], s[10:11] | |
s_xor_b64 exec, exec, s[10:11] | |
; mask branch BB4_177 | |
s_cbranch_execz BB4_177 | |
BB4_176: ; %.thread85.i | |
s_mov_b32 m0, -1 | |
ds_read_b32 v32, v18 offset:64 | |
ds_read_b32 v33, v18 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v32, v32, v33 | |
ds_write_b32 v18, v32 | |
ds_read_b32 v30, v30 offset:64 | |
s_waitcnt lgkmcnt(0) | |
ds_read_b32 v32, v18 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v30, v30, v32 | |
ds_write_b32 v18, v30 offset:256 | |
s_waitcnt lgkmcnt(0) | |
ds_read_b32 v30, v31 offset:64 | |
ds_read_b32 v31, v18 offset:512 | |
v_mov_b32_e32 v32, -1 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v30, v30, v31 | |
ds_write_b32 v18, v30 offset:512 | |
s_waitcnt lgkmcnt(0) | |
BB4_177: ; %Flow1142 | |
s_or_b64 exec, exec, s[10:11] | |
v_cmp_ne_u32_e32 vcc, 0, v32 | |
s_and_saveexec_b64 s[10:11], vcc | |
s_xor_b64 s[10:11], exec, s[10:11] | |
; mask branch BB4_182 | |
s_cbranch_execz BB4_182 | |
BB4_178: | |
v_mov_b32_e32 v30, 0xe0 | |
v_mad_i32_i24 v30, v30, v1, v18 | |
s_mov_b32 m0, -1 | |
v_add_i32_e32 v3, vcc, v14, v2 | |
ds_read_b32 v31, v30 | |
ds_read_b32 v30, v30 offset:32 | |
v_mul_lo_i32 v3, v3, 3 | |
s_mov_b32 s19, 0xf000 | |
s_mov_b32 s18, 0 | |
s_mov_b64 s[16:17], s[12:13] | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v30, v31, v30 | |
v_add_i32_e32 v31, vcc, v1, v3 | |
v_ashrrev_i32_e32 v32, 31, v31 | |
v_lshl_b64 v[33:34], v[31:32], 2 | |
v_add_i32_e32 v31, vcc, s12, v33 | |
v_mov_b32_e32 v3, s13 | |
v_addc_u32_e32 v32, vcc, v34, v3, vcc | |
buffer_load_dword v34, v[33:34], s[16:19], 0 addr64 | |
s_mov_b64 s[16:17], 0 | |
s_mov_b64 s[14:15], s[16:17] | |
s_waitcnt vmcnt(0) | |
BB4_179: ; =>This Inner Loop Header: Depth=1 | |
v_add_f32_e32 v33, v30, v34 | |
v_mov_b32_e32 v39, v34 | |
v_mov_b32_e32 v38, v33 | |
buffer_atomic_cmpswap v[38:39], v[31:32], s[16:19], 0 addr64 glc | |
v_mov_b32_e32 v3, -1 | |
v_mov_b32_e32 v3, 0xf000 | |
s_waitcnt vmcnt(0) expcnt(0) | |
v_cmp_eq_u32_e32 vcc, v38, v34 | |
s_or_b64 s[14:15], vcc, s[14:15] | |
v_mov_b32_e32 v34, v38 | |
s_andn2_b64 exec, exec, s[14:15] | |
s_cbranch_execnz BB4_179 | |
; BB#180: ; %atomicAdd_g_f.exit.i | |
s_or_b64 exec, exec, s[14:15] | |
s_and_b64 s[14:15], exec, s[2:3] | |
v_cndmask_b32_e64 v31, 0, 1, s[14:15] | |
v_cmp_ne_u32_e32 vcc, 1, v31 | |
v_mov_b32_e32 v3, 0 | |
s_and_b64 vcc, exec, vcc | |
s_cbranch_vccnz BB4_182 | |
; BB#181: | |
v_mov_b32_e32 v3, v30 | |
BB4_182: ; %Flow1143 | |
s_or_b64 exec, exec, s[10:11] | |
BB4_183: ; %reduce_force_i_pow2.exit | |
s_or_b64 exec, exec, s[6:7] | |
s_mov_b32 m0, -1 | |
s_barrier | |
ds_write_b32 v5, v35 | |
ds_write_b32 v6, v36 | |
ds_write_b32 v7, v37 | |
s_waitcnt lgkmcnt(0) | |
s_barrier | |
s_and_saveexec_b64 s[6:7], s[0:1] | |
s_xor_b64 s[6:7], exec, s[6:7] | |
; mask branch BB4_194 | |
s_cbranch_execz BB4_194 | |
BB4_184: | |
s_mov_b32 m0, -1 | |
ds_read_b32 v30, v18 offset:128 | |
ds_read_b32 v31, v18 | |
v_add_i32_e32 v32, vcc, v0, v26 | |
v_lshlrev_b32_e32 v32, 2, v32 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v30, v30, v31 | |
ds_write_b32 v18, v30 | |
s_waitcnt lgkmcnt(0) | |
v_add_i32_e32 v30, vcc, s4, v32 | |
ds_read_b32 v31, v30 offset:128 | |
ds_read_b32 v32, v18 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v31, v31, v32 | |
ds_write_b32 v18, v31 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_i32_e32 v31, vcc, v0, v22 | |
v_lshlrev_b32_e32 v31, 2, v31 | |
v_add_i32_e32 v31, vcc, s4, v31 | |
ds_read_b32 v32, v31 offset:128 | |
ds_read_b32 v33, v18 offset:512 | |
v_cmp_lt_i32_e32 vcc, 1, v1 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v32, v32, v33 | |
ds_write_b32 v18, v32 offset:512 | |
s_waitcnt lgkmcnt(0) | |
v_mov_b32_e32 v32, 0 | |
s_and_saveexec_b64 s[10:11], vcc | |
s_xor_b64 s[10:11], exec, s[10:11] | |
; mask branch BB4_186 | |
BB4_185: | |
v_cmp_eq_u32_e32 vcc, 2, v1 | |
v_cndmask_b32_e64 v32, 0, -1, vcc | |
BB4_186: ; %Flow1138 | |
s_or_saveexec_b64 s[10:11], s[10:11] | |
s_xor_b64 exec, exec, s[10:11] | |
; mask branch BB4_188 | |
s_cbranch_execz BB4_188 | |
BB4_187: ; %.thread85.i491 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v32, v18 offset:64 | |
ds_read_b32 v33, v18 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v32, v32, v33 | |
ds_write_b32 v18, v32 | |
ds_read_b32 v30, v30 offset:64 | |
s_waitcnt lgkmcnt(0) | |
ds_read_b32 v32, v18 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v30, v30, v32 | |
ds_write_b32 v18, v30 offset:256 | |
s_waitcnt lgkmcnt(0) | |
ds_read_b32 v30, v31 offset:64 | |
ds_read_b32 v31, v18 offset:512 | |
v_mov_b32_e32 v32, -1 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v30, v30, v31 | |
ds_write_b32 v18, v30 offset:512 | |
s_waitcnt lgkmcnt(0) | |
BB4_188: ; %Flow1139 | |
s_or_b64 exec, exec, s[10:11] | |
v_cmp_ne_u32_e32 vcc, 0, v32 | |
s_and_saveexec_b64 s[10:11], vcc | |
s_xor_b64 s[10:11], exec, s[10:11] | |
; mask branch BB4_193 | |
s_cbranch_execz BB4_193 | |
BB4_189: | |
v_or_b32_e32 v30, 8, v14 | |
v_add_i32_e32 v30, vcc, v30, v2 | |
v_mul_lo_i32 v31, v30, 3 | |
v_mov_b32_e32 v30, 0xe0 | |
v_mad_i32_i24 v30, v30, v1, v18 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v32, v30 | |
ds_read_b32 v30, v30 offset:32 | |
v_add_i32_e32 v31, vcc, v1, v31 | |
s_mov_b32 s19, 0xf000 | |
s_mov_b32 s18, 0 | |
s_mov_b64 s[16:17], s[12:13] | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v30, v32, v30 | |
v_ashrrev_i32_e32 v32, 31, v31 | |
v_lshl_b64 v[33:34], v[31:32], 2 | |
v_add_i32_e32 v31, vcc, s12, v33 | |
v_mov_b32_e32 v32, s13 | |
v_addc_u32_e32 v32, vcc, v34, v32, vcc | |
buffer_load_dword v34, v[33:34], s[16:19], 0 addr64 | |
s_mov_b64 s[16:17], 0 | |
s_mov_b64 s[14:15], s[16:17] | |
s_waitcnt vmcnt(0) | |
BB4_190: ; =>This Inner Loop Header: Depth=1 | |
v_mov_b32_e32 v33, -1 | |
v_add_f32_e32 v33, v30, v34 | |
v_mov_b32_e32 v36, v34 | |
v_mov_b32_e32 v35, v33 | |
buffer_atomic_cmpswap v[35:36], v[31:32], s[16:19], 0 addr64 glc | |
v_mov_b32_e32 v33, 0xf000 | |
s_waitcnt vmcnt(0) expcnt(0) | |
v_cmp_eq_u32_e32 vcc, v35, v34 | |
s_or_b64 s[14:15], vcc, s[14:15] | |
v_mov_b32_e32 v34, v35 | |
s_andn2_b64 exec, exec, s[14:15] | |
s_cbranch_execnz BB4_190 | |
; BB#191: ; %atomicAdd_g_f.exit.i479 | |
s_or_b64 exec, exec, s[14:15] | |
s_and_b64 s[14:15], exec, s[2:3] | |
v_cndmask_b32_e64 v31, 0, 1, s[14:15] | |
v_cmp_ne_u32_e32 vcc, 1, v31 | |
s_and_b64 vcc, exec, vcc | |
s_cbranch_vccnz BB4_193 | |
; BB#192: | |
v_add_f32_e32 v3, v30, v3 | |
BB4_193: ; %Flow1140 | |
s_or_b64 exec, exec, s[10:11] | |
BB4_194: ; %reduce_force_i_pow2.exit493 | |
s_or_b64 exec, exec, s[6:7] | |
s_mov_b32 m0, -1 | |
s_barrier | |
ds_write_b32 v5, v27 | |
ds_write_b32 v6, v28 | |
ds_write_b32 v7, v29 | |
s_waitcnt lgkmcnt(0) | |
s_barrier | |
s_and_saveexec_b64 s[6:7], s[0:1] | |
s_xor_b64 s[6:7], exec, s[6:7] | |
; mask branch BB4_205 | |
s_cbranch_execz BB4_205 | |
BB4_195: | |
s_mov_b32 m0, -1 | |
ds_read_b32 v27, v18 offset:128 | |
ds_read_b32 v28, v18 | |
v_add_i32_e32 v29, vcc, v0, v26 | |
v_lshlrev_b32_e32 v29, 2, v29 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v27, v27, v28 | |
ds_write_b32 v18, v27 | |
s_waitcnt lgkmcnt(0) | |
v_add_i32_e32 v27, vcc, s4, v29 | |
ds_read_b32 v28, v27 offset:128 | |
ds_read_b32 v29, v18 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v28, v28, v29 | |
ds_write_b32 v18, v28 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_i32_e32 v28, vcc, v0, v22 | |
v_lshlrev_b32_e32 v28, 2, v28 | |
v_add_i32_e32 v28, vcc, s4, v28 | |
ds_read_b32 v29, v28 offset:128 | |
ds_read_b32 v30, v18 offset:512 | |
v_cmp_lt_i32_e32 vcc, 1, v1 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v29, v29, v30 | |
ds_write_b32 v18, v29 offset:512 | |
s_waitcnt lgkmcnt(0) | |
v_mov_b32_e32 v29, 0 | |
s_and_saveexec_b64 s[10:11], vcc | |
s_xor_b64 s[10:11], exec, s[10:11] | |
; mask branch BB4_197 | |
BB4_196: | |
v_cmp_eq_u32_e32 vcc, 2, v1 | |
v_cndmask_b32_e64 v29, 0, -1, vcc | |
BB4_197: ; %Flow1135 | |
s_or_saveexec_b64 s[10:11], s[10:11] | |
s_xor_b64 exec, exec, s[10:11] | |
; mask branch BB4_199 | |
s_cbranch_execz BB4_199 | |
BB4_198: ; %.thread85.i442 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v29, v18 offset:64 | |
ds_read_b32 v30, v18 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v29, v29, v30 | |
ds_write_b32 v18, v29 | |
ds_read_b32 v27, v27 offset:64 | |
s_waitcnt lgkmcnt(0) | |
ds_read_b32 v29, v18 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v27, v27, v29 | |
ds_write_b32 v18, v27 offset:256 | |
s_waitcnt lgkmcnt(0) | |
ds_read_b32 v27, v28 offset:64 | |
ds_read_b32 v28, v18 offset:512 | |
v_mov_b32_e32 v29, -1 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v27, v27, v28 | |
ds_write_b32 v18, v27 offset:512 | |
s_waitcnt lgkmcnt(0) | |
BB4_199: ; %Flow1136 | |
s_or_b64 exec, exec, s[10:11] | |
v_cmp_ne_u32_e32 vcc, 0, v29 | |
s_and_saveexec_b64 s[10:11], vcc | |
s_xor_b64 s[10:11], exec, s[10:11] | |
; mask branch BB4_204 | |
s_cbranch_execz BB4_204 | |
BB4_200: | |
v_or_b32_e32 v27, 16, v14 | |
v_add_i32_e32 v27, vcc, v27, v2 | |
v_mul_lo_i32 v28, v27, 3 | |
v_mov_b32_e32 v27, 0xe0 | |
v_mad_i32_i24 v27, v27, v1, v18 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v29, v27 | |
ds_read_b32 v27, v27 offset:32 | |
v_add_i32_e32 v28, vcc, v1, v28 | |
s_mov_b32 s19, 0xf000 | |
s_mov_b32 s18, 0 | |
s_mov_b64 s[16:17], s[12:13] | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v27, v29, v27 | |
v_ashrrev_i32_e32 v29, 31, v28 | |
v_lshl_b64 v[30:31], v[28:29], 2 | |
v_add_i32_e32 v28, vcc, s12, v30 | |
v_mov_b32_e32 v29, s13 | |
v_addc_u32_e32 v29, vcc, v31, v29, vcc | |
buffer_load_dword v31, v[30:31], s[16:19], 0 addr64 | |
s_mov_b64 s[16:17], 0 | |
s_mov_b64 s[14:15], s[16:17] | |
s_waitcnt vmcnt(0) | |
BB4_201: ; =>This Inner Loop Header: Depth=1 | |
v_mov_b32_e32 v30, -1 | |
v_add_f32_e32 v30, v27, v31 | |
v_mov_b32_e32 v33, v31 | |
v_mov_b32_e32 v32, v30 | |
buffer_atomic_cmpswap v[32:33], v[28:29], s[16:19], 0 addr64 glc | |
v_mov_b32_e32 v30, 0xf000 | |
s_waitcnt vmcnt(0) expcnt(0) | |
v_cmp_eq_u32_e32 vcc, v32, v31 | |
s_or_b64 s[14:15], vcc, s[14:15] | |
v_mov_b32_e32 v31, v32 | |
s_andn2_b64 exec, exec, s[14:15] | |
s_cbranch_execnz BB4_201 | |
; BB#202: ; %atomicAdd_g_f.exit.i430 | |
s_or_b64 exec, exec, s[14:15] | |
s_and_b64 s[14:15], exec, s[2:3] | |
v_cndmask_b32_e64 v28, 0, 1, s[14:15] | |
v_cmp_ne_u32_e32 vcc, 1, v28 | |
s_and_b64 vcc, exec, vcc | |
s_cbranch_vccnz BB4_204 | |
; BB#203: | |
v_add_f32_e32 v3, v27, v3 | |
BB4_204: ; %Flow1137 | |
s_or_b64 exec, exec, s[10:11] | |
BB4_205: ; %reduce_force_i_pow2.exit444 | |
s_or_b64 exec, exec, s[6:7] | |
s_mov_b32 m0, -1 | |
s_barrier | |
ds_write_b32 v5, v23 | |
ds_write_b32 v6, v24 | |
ds_write_b32 v7, v25 | |
s_waitcnt lgkmcnt(0) | |
s_barrier | |
s_and_saveexec_b64 s[6:7], s[0:1] | |
s_xor_b64 s[6:7], exec, s[6:7] | |
; mask branch BB4_216 | |
s_cbranch_execz BB4_216 | |
BB4_206: | |
s_mov_b32 m0, -1 | |
ds_read_b32 v23, v18 offset:128 | |
ds_read_b32 v24, v18 | |
v_add_i32_e32 v25, vcc, v0, v26 | |
v_lshlrev_b32_e32 v25, 2, v25 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v23, v23, v24 | |
ds_write_b32 v18, v23 | |
s_waitcnt lgkmcnt(0) | |
v_add_i32_e32 v23, vcc, s4, v25 | |
ds_read_b32 v24, v23 offset:128 | |
ds_read_b32 v25, v18 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v24, v24, v25 | |
ds_write_b32 v18, v24 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_i32_e32 v24, vcc, v0, v22 | |
v_lshlrev_b32_e32 v24, 2, v24 | |
v_add_i32_e32 v24, vcc, s4, v24 | |
ds_read_b32 v25, v24 offset:128 | |
ds_read_b32 v27, v18 offset:512 | |
v_cmp_lt_i32_e32 vcc, 1, v1 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v25, v25, v27 | |
ds_write_b32 v18, v25 offset:512 | |
s_waitcnt lgkmcnt(0) | |
v_mov_b32_e32 v25, 0 | |
s_and_saveexec_b64 s[10:11], vcc | |
s_xor_b64 s[10:11], exec, s[10:11] | |
; mask branch BB4_208 | |
BB4_207: | |
v_cmp_eq_u32_e32 vcc, 2, v1 | |
v_cndmask_b32_e64 v25, 0, -1, vcc | |
BB4_208: ; %Flow1132 | |
s_or_saveexec_b64 s[10:11], s[10:11] | |
s_xor_b64 exec, exec, s[10:11] | |
; mask branch BB4_210 | |
s_cbranch_execz BB4_210 | |
BB4_209: ; %.thread85.i393 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v25, v18 offset:64 | |
ds_read_b32 v27, v18 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v25, v25, v27 | |
ds_write_b32 v18, v25 | |
ds_read_b32 v23, v23 offset:64 | |
s_waitcnt lgkmcnt(0) | |
ds_read_b32 v25, v18 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v23, v23, v25 | |
ds_write_b32 v18, v23 offset:256 | |
s_waitcnt lgkmcnt(0) | |
ds_read_b32 v23, v24 offset:64 | |
ds_read_b32 v24, v18 offset:512 | |
v_mov_b32_e32 v25, -1 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v23, v23, v24 | |
ds_write_b32 v18, v23 offset:512 | |
s_waitcnt lgkmcnt(0) | |
BB4_210: ; %Flow1133 | |
s_or_b64 exec, exec, s[10:11] | |
v_cmp_ne_u32_e32 vcc, 0, v25 | |
s_and_saveexec_b64 s[10:11], vcc | |
s_xor_b64 s[10:11], exec, s[10:11] | |
; mask branch BB4_215 | |
s_cbranch_execz BB4_215 | |
BB4_211: | |
v_or_b32_e32 v23, 24, v14 | |
v_add_i32_e32 v23, vcc, v23, v2 | |
v_mul_lo_i32 v24, v23, 3 | |
v_mov_b32_e32 v23, 0xe0 | |
v_mad_i32_i24 v23, v23, v1, v18 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v25, v23 | |
ds_read_b32 v23, v23 offset:32 | |
v_add_i32_e32 v24, vcc, v1, v24 | |
v_mov_b32_e32 v28, s13 | |
s_mov_b32 s19, 0xf000 | |
s_mov_b32 s18, 0 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v23, v25, v23 | |
v_ashrrev_i32_e32 v25, 31, v24 | |
v_lshl_b64 v[24:25], v[24:25], 2 | |
v_add_i32_e32 v27, vcc, s12, v24 | |
s_mov_b64 s[16:17], s[12:13] | |
v_addc_u32_e32 v28, vcc, v25, v28, vcc | |
buffer_load_dword v25, v[24:25], s[16:19], 0 addr64 | |
s_mov_b64 s[16:17], 0 | |
s_mov_b64 s[14:15], s[16:17] | |
s_waitcnt vmcnt(0) | |
BB4_212: ; =>This Inner Loop Header: Depth=1 | |
v_mov_b32_e32 v24, -1 | |
v_add_f32_e32 v24, v23, v25 | |
v_mov_b32_e32 v30, v25 | |
v_mov_b32_e32 v29, v24 | |
buffer_atomic_cmpswap v[29:30], v[27:28], s[16:19], 0 addr64 glc | |
v_mov_b32_e32 v24, 0xf000 | |
s_waitcnt vmcnt(0) expcnt(0) | |
v_cmp_eq_u32_e32 vcc, v29, v25 | |
s_or_b64 s[14:15], vcc, s[14:15] | |
v_mov_b32_e32 v25, v29 | |
s_andn2_b64 exec, exec, s[14:15] | |
s_cbranch_execnz BB4_212 | |
; BB#213: ; %atomicAdd_g_f.exit.i381 | |
s_or_b64 exec, exec, s[14:15] | |
s_and_b64 s[14:15], exec, s[2:3] | |
v_cndmask_b32_e64 v24, 0, 1, s[14:15] | |
v_cmp_ne_u32_e32 vcc, 1, v24 | |
s_and_b64 vcc, exec, vcc | |
s_cbranch_vccnz BB4_215 | |
; BB#214: | |
v_add_f32_e32 v3, v23, v3 | |
BB4_215: ; %Flow1134 | |
s_or_b64 exec, exec, s[10:11] | |
BB4_216: ; %reduce_force_i_pow2.exit395 | |
s_or_b64 exec, exec, s[6:7] | |
s_mov_b32 m0, -1 | |
s_barrier | |
ds_write_b32 v5, v19 | |
ds_write_b32 v6, v20 | |
ds_write_b32 v7, v21 | |
s_waitcnt lgkmcnt(0) | |
s_barrier | |
s_and_saveexec_b64 s[6:7], s[0:1] | |
s_xor_b64 s[6:7], exec, s[6:7] | |
; mask branch BB4_227 | |
s_cbranch_execz BB4_227 | |
BB4_217: | |
s_mov_b32 m0, -1 | |
ds_read_b32 v19, v18 offset:128 | |
ds_read_b32 v20, v18 | |
v_add_i32_e32 v21, vcc, v0, v26 | |
v_lshlrev_b32_e32 v21, 2, v21 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v19, v19, v20 | |
ds_write_b32 v18, v19 | |
s_waitcnt lgkmcnt(0) | |
v_add_i32_e32 v19, vcc, s4, v21 | |
ds_read_b32 v20, v19 offset:128 | |
ds_read_b32 v21, v18 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v20, v20, v21 | |
ds_write_b32 v18, v20 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_i32_e32 v20, vcc, v0, v22 | |
v_lshlrev_b32_e32 v20, 2, v20 | |
v_add_i32_e32 v20, vcc, s4, v20 | |
ds_read_b32 v21, v20 offset:128 | |
ds_read_b32 v23, v18 offset:512 | |
v_cmp_lt_i32_e32 vcc, 1, v1 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v21, v21, v23 | |
ds_write_b32 v18, v21 offset:512 | |
s_waitcnt lgkmcnt(0) | |
v_mov_b32_e32 v21, 0 | |
s_and_saveexec_b64 s[10:11], vcc | |
s_xor_b64 s[10:11], exec, s[10:11] | |
; mask branch BB4_219 | |
BB4_218: | |
v_cmp_eq_u32_e32 vcc, 2, v1 | |
v_cndmask_b32_e64 v21, 0, -1, vcc | |
BB4_219: ; %Flow1129 | |
s_or_saveexec_b64 s[10:11], s[10:11] | |
s_xor_b64 exec, exec, s[10:11] | |
; mask branch BB4_221 | |
s_cbranch_execz BB4_221 | |
BB4_220: ; %.thread85.i344 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v21, v18 offset:64 | |
ds_read_b32 v23, v18 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v21, v21, v23 | |
ds_write_b32 v18, v21 | |
ds_read_b32 v19, v19 offset:64 | |
s_waitcnt lgkmcnt(0) | |
ds_read_b32 v21, v18 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v19, v19, v21 | |
ds_write_b32 v18, v19 offset:256 | |
s_waitcnt lgkmcnt(0) | |
ds_read_b32 v19, v20 offset:64 | |
ds_read_b32 v20, v18 offset:512 | |
v_mov_b32_e32 v21, -1 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v19, v19, v20 | |
ds_write_b32 v18, v19 offset:512 | |
s_waitcnt lgkmcnt(0) | |
BB4_221: ; %Flow1130 | |
s_or_b64 exec, exec, s[10:11] | |
v_cmp_ne_u32_e32 vcc, 0, v21 | |
s_and_saveexec_b64 s[10:11], vcc | |
s_xor_b64 s[10:11], exec, s[10:11] | |
; mask branch BB4_226 | |
s_cbranch_execz BB4_226 | |
BB4_222: | |
v_or_b32_e32 v19, 32, v14 | |
v_add_i32_e32 v19, vcc, v19, v2 | |
v_mul_lo_i32 v20, v19, 3 | |
v_mov_b32_e32 v19, 0xe0 | |
v_mad_i32_i24 v19, v19, v1, v18 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v21, v19 | |
ds_read_b32 v19, v19 offset:32 | |
v_add_i32_e32 v20, vcc, v1, v20 | |
v_mov_b32_e32 v23, s13 | |
s_mov_b32 s19, 0xf000 | |
s_mov_b32 s18, 0 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v19, v21, v19 | |
v_ashrrev_i32_e32 v21, 31, v20 | |
v_lshl_b64 v[20:21], v[20:21], 2 | |
v_add_i32_e32 v27, vcc, s12, v20 | |
s_mov_b64 s[16:17], s[12:13] | |
v_addc_u32_e32 v28, vcc, v21, v23, vcc | |
buffer_load_dword v21, v[20:21], s[16:19], 0 addr64 | |
s_mov_b64 s[16:17], 0 | |
s_mov_b64 s[14:15], s[16:17] | |
s_waitcnt vmcnt(0) | |
BB4_223: ; =>This Inner Loop Header: Depth=1 | |
v_mov_b32_e32 v20, -1 | |
v_add_f32_e32 v20, v19, v21 | |
v_mov_b32_e32 v24, v21 | |
v_mov_b32_e32 v23, v20 | |
buffer_atomic_cmpswap v[23:24], v[27:28], s[16:19], 0 addr64 glc | |
v_mov_b32_e32 v20, 0xf000 | |
s_waitcnt vmcnt(0) expcnt(0) | |
v_cmp_eq_u32_e32 vcc, v23, v21 | |
s_or_b64 s[14:15], vcc, s[14:15] | |
v_mov_b32_e32 v21, v23 | |
s_andn2_b64 exec, exec, s[14:15] | |
s_cbranch_execnz BB4_223 | |
; BB#224: ; %atomicAdd_g_f.exit.i332 | |
s_or_b64 exec, exec, s[14:15] | |
s_and_b64 s[14:15], exec, s[2:3] | |
v_cndmask_b32_e64 v20, 0, 1, s[14:15] | |
v_cmp_ne_u32_e32 vcc, 1, v20 | |
s_and_b64 vcc, exec, vcc | |
s_cbranch_vccnz BB4_226 | |
; BB#225: | |
v_add_f32_e32 v3, v19, v3 | |
BB4_226: ; %Flow1131 | |
s_or_b64 exec, exec, s[10:11] | |
BB4_227: ; %reduce_force_i_pow2.exit346 | |
s_or_b64 exec, exec, s[6:7] | |
s_mov_b32 m0, -1 | |
s_barrier | |
ds_write_b32 v5, v15 | |
ds_write_b32 v6, v16 | |
ds_write_b32 v7, v17 | |
s_waitcnt lgkmcnt(0) | |
s_barrier | |
s_and_saveexec_b64 s[6:7], s[0:1] | |
s_xor_b64 s[6:7], exec, s[6:7] | |
; mask branch BB4_238 | |
s_cbranch_execz BB4_238 | |
BB4_228: | |
s_mov_b32 m0, -1 | |
ds_read_b32 v15, v18 offset:128 | |
ds_read_b32 v16, v18 | |
v_add_i32_e32 v17, vcc, v0, v26 | |
v_lshlrev_b32_e32 v17, 2, v17 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v15, v15, v16 | |
ds_write_b32 v18, v15 | |
s_waitcnt lgkmcnt(0) | |
v_add_i32_e32 v15, vcc, s4, v17 | |
ds_read_b32 v16, v15 offset:128 | |
ds_read_b32 v17, v18 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v16, v16, v17 | |
ds_write_b32 v18, v16 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_i32_e32 v16, vcc, v0, v22 | |
v_lshlrev_b32_e32 v16, 2, v16 | |
v_add_i32_e32 v16, vcc, s4, v16 | |
ds_read_b32 v17, v16 offset:128 | |
ds_read_b32 v19, v18 offset:512 | |
v_cmp_lt_i32_e32 vcc, 1, v1 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v17, v17, v19 | |
ds_write_b32 v18, v17 offset:512 | |
s_waitcnt lgkmcnt(0) | |
v_mov_b32_e32 v17, 0 | |
s_and_saveexec_b64 s[10:11], vcc | |
s_xor_b64 s[10:11], exec, s[10:11] | |
; mask branch BB4_230 | |
BB4_229: | |
v_cmp_eq_u32_e32 vcc, 2, v1 | |
v_cndmask_b32_e64 v17, 0, -1, vcc | |
BB4_230: ; %Flow1126 | |
s_or_saveexec_b64 s[10:11], s[10:11] | |
s_xor_b64 exec, exec, s[10:11] | |
; mask branch BB4_232 | |
s_cbranch_execz BB4_232 | |
BB4_231: ; %.thread85.i295 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v17, v18 offset:64 | |
ds_read_b32 v19, v18 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v17, v17, v19 | |
ds_write_b32 v18, v17 | |
ds_read_b32 v15, v15 offset:64 | |
s_waitcnt lgkmcnt(0) | |
ds_read_b32 v17, v18 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v15, v15, v17 | |
ds_write_b32 v18, v15 offset:256 | |
s_waitcnt lgkmcnt(0) | |
ds_read_b32 v15, v16 offset:64 | |
ds_read_b32 v16, v18 offset:512 | |
v_mov_b32_e32 v17, -1 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v15, v15, v16 | |
ds_write_b32 v18, v15 offset:512 | |
s_waitcnt lgkmcnt(0) | |
BB4_232: ; %Flow1127 | |
s_or_b64 exec, exec, s[10:11] | |
v_cmp_ne_u32_e32 vcc, 0, v17 | |
s_and_saveexec_b64 s[10:11], vcc | |
s_xor_b64 s[10:11], exec, s[10:11] | |
; mask branch BB4_237 | |
s_cbranch_execz BB4_237 | |
BB4_233: | |
v_or_b32_e32 v15, 40, v14 | |
v_add_i32_e32 v15, vcc, v15, v2 | |
v_mul_lo_i32 v16, v15, 3 | |
v_mov_b32_e32 v15, 0xe0 | |
v_mad_i32_i24 v15, v15, v1, v18 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v17, v15 | |
ds_read_b32 v15, v15 offset:32 | |
v_add_i32_e32 v16, vcc, v1, v16 | |
v_mov_b32_e32 v19, s13 | |
s_mov_b32 s19, 0xf000 | |
s_mov_b32 s18, 0 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v15, v17, v15 | |
v_ashrrev_i32_e32 v17, 31, v16 | |
v_lshl_b64 v[16:17], v[16:17], 2 | |
v_add_i32_e32 v27, vcc, s12, v16 | |
s_mov_b64 s[16:17], s[12:13] | |
v_addc_u32_e32 v28, vcc, v17, v19, vcc | |
buffer_load_dword v17, v[16:17], s[16:19], 0 addr64 | |
s_mov_b64 s[16:17], 0 | |
s_mov_b64 s[14:15], s[16:17] | |
s_waitcnt vmcnt(0) | |
BB4_234: ; =>This Inner Loop Header: Depth=1 | |
v_mov_b32_e32 v16, -1 | |
v_add_f32_e32 v16, v15, v17 | |
v_mov_b32_e32 v20, v17 | |
v_mov_b32_e32 v19, v16 | |
buffer_atomic_cmpswap v[19:20], v[27:28], s[16:19], 0 addr64 glc | |
v_mov_b32_e32 v16, 0xf000 | |
s_waitcnt vmcnt(0) expcnt(0) | |
v_cmp_eq_u32_e32 vcc, v19, v17 | |
s_or_b64 s[14:15], vcc, s[14:15] | |
v_mov_b32_e32 v17, v19 | |
s_andn2_b64 exec, exec, s[14:15] | |
s_cbranch_execnz BB4_234 | |
; BB#235: ; %atomicAdd_g_f.exit.i283 | |
s_or_b64 exec, exec, s[14:15] | |
s_and_b64 s[14:15], exec, s[2:3] | |
v_cndmask_b32_e64 v16, 0, 1, s[14:15] | |
v_cmp_ne_u32_e32 vcc, 1, v16 | |
s_and_b64 vcc, exec, vcc | |
s_cbranch_vccnz BB4_237 | |
; BB#236: | |
v_add_f32_e32 v3, v15, v3 | |
BB4_237: ; %Flow1128 | |
s_or_b64 exec, exec, s[10:11] | |
BB4_238: ; %reduce_force_i_pow2.exit297 | |
s_or_b64 exec, exec, s[6:7] | |
s_mov_b32 m0, -1 | |
s_barrier | |
ds_write_b32 v5, v11 | |
ds_write_b32 v6, v12 | |
ds_write_b32 v7, v13 | |
s_waitcnt lgkmcnt(0) | |
s_barrier | |
s_and_saveexec_b64 s[6:7], s[0:1] | |
s_xor_b64 s[6:7], exec, s[6:7] | |
; mask branch BB4_249 | |
s_cbranch_execz BB4_249 | |
BB4_239: | |
s_mov_b32 m0, -1 | |
ds_read_b32 v11, v18 offset:128 | |
ds_read_b32 v12, v18 | |
v_add_i32_e32 v13, vcc, v0, v26 | |
v_lshlrev_b32_e32 v13, 2, v13 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v11, v11, v12 | |
ds_write_b32 v18, v11 | |
s_waitcnt lgkmcnt(0) | |
v_add_i32_e32 v11, vcc, s4, v13 | |
ds_read_b32 v12, v11 offset:128 | |
ds_read_b32 v13, v18 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v12, v12, v13 | |
ds_write_b32 v18, v12 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_i32_e32 v12, vcc, v0, v22 | |
v_lshlrev_b32_e32 v12, 2, v12 | |
v_add_i32_e32 v12, vcc, s4, v12 | |
ds_read_b32 v13, v12 offset:128 | |
ds_read_b32 v15, v18 offset:512 | |
v_cmp_lt_i32_e32 vcc, 1, v1 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v13, v13, v15 | |
ds_write_b32 v18, v13 offset:512 | |
s_waitcnt lgkmcnt(0) | |
v_mov_b32_e32 v13, 0 | |
s_and_saveexec_b64 s[10:11], vcc | |
s_xor_b64 s[10:11], exec, s[10:11] | |
; mask branch BB4_241 | |
BB4_240: | |
v_cmp_eq_u32_e32 vcc, 2, v1 | |
v_cndmask_b32_e64 v13, 0, -1, vcc | |
BB4_241: ; %Flow1123 | |
s_or_saveexec_b64 s[10:11], s[10:11] | |
s_xor_b64 exec, exec, s[10:11] | |
; mask branch BB4_243 | |
s_cbranch_execz BB4_243 | |
BB4_242: ; %.thread85.i246 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v13, v18 offset:64 | |
ds_read_b32 v15, v18 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v13, v13, v15 | |
ds_write_b32 v18, v13 | |
ds_read_b32 v11, v11 offset:64 | |
s_waitcnt lgkmcnt(0) | |
ds_read_b32 v13, v18 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v11, v11, v13 | |
ds_write_b32 v18, v11 offset:256 | |
s_waitcnt lgkmcnt(0) | |
ds_read_b32 v11, v12 offset:64 | |
ds_read_b32 v12, v18 offset:512 | |
v_mov_b32_e32 v13, -1 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v11, v11, v12 | |
ds_write_b32 v18, v11 offset:512 | |
s_waitcnt lgkmcnt(0) | |
BB4_243: ; %Flow1124 | |
s_or_b64 exec, exec, s[10:11] | |
v_cmp_ne_u32_e32 vcc, 0, v13 | |
s_and_saveexec_b64 s[10:11], vcc | |
s_xor_b64 s[10:11], exec, s[10:11] | |
; mask branch BB4_248 | |
s_cbranch_execz BB4_248 | |
BB4_244: | |
v_or_b32_e32 v11, 48, v14 | |
v_add_i32_e32 v11, vcc, v11, v2 | |
v_mul_lo_i32 v12, v11, 3 | |
v_mov_b32_e32 v11, 0xe0 | |
v_mad_i32_i24 v11, v11, v1, v18 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v13, v11 | |
ds_read_b32 v11, v11 offset:32 | |
v_add_i32_e32 v12, vcc, v1, v12 | |
v_mov_b32_e32 v15, s13 | |
s_mov_b32 s19, 0xf000 | |
s_mov_b32 s18, 0 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v11, v13, v11 | |
v_ashrrev_i32_e32 v13, 31, v12 | |
v_lshl_b64 v[12:13], v[12:13], 2 | |
v_add_i32_e32 v27, vcc, s12, v12 | |
s_mov_b64 s[16:17], s[12:13] | |
v_addc_u32_e32 v28, vcc, v13, v15, vcc | |
buffer_load_dword v13, v[12:13], s[16:19], 0 addr64 | |
s_mov_b64 s[16:17], 0 | |
s_mov_b64 s[14:15], s[16:17] | |
s_waitcnt vmcnt(0) | |
BB4_245: ; =>This Inner Loop Header: Depth=1 | |
v_mov_b32_e32 v12, -1 | |
v_add_f32_e32 v12, v11, v13 | |
v_mov_b32_e32 v16, v13 | |
v_mov_b32_e32 v15, v12 | |
buffer_atomic_cmpswap v[15:16], v[27:28], s[16:19], 0 addr64 glc | |
v_mov_b32_e32 v12, 0xf000 | |
s_waitcnt vmcnt(0) expcnt(0) | |
v_cmp_eq_u32_e32 vcc, v15, v13 | |
s_or_b64 s[14:15], vcc, s[14:15] | |
v_mov_b32_e32 v13, v15 | |
s_andn2_b64 exec, exec, s[14:15] | |
s_cbranch_execnz BB4_245 | |
; BB#246: ; %atomicAdd_g_f.exit.i234 | |
s_or_b64 exec, exec, s[14:15] | |
s_and_b64 s[14:15], exec, s[2:3] | |
v_cndmask_b32_e64 v12, 0, 1, s[14:15] | |
v_cmp_ne_u32_e32 vcc, 1, v12 | |
s_and_b64 vcc, exec, vcc | |
s_cbranch_vccnz BB4_248 | |
; BB#247: | |
v_add_f32_e32 v3, v11, v3 | |
BB4_248: ; %Flow1125 | |
s_or_b64 exec, exec, s[10:11] | |
BB4_249: ; %reduce_force_i_pow2.exit248 | |
s_or_b64 exec, exec, s[6:7] | |
s_mov_b32 m0, -1 | |
s_barrier | |
ds_write_b32 v5, v8 | |
ds_write_b32 v6, v9 | |
ds_write_b32 v7, v10 | |
s_waitcnt lgkmcnt(0) | |
s_barrier | |
s_and_saveexec_b64 s[6:7], s[0:1] | |
s_xor_b64 s[0:1], exec, s[6:7] | |
; mask branch BB4_260 | |
s_cbranch_execz BB4_260 | |
BB4_250: | |
s_mov_b32 m0, -1 | |
ds_read_b32 v5, v18 offset:128 | |
ds_read_b32 v6, v18 | |
v_add_i32_e32 v7, vcc, v0, v26 | |
v_lshlrev_b32_e32 v7, 2, v7 | |
v_add_i32_e32 v0, vcc, v0, v22 | |
v_lshlrev_b32_e32 v0, 2, v0 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v5, v5, v6 | |
ds_write_b32 v18, v5 | |
s_waitcnt lgkmcnt(0) | |
v_add_i32_e32 v5, vcc, s4, v7 | |
ds_read_b32 v6, v5 offset:128 | |
ds_read_b32 v7, v18 offset:256 | |
v_add_i32_e32 v0, vcc, s4, v0 | |
v_cmp_lt_i32_e32 vcc, 1, v1 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v6, v6, v7 | |
ds_write_b32 v18, v6 offset:256 | |
s_waitcnt lgkmcnt(0) | |
ds_read_b32 v6, v0 offset:128 | |
ds_read_b32 v7, v18 offset:512 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v6, v6, v7 | |
ds_write_b32 v18, v6 offset:512 | |
s_waitcnt lgkmcnt(0) | |
v_mov_b32_e32 v6, 0 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[4:5], exec, s[4:5] | |
; mask branch BB4_252 | |
BB4_251: | |
v_cmp_eq_u32_e32 vcc, 2, v1 | |
v_cndmask_b32_e64 v6, 0, -1, vcc | |
BB4_252: ; %Flow1120 | |
s_or_saveexec_b64 s[4:5], s[4:5] | |
s_xor_b64 exec, exec, s[4:5] | |
; mask branch BB4_254 | |
s_cbranch_execz BB4_254 | |
BB4_253: ; %.thread85.i197 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v6, v18 offset:64 | |
ds_read_b32 v7, v18 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v6, v6, v7 | |
ds_write_b32 v18, v6 | |
ds_read_b32 v5, v5 offset:64 | |
s_waitcnt lgkmcnt(0) | |
ds_read_b32 v6, v18 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v5, v5, v6 | |
ds_write_b32 v18, v5 offset:256 | |
ds_read_b32 v0, v0 offset:64 | |
s_waitcnt lgkmcnt(0) | |
ds_read_b32 v5, v18 offset:512 | |
v_mov_b32_e32 v6, -1 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v0, v0, v5 | |
ds_write_b32 v18, v0 offset:512 | |
s_waitcnt lgkmcnt(0) | |
BB4_254: ; %Flow1121 | |
s_or_b64 exec, exec, s[4:5] | |
v_cmp_ne_u32_e32 vcc, 0, v6 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[4:5], exec, s[4:5] | |
; mask branch BB4_259 | |
s_cbranch_execz BB4_259 | |
BB4_255: | |
v_or_b32_e32 v0, 56, v14 | |
v_add_i32_e32 v0, vcc, v0, v2 | |
v_mul_lo_i32 v2, v0, 3 | |
v_mov_b32_e32 v0, 0xe0 | |
v_mad_i32_i24 v0, v0, v1, v18 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v5, v0 | |
ds_read_b32 v0, v0 offset:32 | |
s_mov_b32 s15, 0xf000 | |
s_mov_b32 s14, 0 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v0, v5, v0 | |
v_add_i32_e32 v5, vcc, v1, v2 | |
v_ashrrev_i32_e32 v6, 31, v5 | |
v_lshl_b64 v[7:8], v[5:6], 2 | |
v_add_i32_e32 v5, vcc, s12, v7 | |
v_mov_b32_e32 v2, s13 | |
v_addc_u32_e32 v6, vcc, v8, v2, vcc | |
buffer_load_dword v8, v[7:8], s[12:15], 0 addr64 | |
s_mov_b64 s[12:13], 0 | |
s_mov_b64 s[6:7], s[12:13] | |
s_waitcnt vmcnt(0) | |
BB4_256: ; =>This Inner Loop Header: Depth=1 | |
v_add_f32_e32 v7, v0, v8 | |
v_mov_b32_e32 v10, v8 | |
v_mov_b32_e32 v9, v7 | |
buffer_atomic_cmpswap v[9:10], v[5:6], s[12:15], 0 addr64 glc | |
v_mov_b32_e32 v2, -1 | |
v_mov_b32_e32 v2, 0xf000 | |
s_waitcnt vmcnt(0) expcnt(0) | |
v_cmp_eq_u32_e32 vcc, v9, v8 | |
s_or_b64 s[6:7], vcc, s[6:7] | |
v_mov_b32_e32 v8, v9 | |
s_andn2_b64 exec, exec, s[6:7] | |
s_cbranch_execnz BB4_256 | |
; BB#257: ; %atomicAdd_g_f.exit.i185 | |
s_or_b64 exec, exec, s[6:7] | |
s_and_b64 s[6:7], exec, s[2:3] | |
v_cndmask_b32_e64 v2, 0, 1, s[6:7] | |
v_cmp_ne_u32_e32 vcc, 1, v2 | |
s_and_b64 vcc, exec, vcc | |
s_cbranch_vccnz BB4_259 | |
; BB#258: | |
v_add_f32_e32 v3, v0, v3 | |
BB4_259: ; %Flow1122 | |
s_or_b64 exec, exec, s[4:5] | |
BB4_260: ; %reduce_force_i_pow2.exit199 | |
s_or_b64 exec, exec, s[0:1] | |
s_barrier | |
v_cmp_gt_u32_e32 vcc, 3, v1 | |
s_and_b64 s[0:1], exec, s[2:3] | |
s_and_b64 s[0:1], vcc, s[0:1] | |
s_and_saveexec_b64 s[2:3], s[0:1] | |
s_xor_b64 s[0:1], exec, s[2:3] | |
; mask branch BB4_264 | |
s_cbranch_execz BB4_264 | |
BB4_261: | |
v_add_i32_e32 v0, vcc, v4, v1 | |
v_mov_b32_e32 v1, 0 | |
v_lshl_b64 v[0:1], v[0:1], 2 | |
v_add_i32_e32 v4, vcc, s8, v0 | |
v_mov_b32_e32 v2, s9 | |
s_mov_b32 s11, 0xf000 | |
s_mov_b32 s10, 0 | |
v_addc_u32_e32 v5, vcc, v1, v2, vcc | |
buffer_load_dword v1, v[0:1], s[8:11], 0 addr64 | |
s_mov_b64 s[8:9], 0 | |
s_mov_b64 s[2:3], s[8:9] | |
s_waitcnt vmcnt(0) | |
BB4_262: ; =>This Inner Loop Header: Depth=1 | |
v_mov_b32_e32 v0, -1 | |
v_add_f32_e32 v0, v3, v1 | |
v_mov_b32_e32 v7, v1 | |
v_mov_b32_e32 v6, v0 | |
buffer_atomic_cmpswap v[6:7], v[4:5], s[8:11], 0 addr64 glc | |
v_mov_b32_e32 v0, 0xf000 | |
s_waitcnt vmcnt(0) expcnt(0) | |
v_cmp_eq_u32_e32 vcc, v6, v1 | |
s_or_b64 s[2:3], vcc, s[2:3] | |
v_mov_b32_e32 v1, v6 | |
s_andn2_b64 exec, exec, s[2:3] | |
s_cbranch_execnz BB4_262 | |
; BB#263: ; %Flow | |
s_or_b64 exec, exec, s[2:3] | |
BB4_264: ; %Flow1119 | |
s_or_b64 exec, exec, s[0:1] | |
s_endpgm | |
.Lfunc_end4: | |
.size nbnxn_kernel_ElecEw_VdwLJCombGeom_F_opencl, .Lfunc_end4-nbnxn_kernel_ElecEw_VdwLJCombGeom_F_opencl | |
.section .AMDGPU.csdata | |
; Kernel info: | |
; codeLenInByte = 20948 | |
; NumSgprs: 42 | |
; NumVgprs: 92 | |
; FloatMode: 192 | |
; IeeeMode: 1 | |
; ScratchSize: 0 | |
; LDSByteSize: 0 bytes/workgroup (compile time only) | |
; SGPRBlocks: 5 | |
; VGPRBlocks: 22 | |
; NumSGPRsForWavesPerEU: 42 | |
; NumVGPRsForWavesPerEU: 92 | |
; ReservedVGPRFirst: 0 | |
; ReservedVGPRCount: 0 | |
; COMPUTE_PGM_RSRC2:USER_SGPR: 8 | |
; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 | |
; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 | |
; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 | |
; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 1 | |
.section .AMDGPU.config | |
.long 47176 | |
.long 11272534 | |
.long 47180 | |
.long 2192 | |
.long 47200 | |
.long 0 | |
.long 4 | |
.long 0 | |
.long 8 | |
.long 0 | |
.text | |
.globl nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_F_opencl | |
.p2align 8 | |
.type nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_F_opencl,@function | |
.amdgpu_hsa_kernel nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_F_opencl | |
nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_F_opencl: ; @nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_F_opencl | |
.amd_kernel_code_t | |
amd_code_version_major = 1 | |
amd_code_version_minor = 0 | |
amd_machine_kind = 1 | |
amd_machine_version_major = 7 | |
amd_machine_version_minor = 0 | |
amd_machine_version_stepping = 1 | |
kernel_code_entry_byte_offset = 256 | |
kernel_code_prefetch_byte_size = 0 | |
max_scratch_backing_memory_byte_size = 0 | |
granulated_workitem_vgpr_count = 22 | |
granulated_wavefront_sgpr_count = 5 | |
priority = 0 | |
float_mode = 192 | |
priv = 0 | |
enable_dx10_clamp = 1 | |
debug_mode = 0 | |
enable_ieee_mode = 1 | |
enable_sgpr_private_segment_wave_byte_offset = 0 | |
user_sgpr_count = 8 | |
enable_sgpr_workgroup_id_x = 1 | |
enable_sgpr_workgroup_id_y = 0 | |
enable_sgpr_workgroup_id_z = 0 | |
enable_sgpr_workgroup_info = 0 | |
enable_vgpr_workitem_id = 1 | |
enable_exception_msb = 0 | |
granulated_lds_size = 0 | |
enable_exception = 0 | |
enable_sgpr_private_segment_buffer = 1 | |
enable_sgpr_dispatch_ptr = 1 | |
enable_sgpr_queue_ptr = 0 | |
enable_sgpr_kernarg_segment_ptr = 1 | |
enable_sgpr_dispatch_id = 0 | |
enable_sgpr_flat_scratch_init = 0 | |
enable_sgpr_private_segment_size = 0 | |
enable_sgpr_grid_workgroup_count_x = 0 | |
enable_sgpr_grid_workgroup_count_y = 0 | |
enable_sgpr_grid_workgroup_count_z = 0 | |
enable_ordered_append_gds = 0 | |
private_element_size = 1 | |
is_ptr64 = 1 | |
is_dynamic_callstack = 0 | |
is_debug_enabled = 0 | |
is_xnack_enabled = 0 | |
workitem_private_segment_byte_size = 0 | |
workgroup_group_segment_byte_size = 0 | |
gds_segment_byte_size = 0 | |
kernarg_segment_byte_size = 232 | |
workgroup_fbarrier_count = 0 | |
wavefront_sgpr_count = 46 | |
workitem_vgpr_count = 92 | |
reserved_vgpr_first = 0 | |
reserved_vgpr_count = 0 | |
reserved_sgpr_first = 0 | |
reserved_sgpr_count = 0 | |
debug_wavefront_private_segment_offset_sgpr = 0 | |
debug_private_segment_buffer_sgpr = 0 | |
kernarg_segment_alignment = 4 | |
group_segment_alignment = 4 | |
private_segment_alignment = 4 | |
wavefront_size = 6 | |
call_convention = 0 | |
runtime_loader_kernel_symbol = 0 | |
.end_amd_kernel_code_t | |
; BB#0: | |
s_load_dwordx2 s[0:1], s[6:7], 0x2c | |
s_mov_b32 s9, 0 | |
s_lshl_b64 s[10:11], s[8:9], 4 | |
v_mov_b32_e32 v3, s10 | |
s_mov_b32 s2, s9 | |
s_mov_b32 s3, 0xf000 | |
v_mov_b32_e32 v4, s11 | |
s_waitcnt lgkmcnt(0) | |
buffer_load_dwordx4 v[31:34], v[3:4], s[0:3], 0 addr64 | |
v_mov_b32_e32 v2, v0 | |
s_load_dwordx2 s[8:9], s[6:7], 0x24 | |
s_load_dwordx2 s[16:17], s[6:7], 0x18 | |
s_mov_b64 s[18:19], s[2:3] | |
s_mov_b64 s[10:11], s[2:3] | |
s_load_dword s14, s[6:7], 0x33 | |
s_load_dword s0, s[6:7], 0x2 | |
s_load_dwordx2 s[20:21], s[6:7], 0x22 | |
s_mov_b32 m0, -1 | |
s_mov_b64 s[22:23], s[2:3] | |
s_load_dword s1, s[4:5], 0x1 | |
s_waitcnt lgkmcnt(0) | |
s_add_i32 s4, s14, 0x420 | |
s_waitcnt vmcnt(0) | |
v_lshlrev_b32_e32 v40, 3, v31 | |
v_mul_lo_i32 v4, v32, 3 | |
v_add_i32_e32 v0, vcc, v1, v40 | |
v_lshlrev_b32_e32 v0, 3, v0 | |
v_add_i32_e32 v9, vcc, v2, v0 | |
v_ashrrev_i32_e32 v10, 31, v9 | |
v_ashrrev_i32_e32 v5, 31, v4 | |
v_lshl_b64 v[11:12], v[4:5], 2 | |
v_lshl_b64 v[6:7], v[9:10], 4 | |
buffer_load_dwordx4 v[5:8], v[6:7], s[16:19], 0 addr64 | |
buffer_load_dwordx2 v[13:14], v[11:12], s[8:11], 0 addr64 | |
buffer_load_dword v0, v[11:12], s[8:11], 0 addr64 offset:8 | |
s_waitcnt vmcnt(1) | |
v_add_f32_e32 v11, v5, v13 | |
s_waitcnt vmcnt(0) | |
v_add_f32_e32 v5, v7, v0 | |
v_lshlrev_b32_e32 v0, 3, v1 | |
v_add_i32_e32 v39, vcc, v2, v0 | |
v_lshlrev_b32_e32 v3, 4, v39 | |
v_add_f32_e32 v12, v6, v14 | |
v_mul_f32_e32 v6, s0, v8 | |
v_add_i32_e32 v3, vcc, s14, v3 | |
ds_write2_b64 v3, v[11:12], v[5:6] offset1:1 | |
s_waitcnt lgkmcnt(0) | |
v_lshl_b64 v[5:6], v[9:10], 3 | |
buffer_load_dwordx2 v[5:6], v[5:6], s[20:23], 0 addr64 | |
s_and_b32 s0, s1, 0xffff | |
v_mad_u32_u24 v52, s0, v1, v2 | |
v_lshlrev_b32_e32 v7, 3, v39 | |
v_add_i32_e32 v7, vcc, s4, v7 | |
v_or_b32_e32 v3, 32, v52 | |
v_lshrrev_b32_e32 v41, 5, v52 | |
v_cmp_eq_u32_e32 vcc, 32, v3 | |
s_waitcnt vmcnt(0) | |
ds_write_b64 v7, v[5:6] | |
s_and_saveexec_b64 s[0:1], vcc | |
s_xor_b64 s[0:1], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
; mask branch BB5_2 | |
BB5_1: | |
v_lshlrev_b32_e32 v3, 2, v41 | |
v_add_i32_e32 v3, vcc, s14, v3 | |
v_mov_b32_e32 v5, 0 | |
s_mov_b32 m0, -1 | |
ds_write_b32 v3, v5 offset:2336 | |
s_waitcnt lgkmcnt(0) | |
BB5_2: ; %.preheader456605 | |
s_or_b64 exec, exec, s[0:1] | |
s_barrier | |
s_load_dwordx2 s[12:13], s[6:7], 0x1a | |
v_cmp_lt_i32_e32 vcc, v33, v34 | |
s_and_b64 vcc, exec, vcc | |
v_mov_b32_e32 v12, -1 | |
s_waitcnt lgkmcnt(0) | |
s_mov_b64 vcc, vcc | |
s_cbranch_vccnz BB5_4 | |
; BB#3: ; %.preheader456605.._crit_edge_crit_edge | |
v_mov_b32_e32 v43, 0 | |
v_lshlrev_b32_e32 v3, 2, v52 | |
v_mov_b32_e32 v44, v43 | |
v_mov_b32_e32 v45, v43 | |
v_add_i32_e32 v3, vcc, s14, v3 | |
v_mov_b32_e32 v8, v43 | |
v_add_i32_e32 v5, vcc, 0x620, v3 | |
v_add_i32_e32 v6, vcc, 0x720, v3 | |
v_add_i32_e32 v7, vcc, 0x820, v3 | |
v_mov_b32_e32 v12, 0 | |
v_mov_b32_e32 v9, v44 | |
v_mov_b32_e32 v10, v45 | |
v_mov_b32_e32 v11, v46 | |
s_branch BB5_5 | |
BB5_4: | |
; implicit-def: %VGPR43_VGPR44_VGPR45_VGPR46 | |
; implicit-def: %VGPR5 | |
; implicit-def: %VGPR6 | |
; implicit-def: %VGPR7 | |
; implicit-def: %VGPR8_VGPR9_VGPR10_VGPR11 | |
BB5_5: ; %Flow1190 | |
s_load_dwordx2 s[8:9], s[6:7], 0x20 | |
v_cmp_ne_u32_e32 vcc, 0, v12 | |
v_cndmask_b32_e64 v11, 0, 1, vcc | |
v_cmp_ne_u32_e32 vcc, 1, v11 | |
v_mov_b32_e32 v35, v43 | |
v_mov_b32_e32 v27, v43 | |
v_mov_b32_e32 v23, v43 | |
v_mov_b32_e32 v19, v43 | |
v_mov_b32_e32 v15, v43 | |
v_mov_b32_e32 v11, v43 | |
s_movk_i32 s5, 0x620 | |
v_mov_b32_e32 v3, 0 | |
s_add_i32 s10, s14, s5 | |
s_and_b64 vcc, exec, vcc | |
v_mov_b32_e32 v36, v44 | |
v_mov_b32_e32 v37, v45 | |
v_mov_b32_e32 v38, v46 | |
v_mov_b32_e32 v28, v44 | |
v_mov_b32_e32 v29, v45 | |
v_mov_b32_e32 v30, v46 | |
v_mov_b32_e32 v24, v44 | |
v_mov_b32_e32 v25, v45 | |
v_mov_b32_e32 v26, v46 | |
v_mov_b32_e32 v20, v44 | |
v_mov_b32_e32 v21, v45 | |
v_mov_b32_e32 v22, v46 | |
v_mov_b32_e32 v16, v44 | |
v_mov_b32_e32 v17, v45 | |
v_mov_b32_e32 v18, v46 | |
v_mov_b32_e32 v12, v44 | |
v_mov_b32_e32 v13, v45 | |
v_mov_b32_e32 v14, v46 | |
s_waitcnt lgkmcnt(0) | |
s_mov_b64 vcc, vcc | |
s_cbranch_vccnz BB5_172 | |
; BB#6: ; %.lr.ph | |
v_or_b32_e32 v5, 4, v1 | |
v_cmp_eq_u32_e32 vcc, 4, v5 | |
v_cmp_gt_u32_e64 s[0:1], 4, v2 | |
s_and_b64 s[30:31], s[0:1], vcc | |
v_add_i32_e32 v5, vcc, v1, v2 | |
v_and_b32_e32 v8, 4, v1 | |
s_add_i32 s18, s14, 0x400 | |
v_lshlrev_b32_e32 v5, 2, v5 | |
v_lshlrev_b32_e32 v8, 2, v8 | |
v_add_i32_e32 v50, vcc, s18, v5 | |
v_lshlrev_b32_e32 v5, 2, v52 | |
v_and_b32_e32 v48, 31, v52 | |
v_add_i32_e32 v52, vcc, s18, v8 | |
v_lshlrev_b32_e32 v8, 4, v2 | |
s_load_dword s15, s[6:7], 0x5 | |
v_add_i32_e32 v53, vcc, s14, v8 | |
v_lshlrev_b32_e32 v8, 3, v2 | |
v_add_i32_e32 v54, vcc, s4, v8 | |
v_mov_b32_e32 v8, 0 | |
s_load_dwordx2 s[24:25], s[6:7], 0x30 | |
s_load_dword s11, s[6:7], 0x9 | |
s_load_dword s26, s[6:7], 0xa | |
s_load_dwordx2 s[28:29], s[6:7], 0x2e | |
v_mov_b32_e32 v9, v8 | |
v_mov_b32_e32 v10, v8 | |
v_mov_b32_e32 v14, v11 | |
v_mov_b32_e32 v13, v10 | |
v_mov_b32_e32 v12, v9 | |
v_mov_b32_e32 v11, v8 | |
v_add_i32_e32 v7, vcc, s14, v5 | |
v_mov_b32_e32 v18, v11 | |
v_mov_b32_e32 v22, v11 | |
v_mov_b32_e32 v26, v11 | |
v_mov_b32_e32 v30, v11 | |
v_mov_b32_e32 v38, v11 | |
v_mov_b32_e32 v46, v11 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e64 v47, s15, s15 | |
v_mov_b32_e32 v42, 0 | |
s_mov_b32 s34, 0 | |
v_mov_b32_e32 v49, v42 | |
v_cmp_gt_u32_e64 s[0:1], v1, v2 | |
v_cmp_ne_u32_e64 s[2:3], 22, v32 | |
v_mul_f32_e32 v51, s15, v47 | |
v_add_i32_e32 v5, vcc, s5, v7 | |
v_add_i32_e32 v6, vcc, 0x720, v7 | |
v_add_i32_e32 v7, vcc, 0x820, v7 | |
s_mov_b32 s35, 0xf000 | |
s_mov_b64 s[32:33], 0 | |
v_ashrrev_i32_e32 v56, 31, v33 | |
v_mov_b32_e32 v55, v33 | |
v_or_b32_e32 v33, 7, v40 | |
v_or_b32_e32 v57, 6, v40 | |
v_or_b32_e32 v58, 5, v40 | |
v_or_b32_e32 v59, 4, v40 | |
v_or_b32_e32 v60, 3, v40 | |
v_or_b32_e32 v61, 2, v40 | |
v_or_b32_e32 v62, 1, v40 | |
v_mov_b32_e32 v17, v10 | |
v_mov_b32_e32 v16, v9 | |
v_mov_b32_e32 v15, v8 | |
v_mov_b32_e32 v21, v10 | |
v_mov_b32_e32 v20, v9 | |
v_mov_b32_e32 v19, v8 | |
v_mov_b32_e32 v25, v10 | |
v_mov_b32_e32 v24, v9 | |
v_mov_b32_e32 v23, v8 | |
v_mov_b32_e32 v29, v10 | |
v_mov_b32_e32 v28, v9 | |
v_mov_b32_e32 v27, v8 | |
v_mov_b32_e32 v37, v10 | |
v_mov_b32_e32 v36, v9 | |
v_mov_b32_e32 v35, v8 | |
v_mov_b32_e32 v45, v10 | |
v_mov_b32_e32 v44, v9 | |
v_mov_b32_e32 v43, v8 | |
; implicit-def: %VGPR63_VGPR64_VGPR65_VGPR66 | |
BB5_7: ; =>This Loop Header: Depth=1 | |
; Child Loop BB5_47 Depth 2 | |
; Child Loop BB5_87 Depth 2 | |
; Child Loop BB5_127 Depth 2 | |
; Child Loop BB5_167 Depth 2 | |
v_lshl_b64 v[63:64], v[55:56], 5 | |
v_add_i32_e32 v67, vcc, s28, v63 | |
v_mov_b32_e32 v63, s29 | |
v_addc_u32_e32 v64, vcc, v64, v63, vcc | |
v_lshl_b64 v[68:69], v[41:42], 3 | |
v_add_i32_e32 v67, vcc, v67, v68 | |
v_addc_u32_e32 v68, vcc, v64, v69, vcc | |
buffer_load_dwordx2 v[67:68], v[67:68], s[32:35], 0 addr64 offset:16 | |
s_waitcnt vmcnt(0) | |
v_cmp_ne_u32_e32 vcc, 0, v67 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[36:37], exec, s[4:5] | |
; mask branch BB5_171 | |
s_cbranch_execz BB5_171 | |
BB5_8: ; in Loop: Header=BB5_7 Depth=1 | |
v_ashrrev_i32_e32 v69, 31, v68 | |
v_lshl_b64 v[63:64], v[68:69], 7 | |
v_add_i32_e32 v68, vcc, s24, v63 | |
v_mov_b32_e32 v63, s25 | |
v_addc_u32_e32 v64, vcc, v64, v63, vcc | |
v_lshl_b64 v[69:70], v[48:49], 2 | |
v_add_i32_e32 v68, vcc, v68, v69 | |
v_addc_u32_e32 v69, vcc, v64, v70, vcc | |
buffer_load_dword v63, v[68:69], s[32:35], 0 addr64 | |
s_and_saveexec_b64 s[4:5], s[30:31] | |
s_xor_b64 s[4:5], exec, s[4:5] | |
s_waitcnt vmcnt(0) | |
; mask branch BB5_10 | |
s_cbranch_execz BB5_10 | |
BB5_9: ; in Loop: Header=BB5_7 Depth=1 | |
v_lshl_b64 v[64:65], v[55:56], 5 | |
v_add_i32_e32 v68, vcc, s28, v64 | |
v_mov_b32_e32 v64, s29 | |
v_addc_u32_e32 v65, vcc, v65, v64, vcc | |
v_lshl_b64 v[69:70], v[2:3], 2 | |
v_add_i32_e32 v68, vcc, v68, v69 | |
v_addc_u32_e32 v69, vcc, v65, v70, vcc | |
buffer_load_dword v64, v[68:69], s[32:35], 0 addr64 | |
s_mov_b32 m0, -1 | |
s_waitcnt vmcnt(0) | |
ds_write_b32 v50, v64 | |
s_waitcnt lgkmcnt(0) | |
BB5_10: ; %.preheader.preheader | |
; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[4:5] | |
v_and_b32_e32 v64, 0xff, v67 | |
v_cmp_ne_u32_e32 vcc, 0, v64 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[38:39], exec, s[4:5] | |
; mask branch BB5_50 | |
s_cbranch_execz BB5_50 | |
BB5_11: ; in Loop: Header=BB5_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v74, v52 | |
s_mov_b64 s[18:19], s[34:35] | |
s_mov_b64 s[22:23], s[34:35] | |
s_waitcnt lgkmcnt(0) | |
v_lshlrev_b32_e32 v64, 3, v74 | |
v_add_i32_e32 v64, vcc, v64, v1 | |
v_ashrrev_i32_e32 v65, 31, v64 | |
v_lshl_b64 v[68:69], v[64:65], 4 | |
v_lshl_b64 v[75:76], v[64:65], 3 | |
buffer_load_dwordx4 v[70:73], v[68:69], s[16:19], 0 addr64 | |
buffer_load_dwordx2 v[68:69], v[75:76], s[20:23], 0 addr64 | |
v_mov_b32_e32 v75, 0 | |
v_and_b32_e32 v65, 1, v67 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
v_mov_b32_e32 v77, v75 | |
v_mov_b32_e32 v80, v75 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[14:15], exec, s[4:5] | |
; implicit-def: %VGPR81_VGPR82_VGPR83_VGPR84 | |
s_waitcnt vmcnt(0) | |
; mask branch BB5_15 | |
s_cbranch_execz BB5_15 | |
BB5_12: ; in Loop: Header=BB5_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset1:1 | |
v_cmp_ne_u32_e32 vcc, v40, v74 | |
s_and_b64 s[4:5], exec, s[2:3] | |
s_or_b64 s[4:5], s[4:5], vcc | |
s_and_b64 s[18:19], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v78, v71, v86 | |
s_or_b64 s[4:5], s[18:19], s[4:5] | |
v_subrev_f32_e32 v79, v70, v85 | |
v_mul_f32_e32 v81, v78, v78 | |
v_cndmask_b32_e64 v75, 0, 1.0, s[4:5] | |
v_subrev_f32_e32 v76, v72, v87 | |
v_mac_f32_e32 v81, v79, v79 | |
v_mac_f32_e32 v81, v76, v76 | |
v_mul_f32_e32 v75, s11, v75 | |
v_cmp_lt_f32_e32 vcc, v81, v75 | |
v_mov_b32_e32 v75, 0 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_mov_b32_e32 v77, v75 | |
v_mov_b32_e32 v80, v75 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[18:19], exec, s[4:5] | |
; implicit-def: %VGPR81_VGPR82_VGPR83_VGPR84 | |
; mask branch BB5_14 | |
s_cbranch_execz BB5_14 | |
BB5_13: ; in Loop: Header=BB5_7 Depth=1 | |
v_max_f32_e32 v77, 0x34cd15ae, v81 | |
v_mul_f32_e32 v81, v47, v77 | |
v_mul_f32_e32 v82, v81, v81 | |
v_mov_b32_e32 v83, 0x3a92b707 | |
v_madak_f32_e32 v83, v83, v82, 0x3ded3cb2 | |
v_mov_b32_e32 v84, 0x3c739487 | |
v_madak_f32_e32 v84, v84, v82, 0x3f01e2bc | |
v_mad_f32 v83, v83, v82, 1.0 | |
v_mac_f32_e32 v83, v81, v84 | |
v_mov_b32_e32 v84, 0xb2951928 | |
v_madak_f32_e32 v84, v84, v82, 0xb85ffb93 | |
v_mov_b32_e32 v85, 0x35c55945 | |
v_madak_f32_e32 v85, v85, v82, 0x3a83ca0c | |
v_madak_f32_e32 v84, v84, v82, 0xbc9ded90 | |
v_madak_f32_e32 v85, v85, v82, 0x3d8eaf3b | |
v_madak_f32_e32 v82, v84, v82, 0xbf409397 | |
v_cmp_gt_f32_e32 vcc, s26, v77 | |
v_mac_f32_e32 v82, v81, v85 | |
v_rcp_f32_e32 v81, v83 | |
v_rsq_f32_e32 v77, v77 | |
v_and_b32_e32 v80, 1, v63 | |
v_cmp_eq_u32_e64 s[4:5], 1, v80 | |
v_mul_f32_e32 v81, v51, v81 | |
v_cndmask_b32_e64 v80, 0, 1.0, s[4:5] | |
v_mul_f32_e32 v83, v77, v77 | |
v_mul_f32_e32 v82, v82, v81 | |
v_mul_f32_e32 v81, v80, v83 | |
v_mac_f32_e32 v82, v77, v81 | |
v_mul_f32_e32 v77, v83, v83 | |
s_mov_b32 m0, -1 | |
v_mul_f32_e32 v77, v80, v77 | |
ds_read_b64 v[80:81], v54 | |
v_mul_f32_e32 v77, v83, v77 | |
v_mul_f32_e32 v75, v73, v88 | |
v_mac_f32_e32 v46, v0, v65 | |
v_mul_f32_e64 v84, v65, -v0 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v81, v69, v81 | |
v_mul_f32_e64 v80, v68, -v80 | |
v_mac_f32_e32 v80, v77, v81 | |
v_cndmask_b32_e64 v81, 0, 1.0, vcc | |
v_mul_f32_e32 v81, v81, v83 | |
v_mul_f32_e32 v77, v77, v81 | |
v_mul_f32_e32 v81, v80, v77 | |
v_mac_f32_e32 v81, v82, v75 | |
v_mad_f32 v45, v76, v81, v45 | |
v_mad_f32 v44, v78, v81, v44 | |
v_mad_f32 v43, v79, v81, v43 | |
v_mul_f32_e64 v80, v81, -v76 | |
v_mul_f32_e64 v77, v81, -v78 | |
v_mul_f32_e64 v75, v81, -v79 | |
BB5_14: ; %Flow1186 | |
; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
BB5_15: ; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[14:15] | |
v_lshrrev_b32_e32 v65, 1, v67 | |
v_and_b32_e32 v65, 1, v65 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[4:5], exec, s[4:5] | |
; mask branch BB5_19 | |
s_cbranch_execz BB5_19 | |
BB5_16: ; in Loop: Header=BB5_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset0:16 offset1:17 | |
v_cmp_ne_u32_e32 vcc, v62, v74 | |
s_and_b64 s[14:15], exec, s[2:3] | |
s_or_b64 s[14:15], s[14:15], vcc | |
s_and_b64 s[18:19], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v78, v71, v86 | |
s_or_b64 s[14:15], s[18:19], s[14:15] | |
v_subrev_f32_e32 v76, v70, v85 | |
v_mul_f32_e32 v81, v78, v78 | |
v_cndmask_b32_e64 v82, 0, 1.0, s[14:15] | |
v_subrev_f32_e32 v79, v72, v87 | |
v_mac_f32_e32 v81, v76, v76 | |
v_mac_f32_e32 v81, v79, v79 | |
v_mul_f32_e32 v82, s11, v82 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_cmp_lt_f32_e32 vcc, v81, v82 | |
s_and_saveexec_b64 s[14:15], vcc | |
s_xor_b64 s[14:15], exec, s[14:15] | |
; mask branch BB5_18 | |
s_cbranch_execz BB5_18 | |
BB5_17: ; in Loop: Header=BB5_7 Depth=1 | |
v_lshrrev_b32_e32 v82, 1, v63 | |
v_max_f32_e32 v85, 0x34cd15ae, v81 | |
v_mul_f32_e32 v83, v73, v88 | |
v_and_b32_e32 v82, 1, v82 | |
v_mul_f32_e32 v88, v47, v85 | |
v_cmp_eq_u32_e32 vcc, 1, v82 | |
v_mul_f32_e32 v81, v88, v88 | |
v_mov_b32_e32 v82, 0x3a92b707 | |
v_mov_b32_e32 v89, 0x3c739487 | |
v_madak_f32_e32 v82, v82, v81, 0x3ded3cb2 | |
v_mad_f32 v90, v82, v81, 1.0 | |
v_mov_b32_e32 v82, 0xb2951928 | |
v_madak_f32_e32 v89, v89, v81, 0x3f01e2bc | |
v_mac_f32_e32 v90, v88, v89 | |
v_madak_f32_e32 v82, v82, v81, 0xb85ffb93 | |
v_mov_b32_e32 v89, 0x35c55945 | |
v_rsq_f32_e32 v87, v85 | |
v_madak_f32_e32 v89, v89, v81, 0x3a83ca0c | |
v_madak_f32_e32 v82, v82, v81, 0xbc9ded90 | |
s_mov_b32 m0, -1 | |
v_madak_f32_e32 v89, v89, v81, 0x3d8eaf3b | |
v_madak_f32_e32 v91, v82, v81, 0xbf409397 | |
ds_read_b64 v[81:82], v54 offset:64 | |
v_mac_f32_e32 v91, v88, v89 | |
v_mul_f32_e32 v88, v87, v87 | |
v_rcp_f32_e32 v90, v90 | |
v_cndmask_b32_e64 v86, 0, 1.0, vcc | |
v_mul_f32_e32 v89, v88, v88 | |
v_mul_f32_e32 v89, v86, v89 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v82, v69, v82 | |
v_mul_f32_e64 v81, v68, -v81 | |
v_mul_f32_e32 v89, v88, v89 | |
v_cmp_gt_f32_e32 vcc, s26, v85 | |
v_mac_f32_e32 v81, v89, v82 | |
v_cndmask_b32_e64 v82, 0, 1.0, vcc | |
v_mul_f32_e32 v90, v51, v90 | |
v_mul_f32_e32 v82, v82, v88 | |
v_mul_f32_e32 v82, v89, v82 | |
v_mul_f32_e32 v86, v86, v88 | |
v_mul_f32_e32 v90, v91, v90 | |
v_mac_f32_e32 v90, v87, v86 | |
v_mul_f32_e32 v81, v81, v82 | |
v_mac_f32_e32 v81, v90, v83 | |
v_mad_f32 v80, -v81, v79, v80 | |
v_mad_f32 v37, v79, v81, v37 | |
v_mad_f32 v77, -v81, v78, v77 | |
v_mad_f32 v36, v78, v81, v36 | |
v_mad_f32 v75, -v81, v76, v75 | |
v_mad_f32 v35, v76, v81, v35 | |
v_mad_f32 v84, -v65, v0, v84 | |
v_mac_f32_e32 v38, v0, v65 | |
BB5_18: ; %Flow1185 | |
; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[14:15] | |
BB5_19: ; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[4:5] | |
v_lshrrev_b32_e32 v65, 2, v67 | |
v_and_b32_e32 v65, 1, v65 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[4:5], exec, s[4:5] | |
; mask branch BB5_23 | |
s_cbranch_execz BB5_23 | |
BB5_20: ; in Loop: Header=BB5_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset0:32 offset1:33 | |
v_cmp_ne_u32_e32 vcc, v61, v74 | |
s_and_b64 s[14:15], exec, s[2:3] | |
s_or_b64 s[14:15], s[14:15], vcc | |
s_and_b64 s[18:19], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v78, v71, v86 | |
s_or_b64 s[14:15], s[18:19], s[14:15] | |
v_subrev_f32_e32 v76, v70, v85 | |
v_mul_f32_e32 v81, v78, v78 | |
v_cndmask_b32_e64 v82, 0, 1.0, s[14:15] | |
v_subrev_f32_e32 v79, v72, v87 | |
v_mac_f32_e32 v81, v76, v76 | |
v_mac_f32_e32 v81, v79, v79 | |
v_mul_f32_e32 v82, s11, v82 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_cmp_lt_f32_e32 vcc, v81, v82 | |
s_and_saveexec_b64 s[14:15], vcc | |
s_xor_b64 s[14:15], exec, s[14:15] | |
; mask branch BB5_22 | |
s_cbranch_execz BB5_22 | |
BB5_21: ; in Loop: Header=BB5_7 Depth=1 | |
v_lshrrev_b32_e32 v82, 2, v63 | |
v_max_f32_e32 v85, 0x34cd15ae, v81 | |
v_mul_f32_e32 v83, v73, v88 | |
v_and_b32_e32 v82, 1, v82 | |
v_mul_f32_e32 v88, v47, v85 | |
v_cmp_eq_u32_e32 vcc, 1, v82 | |
v_mul_f32_e32 v81, v88, v88 | |
v_mov_b32_e32 v82, 0x3a92b707 | |
v_mov_b32_e32 v89, 0x3c739487 | |
v_madak_f32_e32 v82, v82, v81, 0x3ded3cb2 | |
v_mad_f32 v90, v82, v81, 1.0 | |
v_mov_b32_e32 v82, 0xb2951928 | |
v_madak_f32_e32 v89, v89, v81, 0x3f01e2bc | |
v_mac_f32_e32 v90, v88, v89 | |
v_madak_f32_e32 v82, v82, v81, 0xb85ffb93 | |
v_mov_b32_e32 v89, 0x35c55945 | |
v_rsq_f32_e32 v87, v85 | |
v_madak_f32_e32 v89, v89, v81, 0x3a83ca0c | |
v_madak_f32_e32 v82, v82, v81, 0xbc9ded90 | |
s_mov_b32 m0, -1 | |
v_madak_f32_e32 v89, v89, v81, 0x3d8eaf3b | |
v_madak_f32_e32 v91, v82, v81, 0xbf409397 | |
ds_read_b64 v[81:82], v54 offset:128 | |
v_mac_f32_e32 v91, v88, v89 | |
v_mul_f32_e32 v88, v87, v87 | |
v_rcp_f32_e32 v90, v90 | |
v_cndmask_b32_e64 v86, 0, 1.0, vcc | |
v_mul_f32_e32 v89, v88, v88 | |
v_mul_f32_e32 v89, v86, v89 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v82, v69, v82 | |
v_mul_f32_e64 v81, v68, -v81 | |
v_mul_f32_e32 v89, v88, v89 | |
v_cmp_gt_f32_e32 vcc, s26, v85 | |
v_mac_f32_e32 v81, v89, v82 | |
v_cndmask_b32_e64 v82, 0, 1.0, vcc | |
v_mul_f32_e32 v90, v51, v90 | |
v_mul_f32_e32 v82, v82, v88 | |
v_mul_f32_e32 v82, v89, v82 | |
v_mul_f32_e32 v86, v86, v88 | |
v_mul_f32_e32 v90, v91, v90 | |
v_mac_f32_e32 v90, v87, v86 | |
v_mul_f32_e32 v81, v81, v82 | |
v_mac_f32_e32 v81, v90, v83 | |
v_mad_f32 v80, -v81, v79, v80 | |
v_mad_f32 v29, v79, v81, v29 | |
v_mad_f32 v77, -v81, v78, v77 | |
v_mad_f32 v28, v78, v81, v28 | |
v_mad_f32 v75, -v81, v76, v75 | |
v_mad_f32 v27, v76, v81, v27 | |
v_mad_f32 v84, -v65, v0, v84 | |
v_mac_f32_e32 v30, v0, v65 | |
BB5_22: ; %Flow1184 | |
; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[14:15] | |
BB5_23: ; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[4:5] | |
v_lshrrev_b32_e32 v65, 3, v67 | |
v_and_b32_e32 v65, 1, v65 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[4:5], exec, s[4:5] | |
; mask branch BB5_27 | |
s_cbranch_execz BB5_27 | |
BB5_24: ; in Loop: Header=BB5_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset0:48 offset1:49 | |
v_cmp_ne_u32_e32 vcc, v60, v74 | |
s_and_b64 s[14:15], exec, s[2:3] | |
s_or_b64 s[14:15], s[14:15], vcc | |
s_and_b64 s[18:19], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v78, v71, v86 | |
s_or_b64 s[14:15], s[18:19], s[14:15] | |
v_subrev_f32_e32 v76, v70, v85 | |
v_mul_f32_e32 v81, v78, v78 | |
v_cndmask_b32_e64 v82, 0, 1.0, s[14:15] | |
v_subrev_f32_e32 v79, v72, v87 | |
v_mac_f32_e32 v81, v76, v76 | |
v_mac_f32_e32 v81, v79, v79 | |
v_mul_f32_e32 v82, s11, v82 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_cmp_lt_f32_e32 vcc, v81, v82 | |
s_and_saveexec_b64 s[14:15], vcc | |
s_xor_b64 s[14:15], exec, s[14:15] | |
; mask branch BB5_26 | |
s_cbranch_execz BB5_26 | |
BB5_25: ; in Loop: Header=BB5_7 Depth=1 | |
v_lshrrev_b32_e32 v82, 3, v63 | |
v_max_f32_e32 v85, 0x34cd15ae, v81 | |
v_mul_f32_e32 v83, v73, v88 | |
v_and_b32_e32 v82, 1, v82 | |
v_mul_f32_e32 v88, v47, v85 | |
v_cmp_eq_u32_e32 vcc, 1, v82 | |
v_mul_f32_e32 v81, v88, v88 | |
v_mov_b32_e32 v82, 0x3a92b707 | |
v_mov_b32_e32 v89, 0x3c739487 | |
v_madak_f32_e32 v82, v82, v81, 0x3ded3cb2 | |
v_mad_f32 v90, v82, v81, 1.0 | |
v_mov_b32_e32 v82, 0xb2951928 | |
v_madak_f32_e32 v89, v89, v81, 0x3f01e2bc | |
v_mac_f32_e32 v90, v88, v89 | |
v_madak_f32_e32 v82, v82, v81, 0xb85ffb93 | |
v_mov_b32_e32 v89, 0x35c55945 | |
v_rsq_f32_e32 v87, v85 | |
v_madak_f32_e32 v89, v89, v81, 0x3a83ca0c | |
v_madak_f32_e32 v82, v82, v81, 0xbc9ded90 | |
s_mov_b32 m0, -1 | |
v_madak_f32_e32 v89, v89, v81, 0x3d8eaf3b | |
v_madak_f32_e32 v91, v82, v81, 0xbf409397 | |
ds_read_b64 v[81:82], v54 offset:192 | |
v_mac_f32_e32 v91, v88, v89 | |
v_mul_f32_e32 v88, v87, v87 | |
v_rcp_f32_e32 v90, v90 | |
v_cndmask_b32_e64 v86, 0, 1.0, vcc | |
v_mul_f32_e32 v89, v88, v88 | |
v_mul_f32_e32 v89, v86, v89 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v82, v69, v82 | |
v_mul_f32_e64 v81, v68, -v81 | |
v_mul_f32_e32 v89, v88, v89 | |
v_cmp_gt_f32_e32 vcc, s26, v85 | |
v_mac_f32_e32 v81, v89, v82 | |
v_cndmask_b32_e64 v82, 0, 1.0, vcc | |
v_mul_f32_e32 v90, v51, v90 | |
v_mul_f32_e32 v82, v82, v88 | |
v_mul_f32_e32 v82, v89, v82 | |
v_mul_f32_e32 v86, v86, v88 | |
v_mul_f32_e32 v90, v91, v90 | |
v_mac_f32_e32 v90, v87, v86 | |
v_mul_f32_e32 v81, v81, v82 | |
v_mac_f32_e32 v81, v90, v83 | |
v_mad_f32 v80, -v81, v79, v80 | |
v_mad_f32 v25, v79, v81, v25 | |
v_mad_f32 v77, -v81, v78, v77 | |
v_mad_f32 v24, v78, v81, v24 | |
v_mad_f32 v75, -v81, v76, v75 | |
v_mad_f32 v23, v76, v81, v23 | |
v_mad_f32 v84, -v65, v0, v84 | |
v_mac_f32_e32 v26, v0, v65 | |
BB5_26: ; %Flow1183 | |
; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[14:15] | |
BB5_27: ; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[4:5] | |
v_lshrrev_b32_e32 v65, 4, v67 | |
v_and_b32_e32 v65, 1, v65 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[4:5], exec, s[4:5] | |
; mask branch BB5_31 | |
s_cbranch_execz BB5_31 | |
BB5_28: ; in Loop: Header=BB5_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset0:64 offset1:65 | |
v_cmp_ne_u32_e32 vcc, v59, v74 | |
s_and_b64 s[14:15], exec, s[2:3] | |
s_or_b64 s[14:15], s[14:15], vcc | |
s_and_b64 s[18:19], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v78, v71, v86 | |
s_or_b64 s[14:15], s[18:19], s[14:15] | |
v_subrev_f32_e32 v76, v70, v85 | |
v_mul_f32_e32 v81, v78, v78 | |
v_cndmask_b32_e64 v82, 0, 1.0, s[14:15] | |
v_subrev_f32_e32 v79, v72, v87 | |
v_mac_f32_e32 v81, v76, v76 | |
v_mac_f32_e32 v81, v79, v79 | |
v_mul_f32_e32 v82, s11, v82 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_cmp_lt_f32_e32 vcc, v81, v82 | |
s_and_saveexec_b64 s[14:15], vcc | |
s_xor_b64 s[14:15], exec, s[14:15] | |
; mask branch BB5_30 | |
s_cbranch_execz BB5_30 | |
BB5_29: ; in Loop: Header=BB5_7 Depth=1 | |
v_lshrrev_b32_e32 v82, 4, v63 | |
v_max_f32_e32 v85, 0x34cd15ae, v81 | |
v_mul_f32_e32 v83, v73, v88 | |
v_and_b32_e32 v82, 1, v82 | |
v_mul_f32_e32 v88, v47, v85 | |
v_cmp_eq_u32_e32 vcc, 1, v82 | |
v_mul_f32_e32 v81, v88, v88 | |
v_mov_b32_e32 v82, 0x3a92b707 | |
v_mov_b32_e32 v89, 0x3c739487 | |
v_madak_f32_e32 v82, v82, v81, 0x3ded3cb2 | |
v_mad_f32 v90, v82, v81, 1.0 | |
v_mov_b32_e32 v82, 0xb2951928 | |
v_madak_f32_e32 v89, v89, v81, 0x3f01e2bc | |
v_mac_f32_e32 v90, v88, v89 | |
v_madak_f32_e32 v82, v82, v81, 0xb85ffb93 | |
v_mov_b32_e32 v89, 0x35c55945 | |
v_rsq_f32_e32 v87, v85 | |
v_madak_f32_e32 v89, v89, v81, 0x3a83ca0c | |
v_madak_f32_e32 v82, v82, v81, 0xbc9ded90 | |
s_mov_b32 m0, -1 | |
v_madak_f32_e32 v89, v89, v81, 0x3d8eaf3b | |
v_madak_f32_e32 v91, v82, v81, 0xbf409397 | |
ds_read_b64 v[81:82], v54 offset:256 | |
v_mac_f32_e32 v91, v88, v89 | |
v_mul_f32_e32 v88, v87, v87 | |
v_rcp_f32_e32 v90, v90 | |
v_cndmask_b32_e64 v86, 0, 1.0, vcc | |
v_mul_f32_e32 v89, v88, v88 | |
v_mul_f32_e32 v89, v86, v89 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v82, v69, v82 | |
v_mul_f32_e64 v81, v68, -v81 | |
v_mul_f32_e32 v89, v88, v89 | |
v_cmp_gt_f32_e32 vcc, s26, v85 | |
v_mac_f32_e32 v81, v89, v82 | |
v_cndmask_b32_e64 v82, 0, 1.0, vcc | |
v_mul_f32_e32 v90, v51, v90 | |
v_mul_f32_e32 v82, v82, v88 | |
v_mul_f32_e32 v82, v89, v82 | |
v_mul_f32_e32 v86, v86, v88 | |
v_mul_f32_e32 v90, v91, v90 | |
v_mac_f32_e32 v90, v87, v86 | |
v_mul_f32_e32 v81, v81, v82 | |
v_mac_f32_e32 v81, v90, v83 | |
v_mad_f32 v80, -v81, v79, v80 | |
v_mad_f32 v21, v79, v81, v21 | |
v_mad_f32 v77, -v81, v78, v77 | |
v_mad_f32 v20, v78, v81, v20 | |
v_mad_f32 v75, -v81, v76, v75 | |
v_mad_f32 v19, v76, v81, v19 | |
v_mad_f32 v84, -v65, v0, v84 | |
v_mac_f32_e32 v22, v0, v65 | |
BB5_30: ; %Flow1182 | |
; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[14:15] | |
BB5_31: ; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[4:5] | |
v_lshrrev_b32_e32 v65, 5, v67 | |
v_and_b32_e32 v65, 1, v65 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[4:5], exec, s[4:5] | |
; mask branch BB5_35 | |
s_cbranch_execz BB5_35 | |
BB5_32: ; in Loop: Header=BB5_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset0:80 offset1:81 | |
v_cmp_ne_u32_e32 vcc, v58, v74 | |
s_and_b64 s[14:15], exec, s[2:3] | |
s_or_b64 s[14:15], s[14:15], vcc | |
s_and_b64 s[18:19], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v78, v71, v86 | |
s_or_b64 s[14:15], s[18:19], s[14:15] | |
v_subrev_f32_e32 v76, v70, v85 | |
v_mul_f32_e32 v81, v78, v78 | |
v_cndmask_b32_e64 v82, 0, 1.0, s[14:15] | |
v_subrev_f32_e32 v79, v72, v87 | |
v_mac_f32_e32 v81, v76, v76 | |
v_mac_f32_e32 v81, v79, v79 | |
v_mul_f32_e32 v82, s11, v82 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_cmp_lt_f32_e32 vcc, v81, v82 | |
s_and_saveexec_b64 s[14:15], vcc | |
s_xor_b64 s[14:15], exec, s[14:15] | |
; mask branch BB5_34 | |
s_cbranch_execz BB5_34 | |
BB5_33: ; in Loop: Header=BB5_7 Depth=1 | |
v_lshrrev_b32_e32 v82, 5, v63 | |
v_max_f32_e32 v85, 0x34cd15ae, v81 | |
v_mul_f32_e32 v83, v73, v88 | |
v_and_b32_e32 v82, 1, v82 | |
v_mul_f32_e32 v88, v47, v85 | |
v_cmp_eq_u32_e32 vcc, 1, v82 | |
v_mul_f32_e32 v81, v88, v88 | |
v_mov_b32_e32 v82, 0x3a92b707 | |
v_mov_b32_e32 v89, 0x3c739487 | |
v_madak_f32_e32 v82, v82, v81, 0x3ded3cb2 | |
v_mad_f32 v90, v82, v81, 1.0 | |
v_mov_b32_e32 v82, 0xb2951928 | |
v_madak_f32_e32 v89, v89, v81, 0x3f01e2bc | |
v_mac_f32_e32 v90, v88, v89 | |
v_madak_f32_e32 v82, v82, v81, 0xb85ffb93 | |
v_mov_b32_e32 v89, 0x35c55945 | |
v_rsq_f32_e32 v87, v85 | |
v_madak_f32_e32 v89, v89, v81, 0x3a83ca0c | |
v_madak_f32_e32 v82, v82, v81, 0xbc9ded90 | |
s_mov_b32 m0, -1 | |
v_madak_f32_e32 v89, v89, v81, 0x3d8eaf3b | |
v_madak_f32_e32 v91, v82, v81, 0xbf409397 | |
ds_read_b64 v[81:82], v54 offset:320 | |
v_mac_f32_e32 v91, v88, v89 | |
v_mul_f32_e32 v88, v87, v87 | |
v_rcp_f32_e32 v90, v90 | |
v_cndmask_b32_e64 v86, 0, 1.0, vcc | |
v_mul_f32_e32 v89, v88, v88 | |
v_mul_f32_e32 v89, v86, v89 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v82, v69, v82 | |
v_mul_f32_e64 v81, v68, -v81 | |
v_mul_f32_e32 v89, v88, v89 | |
v_cmp_gt_f32_e32 vcc, s26, v85 | |
v_mac_f32_e32 v81, v89, v82 | |
v_cndmask_b32_e64 v82, 0, 1.0, vcc | |
v_mul_f32_e32 v90, v51, v90 | |
v_mul_f32_e32 v82, v82, v88 | |
v_mul_f32_e32 v82, v89, v82 | |
v_mul_f32_e32 v86, v86, v88 | |
v_mul_f32_e32 v90, v91, v90 | |
v_mac_f32_e32 v90, v87, v86 | |
v_mul_f32_e32 v81, v81, v82 | |
v_mac_f32_e32 v81, v90, v83 | |
v_mad_f32 v80, -v81, v79, v80 | |
v_mad_f32 v17, v79, v81, v17 | |
v_mad_f32 v77, -v81, v78, v77 | |
v_mad_f32 v16, v78, v81, v16 | |
v_mad_f32 v75, -v81, v76, v75 | |
v_mad_f32 v15, v76, v81, v15 | |
v_mad_f32 v84, -v65, v0, v84 | |
v_mac_f32_e32 v18, v0, v65 | |
BB5_34: ; %Flow1181 | |
; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[14:15] | |
BB5_35: ; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[4:5] | |
v_lshrrev_b32_e32 v65, 6, v67 | |
v_and_b32_e32 v65, 1, v65 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[4:5], exec, s[4:5] | |
; mask branch BB5_39 | |
s_cbranch_execz BB5_39 | |
BB5_36: ; in Loop: Header=BB5_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset0:96 offset1:97 | |
v_cmp_ne_u32_e32 vcc, v57, v74 | |
s_and_b64 s[14:15], exec, s[2:3] | |
s_or_b64 s[14:15], s[14:15], vcc | |
s_and_b64 s[18:19], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v78, v71, v86 | |
s_or_b64 s[14:15], s[18:19], s[14:15] | |
v_subrev_f32_e32 v76, v70, v85 | |
v_mul_f32_e32 v81, v78, v78 | |
v_cndmask_b32_e64 v82, 0, 1.0, s[14:15] | |
v_subrev_f32_e32 v79, v72, v87 | |
v_mac_f32_e32 v81, v76, v76 | |
v_mac_f32_e32 v81, v79, v79 | |
v_mul_f32_e32 v82, s11, v82 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_cmp_lt_f32_e32 vcc, v81, v82 | |
s_and_saveexec_b64 s[14:15], vcc | |
s_xor_b64 s[14:15], exec, s[14:15] | |
; mask branch BB5_38 | |
s_cbranch_execz BB5_38 | |
BB5_37: ; in Loop: Header=BB5_7 Depth=1 | |
v_lshrrev_b32_e32 v82, 6, v63 | |
v_max_f32_e32 v85, 0x34cd15ae, v81 | |
v_mul_f32_e32 v83, v73, v88 | |
v_and_b32_e32 v82, 1, v82 | |
v_mul_f32_e32 v88, v47, v85 | |
v_cmp_eq_u32_e32 vcc, 1, v82 | |
v_mul_f32_e32 v81, v88, v88 | |
v_mov_b32_e32 v82, 0x3a92b707 | |
v_mov_b32_e32 v89, 0x3c739487 | |
v_madak_f32_e32 v82, v82, v81, 0x3ded3cb2 | |
v_mad_f32 v90, v82, v81, 1.0 | |
v_mov_b32_e32 v82, 0xb2951928 | |
v_madak_f32_e32 v89, v89, v81, 0x3f01e2bc | |
v_mac_f32_e32 v90, v88, v89 | |
v_madak_f32_e32 v82, v82, v81, 0xb85ffb93 | |
v_mov_b32_e32 v89, 0x35c55945 | |
v_rsq_f32_e32 v87, v85 | |
v_madak_f32_e32 v89, v89, v81, 0x3a83ca0c | |
v_madak_f32_e32 v82, v82, v81, 0xbc9ded90 | |
s_mov_b32 m0, -1 | |
v_madak_f32_e32 v89, v89, v81, 0x3d8eaf3b | |
v_madak_f32_e32 v91, v82, v81, 0xbf409397 | |
ds_read_b64 v[81:82], v54 offset:384 | |
v_mac_f32_e32 v91, v88, v89 | |
v_mul_f32_e32 v88, v87, v87 | |
v_rcp_f32_e32 v90, v90 | |
v_cndmask_b32_e64 v86, 0, 1.0, vcc | |
v_mul_f32_e32 v89, v88, v88 | |
v_mul_f32_e32 v89, v86, v89 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v82, v69, v82 | |
v_mul_f32_e64 v81, v68, -v81 | |
v_mul_f32_e32 v89, v88, v89 | |
v_cmp_gt_f32_e32 vcc, s26, v85 | |
v_mac_f32_e32 v81, v89, v82 | |
v_cndmask_b32_e64 v82, 0, 1.0, vcc | |
v_mul_f32_e32 v90, v51, v90 | |
v_mul_f32_e32 v82, v82, v88 | |
v_mul_f32_e32 v82, v89, v82 | |
v_mul_f32_e32 v86, v86, v88 | |
v_mul_f32_e32 v90, v91, v90 | |
v_mac_f32_e32 v90, v87, v86 | |
v_mul_f32_e32 v81, v81, v82 | |
v_mac_f32_e32 v81, v90, v83 | |
v_mad_f32 v80, -v81, v79, v80 | |
v_mad_f32 v13, v79, v81, v13 | |
v_mad_f32 v77, -v81, v78, v77 | |
v_mad_f32 v12, v78, v81, v12 | |
v_mad_f32 v75, -v81, v76, v75 | |
v_mad_f32 v11, v76, v81, v11 | |
v_mad_f32 v84, -v65, v0, v84 | |
v_mac_f32_e32 v14, v0, v65 | |
BB5_38: ; %Flow1180 | |
; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[14:15] | |
BB5_39: ; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[4:5] | |
v_lshrrev_b32_e32 v65, 7, v67 | |
v_and_b32_e32 v65, 1, v65 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[4:5], exec, s[4:5] | |
; mask branch BB5_43 | |
s_cbranch_execz BB5_43 | |
BB5_40: ; in Loop: Header=BB5_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset0:112 offset1:113 | |
v_cmp_ne_u32_e32 vcc, v33, v74 | |
s_and_b64 s[14:15], exec, s[2:3] | |
s_or_b64 s[14:15], s[14:15], vcc | |
s_and_b64 s[18:19], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v71, v71, v86 | |
s_or_b64 s[14:15], s[18:19], s[14:15] | |
v_subrev_f32_e32 v70, v70, v85 | |
v_mul_f32_e32 v74, v71, v71 | |
v_cndmask_b32_e64 v76, 0, 1.0, s[14:15] | |
v_subrev_f32_e32 v72, v72, v87 | |
v_mac_f32_e32 v74, v70, v70 | |
v_mac_f32_e32 v74, v72, v72 | |
v_mul_f32_e32 v76, s11, v76 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_cmp_lt_f32_e32 vcc, v74, v76 | |
s_and_saveexec_b64 s[14:15], vcc | |
s_xor_b64 s[14:15], exec, s[14:15] | |
; mask branch BB5_42 | |
s_cbranch_execz BB5_42 | |
BB5_41: ; in Loop: Header=BB5_7 Depth=1 | |
v_mul_f32_e32 v79, v73, v88 | |
s_mov_b32 m0, -1 | |
v_mad_f32 v73, -v65, v0, v84 | |
v_max_f32_e32 v78, 0x34cd15ae, v74 | |
ds_read_b64 v[73:74], v54 offset:448 | |
v_mul_f32_e32 v81, v47, v78 | |
v_mul_f32_e32 v82, v81, v81 | |
v_mov_b32_e32 v83, 0x3a92b707 | |
v_lshrrev_b32_e32 v76, 7, v63 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v69, v69, v74 | |
v_mov_b32_e32 v74, 0x3c739487 | |
v_mul_f32_e64 v68, v68, -v73 | |
v_madak_f32_e32 v73, v83, v82, 0x3ded3cb2 | |
v_madak_f32_e32 v74, v74, v82, 0x3f01e2bc | |
v_mad_f32 v73, v73, v82, 1.0 | |
v_mac_f32_e32 v73, v81, v74 | |
v_mov_b32_e32 v74, 0xb2951928 | |
v_rcp_f32_e32 v73, v73 | |
v_madak_f32_e32 v74, v74, v82, 0xb85ffb93 | |
v_mov_b32_e32 v83, 0x35c55945 | |
v_madak_f32_e32 v83, v83, v82, 0x3a83ca0c | |
v_madak_f32_e32 v74, v74, v82, 0xbc9ded90 | |
v_madak_f32_e32 v83, v83, v82, 0x3d8eaf3b | |
v_madak_f32_e32 v74, v74, v82, 0xbf409397 | |
v_mac_f32_e32 v74, v81, v83 | |
v_mul_f32_e32 v73, v51, v73 | |
v_mul_f32_e32 v73, v74, v73 | |
v_rsq_f32_e32 v74, v78 | |
v_and_b32_e32 v76, 1, v76 | |
v_cmp_eq_u32_e32 vcc, 1, v76 | |
v_cndmask_b32_e64 v76, 0, 1.0, vcc | |
v_cmp_gt_f32_e32 vcc, s26, v78 | |
v_mul_f32_e32 v78, v74, v74 | |
v_mul_f32_e32 v81, v76, v78 | |
v_mac_f32_e32 v73, v74, v81 | |
v_mul_f32_e32 v74, v78, v78 | |
v_mul_f32_e32 v74, v76, v74 | |
v_mul_f32_e32 v74, v78, v74 | |
v_mac_f32_e32 v68, v74, v69 | |
v_cndmask_b32_e64 v69, 0, 1.0, vcc | |
v_mul_f32_e32 v69, v69, v78 | |
v_mul_f32_e32 v69, v74, v69 | |
v_mul_f32_e32 v68, v68, v69 | |
v_mac_f32_e32 v68, v73, v79 | |
v_mad_f32 v80, -v68, v72, v80 | |
v_mad_f32 v10, v72, v68, v10 | |
v_mad_f32 v77, -v68, v71, v77 | |
v_mad_f32 v9, v71, v68, v9 | |
v_mad_f32 v75, -v68, v70, v75 | |
v_mad_f32 v8, v70, v68, v8 | |
v_mac_f32_e32 v66, v0, v65 | |
BB5_42: ; %Flow1179 | |
; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[14:15] | |
BB5_43: ; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[4:5] | |
s_mov_b32 m0, -1 | |
v_cmp_gt_i32_e32 vcc, 3, v2 | |
ds_write_b32 v5, v75 | |
ds_write_b32 v6, v77 | |
ds_write_b32 v7, v80 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[4:5], exec, s[4:5] | |
s_waitcnt lgkmcnt(0) | |
; mask branch BB5_49 | |
s_cbranch_execz BB5_49 | |
BB5_44: ; in Loop: Header=BB5_7 Depth=1 | |
v_lshlrev_b32_e32 v68, 6, v2 | |
v_add_i32_e32 v65, vcc, v0, v68 | |
v_lshlrev_b32_e32 v65, 2, v65 | |
v_add_i32_e32 v69, vcc, s10, v65 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v65, v69 | |
v_add_i32_e32 v70, vcc, 8, v0 | |
v_or_b32_e32 v71, 1, v0 | |
v_cmp_lt_i32_e32 vcc, v71, v70 | |
s_and_saveexec_b64 s[14:15], vcc | |
s_xor_b64 s[14:15], exec, s[14:15] | |
s_waitcnt lgkmcnt(0) | |
; mask branch BB5_46 | |
s_cbranch_execz BB5_46 | |
BB5_45: ; in Loop: Header=BB5_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b32 v[70:71], v69 offset0:1 offset1:2 | |
v_or_b32_e32 v74, 3, v0 | |
v_add_i32_e32 v68, vcc, v74, v68 | |
v_lshlrev_b32_e32 v68, 2, v68 | |
ds_read2_b32 v[72:73], v69 offset0:3 offset1:4 | |
v_add_i32_e32 v68, vcc, s10, v68 | |
ds_read_b32 v75, v69 offset:28 | |
ds_read2_b32 v[68:69], v68 offset0:2 offset1:3 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v65, v65, v70 | |
v_add_f32_e32 v65, v71, v65 | |
v_add_f32_e32 v65, v72, v65 | |
v_add_f32_e32 v65, v73, v65 | |
v_add_f32_e32 v65, v68, v65 | |
v_add_f32_e32 v65, v69, v65 | |
v_add_f32_e32 v65, v75, v65 | |
BB5_46: ; %._crit_edge.i118 | |
; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[14:15] | |
v_mul_lo_i32 v64, v64, 3 | |
v_mov_b32_e32 v72, s13 | |
s_mov_b64 s[14:15], s[34:35] | |
v_add_i32_e32 v68, vcc, v64, v2 | |
v_ashrrev_i32_e32 v69, 31, v68 | |
v_lshl_b64 v[70:71], v[68:69], 2 | |
v_add_i32_e32 v68, vcc, s12, v70 | |
v_addc_u32_e32 v69, vcc, v71, v72, vcc | |
buffer_load_dword v71, v[70:71], s[12:15], 0 addr64 | |
s_mov_b64 s[14:15], 0 | |
s_waitcnt vmcnt(0) | |
BB5_47: ; Parent Loop BB5_7 Depth=1 | |
; => This Inner Loop Header: Depth=2 | |
v_add_f32_e32 v70, v65, v71 | |
v_mov_b32_e32 v73, v71 | |
v_mov_b32_e32 v72, v70 | |
buffer_atomic_cmpswap v[72:73], v[68:69], s[32:35], 0 addr64 glc | |
v_mov_b32_e32 v64, -1 | |
v_mov_b32_e32 v64, 0xf000 | |
s_waitcnt vmcnt(0) expcnt(0) | |
v_cmp_eq_u32_e32 vcc, v72, v71 | |
s_or_b64 s[14:15], vcc, s[14:15] | |
v_mov_b32_e32 v71, v72 | |
s_andn2_b64 exec, exec, s[14:15] | |
s_cbranch_execnz BB5_47 | |
; BB#48: ; %Flow1177 | |
; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[14:15] | |
BB5_49: ; %Flow1178 | |
; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[4:5] | |
BB5_50: ; %Flow1187 | |
; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[38:39] | |
v_and_b32_e32 v64, 0xff00, v67 | |
v_cmp_ne_u32_e32 vcc, 0, v64 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[14:15], exec, s[4:5] | |
; mask branch BB5_90 | |
s_cbranch_execz BB5_90 | |
BB5_51: ; in Loop: Header=BB5_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v74, v52 offset:4 | |
s_mov_b64 s[40:41], s[16:17] | |
s_mov_b64 s[42:43], s[34:35] | |
s_waitcnt lgkmcnt(0) | |
v_lshlrev_b32_e32 v64, 3, v74 | |
v_add_i32_e32 v64, vcc, v64, v1 | |
v_ashrrev_i32_e32 v65, 31, v64 | |
v_lshl_b64 v[68:69], v[64:65], 4 | |
v_lshl_b64 v[75:76], v[64:65], 3 | |
buffer_load_dwordx4 v[70:73], v[68:69], s[40:43], 0 addr64 | |
s_mov_b64 s[40:41], s[20:21] | |
buffer_load_dwordx2 v[68:69], v[75:76], s[40:43], 0 addr64 | |
v_lshrrev_b32_e32 v65, 8, v67 | |
v_mov_b32_e32 v75, 0 | |
v_and_b32_e32 v65, 1, v65 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
v_mov_b32_e32 v77, v75 | |
v_mov_b32_e32 v80, v75 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[18:19], exec, s[4:5] | |
; implicit-def: %VGPR81_VGPR82_VGPR83_VGPR84 | |
s_waitcnt vmcnt(0) | |
; mask branch BB5_55 | |
s_cbranch_execz BB5_55 | |
BB5_52: ; in Loop: Header=BB5_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset1:1 | |
v_cmp_ne_u32_e32 vcc, v40, v74 | |
s_and_b64 s[4:5], exec, s[2:3] | |
s_or_b64 s[4:5], s[4:5], vcc | |
s_and_b64 s[22:23], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v78, v71, v86 | |
s_or_b64 s[4:5], s[22:23], s[4:5] | |
v_subrev_f32_e32 v79, v70, v85 | |
v_mul_f32_e32 v81, v78, v78 | |
v_cndmask_b32_e64 v75, 0, 1.0, s[4:5] | |
v_subrev_f32_e32 v76, v72, v87 | |
v_mac_f32_e32 v81, v79, v79 | |
v_mac_f32_e32 v81, v76, v76 | |
v_mul_f32_e32 v75, s11, v75 | |
v_cmp_lt_f32_e32 vcc, v81, v75 | |
v_mov_b32_e32 v75, 0 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_mov_b32_e32 v77, v75 | |
v_mov_b32_e32 v80, v75 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[22:23], exec, s[4:5] | |
; implicit-def: %VGPR81_VGPR82_VGPR83_VGPR84 | |
; mask branch BB5_54 | |
s_cbranch_execz BB5_54 | |
BB5_53: ; in Loop: Header=BB5_7 Depth=1 | |
v_max_f32_e32 v77, 0x34cd15ae, v81 | |
v_mul_f32_e32 v81, v47, v77 | |
v_mul_f32_e32 v82, v81, v81 | |
v_mov_b32_e32 v83, 0x3a92b707 | |
v_madak_f32_e32 v83, v83, v82, 0x3ded3cb2 | |
v_mov_b32_e32 v84, 0x3c739487 | |
v_madak_f32_e32 v84, v84, v82, 0x3f01e2bc | |
v_mad_f32 v83, v83, v82, 1.0 | |
v_mac_f32_e32 v83, v81, v84 | |
v_mov_b32_e32 v84, 0xb2951928 | |
v_madak_f32_e32 v84, v84, v82, 0xb85ffb93 | |
v_mov_b32_e32 v85, 0x35c55945 | |
v_madak_f32_e32 v85, v85, v82, 0x3a83ca0c | |
v_madak_f32_e32 v84, v84, v82, 0xbc9ded90 | |
v_madak_f32_e32 v85, v85, v82, 0x3d8eaf3b | |
v_madak_f32_e32 v82, v84, v82, 0xbf409397 | |
v_cmp_gt_f32_e32 vcc, s26, v77 | |
v_mac_f32_e32 v82, v81, v85 | |
v_rcp_f32_e32 v81, v83 | |
v_rsq_f32_e32 v77, v77 | |
v_lshrrev_b32_e32 v80, 8, v63 | |
v_and_b32_e32 v80, 1, v80 | |
v_cmp_eq_u32_e64 s[4:5], 1, v80 | |
v_mul_f32_e32 v81, v51, v81 | |
v_cndmask_b32_e64 v80, 0, 1.0, s[4:5] | |
v_mul_f32_e32 v83, v77, v77 | |
v_mul_f32_e32 v82, v82, v81 | |
v_mul_f32_e32 v81, v80, v83 | |
v_mac_f32_e32 v82, v77, v81 | |
v_mul_f32_e32 v77, v83, v83 | |
s_mov_b32 m0, -1 | |
v_mul_f32_e32 v77, v80, v77 | |
ds_read_b64 v[80:81], v54 | |
v_mul_f32_e32 v77, v83, v77 | |
v_mul_f32_e32 v75, v73, v88 | |
v_mac_f32_e32 v46, v0, v65 | |
v_mul_f32_e64 v84, v65, -v0 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v81, v69, v81 | |
v_mul_f32_e64 v80, v68, -v80 | |
v_mac_f32_e32 v80, v77, v81 | |
v_cndmask_b32_e64 v81, 0, 1.0, vcc | |
v_mul_f32_e32 v81, v81, v83 | |
v_mul_f32_e32 v77, v77, v81 | |
v_mul_f32_e32 v81, v80, v77 | |
v_mac_f32_e32 v81, v82, v75 | |
v_mad_f32 v45, v76, v81, v45 | |
v_mad_f32 v44, v78, v81, v44 | |
v_mad_f32 v43, v79, v81, v43 | |
v_mul_f32_e64 v80, v81, -v76 | |
v_mul_f32_e64 v77, v81, -v78 | |
v_mul_f32_e64 v75, v81, -v79 | |
BB5_54: ; %Flow1175 | |
; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[22:23] | |
BB5_55: ; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
v_lshrrev_b32_e32 v65, 9, v67 | |
v_and_b32_e32 v65, 1, v65 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[4:5], exec, s[4:5] | |
; mask branch BB5_59 | |
s_cbranch_execz BB5_59 | |
BB5_56: ; in Loop: Header=BB5_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset0:16 offset1:17 | |
v_cmp_ne_u32_e32 vcc, v62, v74 | |
s_and_b64 s[18:19], exec, s[2:3] | |
s_or_b64 s[18:19], s[18:19], vcc | |
s_and_b64 s[22:23], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v78, v71, v86 | |
s_or_b64 s[18:19], s[22:23], s[18:19] | |
v_subrev_f32_e32 v76, v70, v85 | |
v_mul_f32_e32 v81, v78, v78 | |
v_cndmask_b32_e64 v82, 0, 1.0, s[18:19] | |
v_subrev_f32_e32 v79, v72, v87 | |
v_mac_f32_e32 v81, v76, v76 | |
v_mac_f32_e32 v81, v79, v79 | |
v_mul_f32_e32 v82, s11, v82 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_cmp_lt_f32_e32 vcc, v81, v82 | |
s_and_saveexec_b64 s[18:19], vcc | |
s_xor_b64 s[18:19], exec, s[18:19] | |
; mask branch BB5_58 | |
s_cbranch_execz BB5_58 | |
BB5_57: ; in Loop: Header=BB5_7 Depth=1 | |
v_lshrrev_b32_e32 v82, 9, v63 | |
v_max_f32_e32 v85, 0x34cd15ae, v81 | |
v_mul_f32_e32 v83, v73, v88 | |
v_and_b32_e32 v82, 1, v82 | |
v_mul_f32_e32 v88, v47, v85 | |
v_cmp_eq_u32_e32 vcc, 1, v82 | |
v_mul_f32_e32 v81, v88, v88 | |
v_mov_b32_e32 v82, 0x3a92b707 | |
v_mov_b32_e32 v89, 0x3c739487 | |
v_madak_f32_e32 v82, v82, v81, 0x3ded3cb2 | |
v_mad_f32 v90, v82, v81, 1.0 | |
v_mov_b32_e32 v82, 0xb2951928 | |
v_madak_f32_e32 v89, v89, v81, 0x3f01e2bc | |
v_mac_f32_e32 v90, v88, v89 | |
v_madak_f32_e32 v82, v82, v81, 0xb85ffb93 | |
v_mov_b32_e32 v89, 0x35c55945 | |
v_rsq_f32_e32 v87, v85 | |
v_madak_f32_e32 v89, v89, v81, 0x3a83ca0c | |
v_madak_f32_e32 v82, v82, v81, 0xbc9ded90 | |
s_mov_b32 m0, -1 | |
v_madak_f32_e32 v89, v89, v81, 0x3d8eaf3b | |
v_madak_f32_e32 v91, v82, v81, 0xbf409397 | |
ds_read_b64 v[81:82], v54 offset:64 | |
v_mac_f32_e32 v91, v88, v89 | |
v_mul_f32_e32 v88, v87, v87 | |
v_rcp_f32_e32 v90, v90 | |
v_cndmask_b32_e64 v86, 0, 1.0, vcc | |
v_mul_f32_e32 v89, v88, v88 | |
v_mul_f32_e32 v89, v86, v89 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v82, v69, v82 | |
v_mul_f32_e64 v81, v68, -v81 | |
v_mul_f32_e32 v89, v88, v89 | |
v_cmp_gt_f32_e32 vcc, s26, v85 | |
v_mac_f32_e32 v81, v89, v82 | |
v_cndmask_b32_e64 v82, 0, 1.0, vcc | |
v_mul_f32_e32 v90, v51, v90 | |
v_mul_f32_e32 v82, v82, v88 | |
v_mul_f32_e32 v82, v89, v82 | |
v_mul_f32_e32 v86, v86, v88 | |
v_mul_f32_e32 v90, v91, v90 | |
v_mac_f32_e32 v90, v87, v86 | |
v_mul_f32_e32 v81, v81, v82 | |
v_mac_f32_e32 v81, v90, v83 | |
v_mad_f32 v80, -v81, v79, v80 | |
v_mad_f32 v37, v79, v81, v37 | |
v_mad_f32 v77, -v81, v78, v77 | |
v_mad_f32 v36, v78, v81, v36 | |
v_mad_f32 v75, -v81, v76, v75 | |
v_mad_f32 v35, v76, v81, v35 | |
v_mad_f32 v84, -v65, v0, v84 | |
v_mac_f32_e32 v38, v0, v65 | |
BB5_58: ; %Flow1174 | |
; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
BB5_59: ; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[4:5] | |
v_lshrrev_b32_e32 v65, 10, v67 | |
v_and_b32_e32 v65, 1, v65 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[4:5], exec, s[4:5] | |
; mask branch BB5_63 | |
s_cbranch_execz BB5_63 | |
BB5_60: ; in Loop: Header=BB5_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset0:32 offset1:33 | |
v_cmp_ne_u32_e32 vcc, v61, v74 | |
s_and_b64 s[18:19], exec, s[2:3] | |
s_or_b64 s[18:19], s[18:19], vcc | |
s_and_b64 s[22:23], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v78, v71, v86 | |
s_or_b64 s[18:19], s[22:23], s[18:19] | |
v_subrev_f32_e32 v76, v70, v85 | |
v_mul_f32_e32 v81, v78, v78 | |
v_cndmask_b32_e64 v82, 0, 1.0, s[18:19] | |
v_subrev_f32_e32 v79, v72, v87 | |
v_mac_f32_e32 v81, v76, v76 | |
v_mac_f32_e32 v81, v79, v79 | |
v_mul_f32_e32 v82, s11, v82 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_cmp_lt_f32_e32 vcc, v81, v82 | |
s_and_saveexec_b64 s[18:19], vcc | |
s_xor_b64 s[18:19], exec, s[18:19] | |
; mask branch BB5_62 | |
s_cbranch_execz BB5_62 | |
BB5_61: ; in Loop: Header=BB5_7 Depth=1 | |
v_lshrrev_b32_e32 v82, 10, v63 | |
v_max_f32_e32 v85, 0x34cd15ae, v81 | |
v_mul_f32_e32 v83, v73, v88 | |
v_and_b32_e32 v82, 1, v82 | |
v_mul_f32_e32 v88, v47, v85 | |
v_cmp_eq_u32_e32 vcc, 1, v82 | |
v_mul_f32_e32 v81, v88, v88 | |
v_mov_b32_e32 v82, 0x3a92b707 | |
v_mov_b32_e32 v89, 0x3c739487 | |
v_madak_f32_e32 v82, v82, v81, 0x3ded3cb2 | |
v_mad_f32 v90, v82, v81, 1.0 | |
v_mov_b32_e32 v82, 0xb2951928 | |
v_madak_f32_e32 v89, v89, v81, 0x3f01e2bc | |
v_mac_f32_e32 v90, v88, v89 | |
v_madak_f32_e32 v82, v82, v81, 0xb85ffb93 | |
v_mov_b32_e32 v89, 0x35c55945 | |
v_rsq_f32_e32 v87, v85 | |
v_madak_f32_e32 v89, v89, v81, 0x3a83ca0c | |
v_madak_f32_e32 v82, v82, v81, 0xbc9ded90 | |
s_mov_b32 m0, -1 | |
v_madak_f32_e32 v89, v89, v81, 0x3d8eaf3b | |
v_madak_f32_e32 v91, v82, v81, 0xbf409397 | |
ds_read_b64 v[81:82], v54 offset:128 | |
v_mac_f32_e32 v91, v88, v89 | |
v_mul_f32_e32 v88, v87, v87 | |
v_rcp_f32_e32 v90, v90 | |
v_cndmask_b32_e64 v86, 0, 1.0, vcc | |
v_mul_f32_e32 v89, v88, v88 | |
v_mul_f32_e32 v89, v86, v89 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v82, v69, v82 | |
v_mul_f32_e64 v81, v68, -v81 | |
v_mul_f32_e32 v89, v88, v89 | |
v_cmp_gt_f32_e32 vcc, s26, v85 | |
v_mac_f32_e32 v81, v89, v82 | |
v_cndmask_b32_e64 v82, 0, 1.0, vcc | |
v_mul_f32_e32 v90, v51, v90 | |
v_mul_f32_e32 v82, v82, v88 | |
v_mul_f32_e32 v82, v89, v82 | |
v_mul_f32_e32 v86, v86, v88 | |
v_mul_f32_e32 v90, v91, v90 | |
v_mac_f32_e32 v90, v87, v86 | |
v_mul_f32_e32 v81, v81, v82 | |
v_mac_f32_e32 v81, v90, v83 | |
v_mad_f32 v80, -v81, v79, v80 | |
v_mad_f32 v29, v79, v81, v29 | |
v_mad_f32 v77, -v81, v78, v77 | |
v_mad_f32 v28, v78, v81, v28 | |
v_mad_f32 v75, -v81, v76, v75 | |
v_mad_f32 v27, v76, v81, v27 | |
v_mad_f32 v84, -v65, v0, v84 | |
v_mac_f32_e32 v30, v0, v65 | |
BB5_62: ; %Flow1173 | |
; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
BB5_63: ; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[4:5] | |
v_lshrrev_b32_e32 v65, 11, v67 | |
v_and_b32_e32 v65, 1, v65 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[4:5], exec, s[4:5] | |
; mask branch BB5_67 | |
s_cbranch_execz BB5_67 | |
BB5_64: ; in Loop: Header=BB5_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset0:48 offset1:49 | |
v_cmp_ne_u32_e32 vcc, v60, v74 | |
s_and_b64 s[18:19], exec, s[2:3] | |
s_or_b64 s[18:19], s[18:19], vcc | |
s_and_b64 s[22:23], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v78, v71, v86 | |
s_or_b64 s[18:19], s[22:23], s[18:19] | |
v_subrev_f32_e32 v76, v70, v85 | |
v_mul_f32_e32 v81, v78, v78 | |
v_cndmask_b32_e64 v82, 0, 1.0, s[18:19] | |
v_subrev_f32_e32 v79, v72, v87 | |
v_mac_f32_e32 v81, v76, v76 | |
v_mac_f32_e32 v81, v79, v79 | |
v_mul_f32_e32 v82, s11, v82 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_cmp_lt_f32_e32 vcc, v81, v82 | |
s_and_saveexec_b64 s[18:19], vcc | |
s_xor_b64 s[18:19], exec, s[18:19] | |
; mask branch BB5_66 | |
s_cbranch_execz BB5_66 | |
BB5_65: ; in Loop: Header=BB5_7 Depth=1 | |
v_lshrrev_b32_e32 v82, 11, v63 | |
v_max_f32_e32 v85, 0x34cd15ae, v81 | |
v_mul_f32_e32 v83, v73, v88 | |
v_and_b32_e32 v82, 1, v82 | |
v_mul_f32_e32 v88, v47, v85 | |
v_cmp_eq_u32_e32 vcc, 1, v82 | |
v_mul_f32_e32 v81, v88, v88 | |
v_mov_b32_e32 v82, 0x3a92b707 | |
v_mov_b32_e32 v89, 0x3c739487 | |
v_madak_f32_e32 v82, v82, v81, 0x3ded3cb2 | |
v_mad_f32 v90, v82, v81, 1.0 | |
v_mov_b32_e32 v82, 0xb2951928 | |
v_madak_f32_e32 v89, v89, v81, 0x3f01e2bc | |
v_mac_f32_e32 v90, v88, v89 | |
v_madak_f32_e32 v82, v82, v81, 0xb85ffb93 | |
v_mov_b32_e32 v89, 0x35c55945 | |
v_rsq_f32_e32 v87, v85 | |
v_madak_f32_e32 v89, v89, v81, 0x3a83ca0c | |
v_madak_f32_e32 v82, v82, v81, 0xbc9ded90 | |
s_mov_b32 m0, -1 | |
v_madak_f32_e32 v89, v89, v81, 0x3d8eaf3b | |
v_madak_f32_e32 v91, v82, v81, 0xbf409397 | |
ds_read_b64 v[81:82], v54 offset:192 | |
v_mac_f32_e32 v91, v88, v89 | |
v_mul_f32_e32 v88, v87, v87 | |
v_rcp_f32_e32 v90, v90 | |
v_cndmask_b32_e64 v86, 0, 1.0, vcc | |
v_mul_f32_e32 v89, v88, v88 | |
v_mul_f32_e32 v89, v86, v89 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v82, v69, v82 | |
v_mul_f32_e64 v81, v68, -v81 | |
v_mul_f32_e32 v89, v88, v89 | |
v_cmp_gt_f32_e32 vcc, s26, v85 | |
v_mac_f32_e32 v81, v89, v82 | |
v_cndmask_b32_e64 v82, 0, 1.0, vcc | |
v_mul_f32_e32 v90, v51, v90 | |
v_mul_f32_e32 v82, v82, v88 | |
v_mul_f32_e32 v82, v89, v82 | |
v_mul_f32_e32 v86, v86, v88 | |
v_mul_f32_e32 v90, v91, v90 | |
v_mac_f32_e32 v90, v87, v86 | |
v_mul_f32_e32 v81, v81, v82 | |
v_mac_f32_e32 v81, v90, v83 | |
v_mad_f32 v80, -v81, v79, v80 | |
v_mad_f32 v25, v79, v81, v25 | |
v_mad_f32 v77, -v81, v78, v77 | |
v_mad_f32 v24, v78, v81, v24 | |
v_mad_f32 v75, -v81, v76, v75 | |
v_mad_f32 v23, v76, v81, v23 | |
v_mad_f32 v84, -v65, v0, v84 | |
v_mac_f32_e32 v26, v0, v65 | |
BB5_66: ; %Flow1172 | |
; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
BB5_67: ; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[4:5] | |
v_lshrrev_b32_e32 v65, 12, v67 | |
v_and_b32_e32 v65, 1, v65 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[4:5], exec, s[4:5] | |
; mask branch BB5_71 | |
s_cbranch_execz BB5_71 | |
BB5_68: ; in Loop: Header=BB5_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset0:64 offset1:65 | |
v_cmp_ne_u32_e32 vcc, v59, v74 | |
s_and_b64 s[18:19], exec, s[2:3] | |
s_or_b64 s[18:19], s[18:19], vcc | |
s_and_b64 s[22:23], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v78, v71, v86 | |
s_or_b64 s[18:19], s[22:23], s[18:19] | |
v_subrev_f32_e32 v76, v70, v85 | |
v_mul_f32_e32 v81, v78, v78 | |
v_cndmask_b32_e64 v82, 0, 1.0, s[18:19] | |
v_subrev_f32_e32 v79, v72, v87 | |
v_mac_f32_e32 v81, v76, v76 | |
v_mac_f32_e32 v81, v79, v79 | |
v_mul_f32_e32 v82, s11, v82 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_cmp_lt_f32_e32 vcc, v81, v82 | |
s_and_saveexec_b64 s[18:19], vcc | |
s_xor_b64 s[18:19], exec, s[18:19] | |
; mask branch BB5_70 | |
s_cbranch_execz BB5_70 | |
BB5_69: ; in Loop: Header=BB5_7 Depth=1 | |
v_lshrrev_b32_e32 v82, 12, v63 | |
v_max_f32_e32 v85, 0x34cd15ae, v81 | |
v_mul_f32_e32 v83, v73, v88 | |
v_and_b32_e32 v82, 1, v82 | |
v_mul_f32_e32 v88, v47, v85 | |
v_cmp_eq_u32_e32 vcc, 1, v82 | |
v_mul_f32_e32 v81, v88, v88 | |
v_mov_b32_e32 v82, 0x3a92b707 | |
v_mov_b32_e32 v89, 0x3c739487 | |
v_madak_f32_e32 v82, v82, v81, 0x3ded3cb2 | |
v_mad_f32 v90, v82, v81, 1.0 | |
v_mov_b32_e32 v82, 0xb2951928 | |
v_madak_f32_e32 v89, v89, v81, 0x3f01e2bc | |
v_mac_f32_e32 v90, v88, v89 | |
v_madak_f32_e32 v82, v82, v81, 0xb85ffb93 | |
v_mov_b32_e32 v89, 0x35c55945 | |
v_rsq_f32_e32 v87, v85 | |
v_madak_f32_e32 v89, v89, v81, 0x3a83ca0c | |
v_madak_f32_e32 v82, v82, v81, 0xbc9ded90 | |
s_mov_b32 m0, -1 | |
v_madak_f32_e32 v89, v89, v81, 0x3d8eaf3b | |
v_madak_f32_e32 v91, v82, v81, 0xbf409397 | |
ds_read_b64 v[81:82], v54 offset:256 | |
v_mac_f32_e32 v91, v88, v89 | |
v_mul_f32_e32 v88, v87, v87 | |
v_rcp_f32_e32 v90, v90 | |
v_cndmask_b32_e64 v86, 0, 1.0, vcc | |
v_mul_f32_e32 v89, v88, v88 | |
v_mul_f32_e32 v89, v86, v89 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v82, v69, v82 | |
v_mul_f32_e64 v81, v68, -v81 | |
v_mul_f32_e32 v89, v88, v89 | |
v_cmp_gt_f32_e32 vcc, s26, v85 | |
v_mac_f32_e32 v81, v89, v82 | |
v_cndmask_b32_e64 v82, 0, 1.0, vcc | |
v_mul_f32_e32 v90, v51, v90 | |
v_mul_f32_e32 v82, v82, v88 | |
v_mul_f32_e32 v82, v89, v82 | |
v_mul_f32_e32 v86, v86, v88 | |
v_mul_f32_e32 v90, v91, v90 | |
v_mac_f32_e32 v90, v87, v86 | |
v_mul_f32_e32 v81, v81, v82 | |
v_mac_f32_e32 v81, v90, v83 | |
v_mad_f32 v80, -v81, v79, v80 | |
v_mad_f32 v21, v79, v81, v21 | |
v_mad_f32 v77, -v81, v78, v77 | |
v_mad_f32 v20, v78, v81, v20 | |
v_mad_f32 v75, -v81, v76, v75 | |
v_mad_f32 v19, v76, v81, v19 | |
v_mad_f32 v84, -v65, v0, v84 | |
v_mac_f32_e32 v22, v0, v65 | |
BB5_70: ; %Flow1171 | |
; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
BB5_71: ; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[4:5] | |
v_lshrrev_b32_e32 v65, 13, v67 | |
v_and_b32_e32 v65, 1, v65 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[4:5], exec, s[4:5] | |
; mask branch BB5_75 | |
s_cbranch_execz BB5_75 | |
BB5_72: ; in Loop: Header=BB5_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset0:80 offset1:81 | |
v_cmp_ne_u32_e32 vcc, v58, v74 | |
s_and_b64 s[18:19], exec, s[2:3] | |
s_or_b64 s[18:19], s[18:19], vcc | |
s_and_b64 s[22:23], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v78, v71, v86 | |
s_or_b64 s[18:19], s[22:23], s[18:19] | |
v_subrev_f32_e32 v76, v70, v85 | |
v_mul_f32_e32 v81, v78, v78 | |
v_cndmask_b32_e64 v82, 0, 1.0, s[18:19] | |
v_subrev_f32_e32 v79, v72, v87 | |
v_mac_f32_e32 v81, v76, v76 | |
v_mac_f32_e32 v81, v79, v79 | |
v_mul_f32_e32 v82, s11, v82 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_cmp_lt_f32_e32 vcc, v81, v82 | |
s_and_saveexec_b64 s[18:19], vcc | |
s_xor_b64 s[18:19], exec, s[18:19] | |
; mask branch BB5_74 | |
s_cbranch_execz BB5_74 | |
BB5_73: ; in Loop: Header=BB5_7 Depth=1 | |
v_lshrrev_b32_e32 v82, 13, v63 | |
v_max_f32_e32 v85, 0x34cd15ae, v81 | |
v_mul_f32_e32 v83, v73, v88 | |
v_and_b32_e32 v82, 1, v82 | |
v_mul_f32_e32 v88, v47, v85 | |
v_cmp_eq_u32_e32 vcc, 1, v82 | |
v_mul_f32_e32 v81, v88, v88 | |
v_mov_b32_e32 v82, 0x3a92b707 | |
v_mov_b32_e32 v89, 0x3c739487 | |
v_madak_f32_e32 v82, v82, v81, 0x3ded3cb2 | |
v_mad_f32 v90, v82, v81, 1.0 | |
v_mov_b32_e32 v82, 0xb2951928 | |
v_madak_f32_e32 v89, v89, v81, 0x3f01e2bc | |
v_mac_f32_e32 v90, v88, v89 | |
v_madak_f32_e32 v82, v82, v81, 0xb85ffb93 | |
v_mov_b32_e32 v89, 0x35c55945 | |
v_rsq_f32_e32 v87, v85 | |
v_madak_f32_e32 v89, v89, v81, 0x3a83ca0c | |
v_madak_f32_e32 v82, v82, v81, 0xbc9ded90 | |
s_mov_b32 m0, -1 | |
v_madak_f32_e32 v89, v89, v81, 0x3d8eaf3b | |
v_madak_f32_e32 v91, v82, v81, 0xbf409397 | |
ds_read_b64 v[81:82], v54 offset:320 | |
v_mac_f32_e32 v91, v88, v89 | |
v_mul_f32_e32 v88, v87, v87 | |
v_rcp_f32_e32 v90, v90 | |
v_cndmask_b32_e64 v86, 0, 1.0, vcc | |
v_mul_f32_e32 v89, v88, v88 | |
v_mul_f32_e32 v89, v86, v89 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v82, v69, v82 | |
v_mul_f32_e64 v81, v68, -v81 | |
v_mul_f32_e32 v89, v88, v89 | |
v_cmp_gt_f32_e32 vcc, s26, v85 | |
v_mac_f32_e32 v81, v89, v82 | |
v_cndmask_b32_e64 v82, 0, 1.0, vcc | |
v_mul_f32_e32 v90, v51, v90 | |
v_mul_f32_e32 v82, v82, v88 | |
v_mul_f32_e32 v82, v89, v82 | |
v_mul_f32_e32 v86, v86, v88 | |
v_mul_f32_e32 v90, v91, v90 | |
v_mac_f32_e32 v90, v87, v86 | |
v_mul_f32_e32 v81, v81, v82 | |
v_mac_f32_e32 v81, v90, v83 | |
v_mad_f32 v80, -v81, v79, v80 | |
v_mad_f32 v17, v79, v81, v17 | |
v_mad_f32 v77, -v81, v78, v77 | |
v_mad_f32 v16, v78, v81, v16 | |
v_mad_f32 v75, -v81, v76, v75 | |
v_mad_f32 v15, v76, v81, v15 | |
v_mad_f32 v84, -v65, v0, v84 | |
v_mac_f32_e32 v18, v0, v65 | |
BB5_74: ; %Flow1170 | |
; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
BB5_75: ; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[4:5] | |
v_lshrrev_b32_e32 v65, 14, v67 | |
v_and_b32_e32 v65, 1, v65 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[4:5], exec, s[4:5] | |
; mask branch BB5_79 | |
s_cbranch_execz BB5_79 | |
BB5_76: ; in Loop: Header=BB5_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset0:96 offset1:97 | |
v_cmp_ne_u32_e32 vcc, v57, v74 | |
s_and_b64 s[18:19], exec, s[2:3] | |
s_or_b64 s[18:19], s[18:19], vcc | |
s_and_b64 s[22:23], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v78, v71, v86 | |
s_or_b64 s[18:19], s[22:23], s[18:19] | |
v_subrev_f32_e32 v76, v70, v85 | |
v_mul_f32_e32 v81, v78, v78 | |
v_cndmask_b32_e64 v82, 0, 1.0, s[18:19] | |
v_subrev_f32_e32 v79, v72, v87 | |
v_mac_f32_e32 v81, v76, v76 | |
v_mac_f32_e32 v81, v79, v79 | |
v_mul_f32_e32 v82, s11, v82 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_cmp_lt_f32_e32 vcc, v81, v82 | |
s_and_saveexec_b64 s[18:19], vcc | |
s_xor_b64 s[18:19], exec, s[18:19] | |
; mask branch BB5_78 | |
s_cbranch_execz BB5_78 | |
BB5_77: ; in Loop: Header=BB5_7 Depth=1 | |
v_lshrrev_b32_e32 v82, 14, v63 | |
v_max_f32_e32 v85, 0x34cd15ae, v81 | |
v_mul_f32_e32 v83, v73, v88 | |
v_and_b32_e32 v82, 1, v82 | |
v_mul_f32_e32 v88, v47, v85 | |
v_cmp_eq_u32_e32 vcc, 1, v82 | |
v_mul_f32_e32 v81, v88, v88 | |
v_mov_b32_e32 v82, 0x3a92b707 | |
v_mov_b32_e32 v89, 0x3c739487 | |
v_madak_f32_e32 v82, v82, v81, 0x3ded3cb2 | |
v_mad_f32 v90, v82, v81, 1.0 | |
v_mov_b32_e32 v82, 0xb2951928 | |
v_madak_f32_e32 v89, v89, v81, 0x3f01e2bc | |
v_mac_f32_e32 v90, v88, v89 | |
v_madak_f32_e32 v82, v82, v81, 0xb85ffb93 | |
v_mov_b32_e32 v89, 0x35c55945 | |
v_rsq_f32_e32 v87, v85 | |
v_madak_f32_e32 v89, v89, v81, 0x3a83ca0c | |
v_madak_f32_e32 v82, v82, v81, 0xbc9ded90 | |
s_mov_b32 m0, -1 | |
v_madak_f32_e32 v89, v89, v81, 0x3d8eaf3b | |
v_madak_f32_e32 v91, v82, v81, 0xbf409397 | |
ds_read_b64 v[81:82], v54 offset:384 | |
v_mac_f32_e32 v91, v88, v89 | |
v_mul_f32_e32 v88, v87, v87 | |
v_rcp_f32_e32 v90, v90 | |
v_cndmask_b32_e64 v86, 0, 1.0, vcc | |
v_mul_f32_e32 v89, v88, v88 | |
v_mul_f32_e32 v89, v86, v89 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v82, v69, v82 | |
v_mul_f32_e64 v81, v68, -v81 | |
v_mul_f32_e32 v89, v88, v89 | |
v_cmp_gt_f32_e32 vcc, s26, v85 | |
v_mac_f32_e32 v81, v89, v82 | |
v_cndmask_b32_e64 v82, 0, 1.0, vcc | |
v_mul_f32_e32 v90, v51, v90 | |
v_mul_f32_e32 v82, v82, v88 | |
v_mul_f32_e32 v82, v89, v82 | |
v_mul_f32_e32 v86, v86, v88 | |
v_mul_f32_e32 v90, v91, v90 | |
v_mac_f32_e32 v90, v87, v86 | |
v_mul_f32_e32 v81, v81, v82 | |
v_mac_f32_e32 v81, v90, v83 | |
v_mad_f32 v80, -v81, v79, v80 | |
v_mad_f32 v13, v79, v81, v13 | |
v_mad_f32 v77, -v81, v78, v77 | |
v_mad_f32 v12, v78, v81, v12 | |
v_mad_f32 v75, -v81, v76, v75 | |
v_mad_f32 v11, v76, v81, v11 | |
v_mad_f32 v84, -v65, v0, v84 | |
v_mac_f32_e32 v14, v0, v65 | |
BB5_78: ; %Flow1169 | |
; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
BB5_79: ; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[4:5] | |
v_lshrrev_b32_e32 v65, 15, v67 | |
v_and_b32_e32 v65, 1, v65 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[4:5], exec, s[4:5] | |
; mask branch BB5_83 | |
s_cbranch_execz BB5_83 | |
BB5_80: ; in Loop: Header=BB5_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset0:112 offset1:113 | |
v_cmp_ne_u32_e32 vcc, v33, v74 | |
s_and_b64 s[18:19], exec, s[2:3] | |
s_or_b64 s[18:19], s[18:19], vcc | |
s_and_b64 s[22:23], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v71, v71, v86 | |
s_or_b64 s[18:19], s[22:23], s[18:19] | |
v_subrev_f32_e32 v70, v70, v85 | |
v_mul_f32_e32 v74, v71, v71 | |
v_cndmask_b32_e64 v76, 0, 1.0, s[18:19] | |
v_subrev_f32_e32 v72, v72, v87 | |
v_mac_f32_e32 v74, v70, v70 | |
v_mac_f32_e32 v74, v72, v72 | |
v_mul_f32_e32 v76, s11, v76 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_cmp_lt_f32_e32 vcc, v74, v76 | |
s_and_saveexec_b64 s[18:19], vcc | |
s_xor_b64 s[18:19], exec, s[18:19] | |
; mask branch BB5_82 | |
s_cbranch_execz BB5_82 | |
BB5_81: ; in Loop: Header=BB5_7 Depth=1 | |
v_mul_f32_e32 v79, v73, v88 | |
s_mov_b32 m0, -1 | |
v_mad_f32 v73, -v65, v0, v84 | |
v_max_f32_e32 v78, 0x34cd15ae, v74 | |
ds_read_b64 v[73:74], v54 offset:448 | |
v_mul_f32_e32 v81, v47, v78 | |
v_mul_f32_e32 v82, v81, v81 | |
v_mov_b32_e32 v83, 0x3a92b707 | |
v_lshrrev_b32_e32 v76, 15, v63 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v69, v69, v74 | |
v_mov_b32_e32 v74, 0x3c739487 | |
v_mul_f32_e64 v68, v68, -v73 | |
v_madak_f32_e32 v73, v83, v82, 0x3ded3cb2 | |
v_madak_f32_e32 v74, v74, v82, 0x3f01e2bc | |
v_mad_f32 v73, v73, v82, 1.0 | |
v_mac_f32_e32 v73, v81, v74 | |
v_mov_b32_e32 v74, 0xb2951928 | |
v_rcp_f32_e32 v73, v73 | |
v_madak_f32_e32 v74, v74, v82, 0xb85ffb93 | |
v_mov_b32_e32 v83, 0x35c55945 | |
v_madak_f32_e32 v83, v83, v82, 0x3a83ca0c | |
v_madak_f32_e32 v74, v74, v82, 0xbc9ded90 | |
v_madak_f32_e32 v83, v83, v82, 0x3d8eaf3b | |
v_madak_f32_e32 v74, v74, v82, 0xbf409397 | |
v_mac_f32_e32 v74, v81, v83 | |
v_mul_f32_e32 v73, v51, v73 | |
v_mul_f32_e32 v73, v74, v73 | |
v_rsq_f32_e32 v74, v78 | |
v_and_b32_e32 v76, 1, v76 | |
v_cmp_eq_u32_e32 vcc, 1, v76 | |
v_cndmask_b32_e64 v76, 0, 1.0, vcc | |
v_cmp_gt_f32_e32 vcc, s26, v78 | |
v_mul_f32_e32 v78, v74, v74 | |
v_mul_f32_e32 v81, v76, v78 | |
v_mac_f32_e32 v73, v74, v81 | |
v_mul_f32_e32 v74, v78, v78 | |
v_mul_f32_e32 v74, v76, v74 | |
v_mul_f32_e32 v74, v78, v74 | |
v_mac_f32_e32 v68, v74, v69 | |
v_cndmask_b32_e64 v69, 0, 1.0, vcc | |
v_mul_f32_e32 v69, v69, v78 | |
v_mul_f32_e32 v69, v74, v69 | |
v_mul_f32_e32 v68, v68, v69 | |
v_mac_f32_e32 v68, v73, v79 | |
v_mad_f32 v80, -v68, v72, v80 | |
v_mad_f32 v10, v72, v68, v10 | |
v_mad_f32 v77, -v68, v71, v77 | |
v_mad_f32 v9, v71, v68, v9 | |
v_mad_f32 v75, -v68, v70, v75 | |
v_mad_f32 v8, v70, v68, v8 | |
v_mac_f32_e32 v66, v0, v65 | |
BB5_82: ; %Flow1168 | |
; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
BB5_83: ; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[4:5] | |
s_mov_b32 m0, -1 | |
v_cmp_gt_i32_e32 vcc, 3, v2 | |
ds_write_b32 v5, v75 | |
ds_write_b32 v6, v77 | |
ds_write_b32 v7, v80 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[4:5], exec, s[4:5] | |
s_waitcnt lgkmcnt(0) | |
; mask branch BB5_89 | |
s_cbranch_execz BB5_89 | |
BB5_84: ; in Loop: Header=BB5_7 Depth=1 | |
v_lshlrev_b32_e32 v68, 6, v2 | |
v_add_i32_e32 v65, vcc, v0, v68 | |
v_lshlrev_b32_e32 v65, 2, v65 | |
v_add_i32_e32 v69, vcc, s10, v65 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v65, v69 | |
v_add_i32_e32 v70, vcc, 8, v0 | |
v_or_b32_e32 v71, 1, v0 | |
v_cmp_lt_i32_e32 vcc, v71, v70 | |
s_and_saveexec_b64 s[18:19], vcc | |
s_xor_b64 s[18:19], exec, s[18:19] | |
s_waitcnt lgkmcnt(0) | |
; mask branch BB5_86 | |
s_cbranch_execz BB5_86 | |
BB5_85: ; in Loop: Header=BB5_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b32 v[70:71], v69 offset0:1 offset1:2 | |
v_or_b32_e32 v74, 3, v0 | |
v_add_i32_e32 v68, vcc, v74, v68 | |
v_lshlrev_b32_e32 v68, 2, v68 | |
ds_read2_b32 v[72:73], v69 offset0:3 offset1:4 | |
v_add_i32_e32 v68, vcc, s10, v68 | |
ds_read_b32 v75, v69 offset:28 | |
ds_read2_b32 v[68:69], v68 offset0:2 offset1:3 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v65, v65, v70 | |
v_add_f32_e32 v65, v71, v65 | |
v_add_f32_e32 v65, v72, v65 | |
v_add_f32_e32 v65, v73, v65 | |
v_add_f32_e32 v65, v68, v65 | |
v_add_f32_e32 v65, v69, v65 | |
v_add_f32_e32 v65, v75, v65 | |
BB5_86: ; %._crit_edge.i72 | |
; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
v_mul_lo_i32 v64, v64, 3 | |
v_mov_b32_e32 v72, s13 | |
s_mov_b64 s[40:41], s[12:13] | |
s_mov_b64 s[42:43], s[34:35] | |
v_add_i32_e32 v68, vcc, v64, v2 | |
v_ashrrev_i32_e32 v69, 31, v68 | |
v_lshl_b64 v[70:71], v[68:69], 2 | |
v_add_i32_e32 v68, vcc, s12, v70 | |
v_addc_u32_e32 v69, vcc, v71, v72, vcc | |
buffer_load_dword v71, v[70:71], s[40:43], 0 addr64 | |
s_mov_b64 s[18:19], 0 | |
s_waitcnt vmcnt(0) | |
BB5_87: ; Parent Loop BB5_7 Depth=1 | |
; => This Inner Loop Header: Depth=2 | |
v_add_f32_e32 v70, v65, v71 | |
v_mov_b32_e32 v73, v71 | |
v_mov_b32_e32 v72, v70 | |
buffer_atomic_cmpswap v[72:73], v[68:69], s[32:35], 0 addr64 glc | |
v_mov_b32_e32 v64, -1 | |
v_mov_b32_e32 v64, 0xf000 | |
s_waitcnt vmcnt(0) expcnt(0) | |
v_cmp_eq_u32_e32 vcc, v72, v71 | |
s_or_b64 s[18:19], vcc, s[18:19] | |
v_mov_b32_e32 v71, v72 | |
s_andn2_b64 exec, exec, s[18:19] | |
s_cbranch_execnz BB5_87 | |
; BB#88: ; %Flow1166 | |
; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
BB5_89: ; %Flow1167 | |
; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[4:5] | |
BB5_90: ; %Flow1176 | |
; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[14:15] | |
v_and_b32_e32 v64, 0xff0000, v67 | |
v_cmp_ne_u32_e32 vcc, 0, v64 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[14:15], exec, s[4:5] | |
; mask branch BB5_130 | |
s_cbranch_execz BB5_130 | |
BB5_91: ; in Loop: Header=BB5_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v74, v52 offset:8 | |
s_mov_b64 s[40:41], s[16:17] | |
s_mov_b64 s[42:43], s[34:35] | |
s_waitcnt lgkmcnt(0) | |
v_lshlrev_b32_e32 v64, 3, v74 | |
v_add_i32_e32 v64, vcc, v64, v1 | |
v_ashrrev_i32_e32 v65, 31, v64 | |
v_lshl_b64 v[68:69], v[64:65], 4 | |
v_lshl_b64 v[75:76], v[64:65], 3 | |
buffer_load_dwordx4 v[70:73], v[68:69], s[40:43], 0 addr64 | |
s_mov_b64 s[40:41], s[20:21] | |
buffer_load_dwordx2 v[68:69], v[75:76], s[40:43], 0 addr64 | |
v_lshrrev_b32_e32 v65, 16, v67 | |
v_mov_b32_e32 v75, 0 | |
v_and_b32_e32 v65, 1, v65 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
v_mov_b32_e32 v77, v75 | |
v_mov_b32_e32 v80, v75 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[18:19], exec, s[4:5] | |
; implicit-def: %VGPR81_VGPR82_VGPR83_VGPR84 | |
s_waitcnt vmcnt(0) | |
; mask branch BB5_95 | |
s_cbranch_execz BB5_95 | |
BB5_92: ; in Loop: Header=BB5_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset1:1 | |
v_cmp_ne_u32_e32 vcc, v40, v74 | |
s_and_b64 s[4:5], exec, s[2:3] | |
s_or_b64 s[4:5], s[4:5], vcc | |
s_and_b64 s[22:23], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v78, v71, v86 | |
s_or_b64 s[4:5], s[22:23], s[4:5] | |
v_subrev_f32_e32 v79, v70, v85 | |
v_mul_f32_e32 v81, v78, v78 | |
v_cndmask_b32_e64 v75, 0, 1.0, s[4:5] | |
v_subrev_f32_e32 v76, v72, v87 | |
v_mac_f32_e32 v81, v79, v79 | |
v_mac_f32_e32 v81, v76, v76 | |
v_mul_f32_e32 v75, s11, v75 | |
v_cmp_lt_f32_e32 vcc, v81, v75 | |
v_mov_b32_e32 v75, 0 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_mov_b32_e32 v77, v75 | |
v_mov_b32_e32 v80, v75 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[22:23], exec, s[4:5] | |
; implicit-def: %VGPR81_VGPR82_VGPR83_VGPR84 | |
; mask branch BB5_94 | |
s_cbranch_execz BB5_94 | |
BB5_93: ; in Loop: Header=BB5_7 Depth=1 | |
v_max_f32_e32 v77, 0x34cd15ae, v81 | |
v_mul_f32_e32 v81, v47, v77 | |
v_mul_f32_e32 v82, v81, v81 | |
v_mov_b32_e32 v83, 0x3a92b707 | |
v_madak_f32_e32 v83, v83, v82, 0x3ded3cb2 | |
v_mov_b32_e32 v84, 0x3c739487 | |
v_madak_f32_e32 v84, v84, v82, 0x3f01e2bc | |
v_mad_f32 v83, v83, v82, 1.0 | |
v_mac_f32_e32 v83, v81, v84 | |
v_mov_b32_e32 v84, 0xb2951928 | |
v_madak_f32_e32 v84, v84, v82, 0xb85ffb93 | |
v_mov_b32_e32 v85, 0x35c55945 | |
v_madak_f32_e32 v85, v85, v82, 0x3a83ca0c | |
v_madak_f32_e32 v84, v84, v82, 0xbc9ded90 | |
v_madak_f32_e32 v85, v85, v82, 0x3d8eaf3b | |
v_madak_f32_e32 v82, v84, v82, 0xbf409397 | |
v_cmp_gt_f32_e32 vcc, s26, v77 | |
v_mac_f32_e32 v82, v81, v85 | |
v_rcp_f32_e32 v81, v83 | |
v_rsq_f32_e32 v77, v77 | |
v_lshrrev_b32_e32 v80, 16, v63 | |
v_and_b32_e32 v80, 1, v80 | |
v_cmp_eq_u32_e64 s[4:5], 1, v80 | |
v_mul_f32_e32 v81, v51, v81 | |
v_cndmask_b32_e64 v80, 0, 1.0, s[4:5] | |
v_mul_f32_e32 v83, v77, v77 | |
v_mul_f32_e32 v82, v82, v81 | |
v_mul_f32_e32 v81, v80, v83 | |
v_mac_f32_e32 v82, v77, v81 | |
v_mul_f32_e32 v77, v83, v83 | |
s_mov_b32 m0, -1 | |
v_mul_f32_e32 v77, v80, v77 | |
ds_read_b64 v[80:81], v54 | |
v_mul_f32_e32 v77, v83, v77 | |
v_mul_f32_e32 v75, v73, v88 | |
v_mac_f32_e32 v46, v0, v65 | |
v_mul_f32_e64 v84, v65, -v0 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v81, v69, v81 | |
v_mul_f32_e64 v80, v68, -v80 | |
v_mac_f32_e32 v80, v77, v81 | |
v_cndmask_b32_e64 v81, 0, 1.0, vcc | |
v_mul_f32_e32 v81, v81, v83 | |
v_mul_f32_e32 v77, v77, v81 | |
v_mul_f32_e32 v81, v80, v77 | |
v_mac_f32_e32 v81, v82, v75 | |
v_mad_f32 v45, v76, v81, v45 | |
v_mad_f32 v44, v78, v81, v44 | |
v_mad_f32 v43, v79, v81, v43 | |
v_mul_f32_e64 v80, v81, -v76 | |
v_mul_f32_e64 v77, v81, -v78 | |
v_mul_f32_e64 v75, v81, -v79 | |
BB5_94: ; %Flow1164 | |
; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[22:23] | |
BB5_95: ; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
v_lshrrev_b32_e32 v65, 17, v67 | |
v_and_b32_e32 v65, 1, v65 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[4:5], exec, s[4:5] | |
; mask branch BB5_99 | |
s_cbranch_execz BB5_99 | |
BB5_96: ; in Loop: Header=BB5_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset0:16 offset1:17 | |
v_cmp_ne_u32_e32 vcc, v62, v74 | |
s_and_b64 s[18:19], exec, s[2:3] | |
s_or_b64 s[18:19], s[18:19], vcc | |
s_and_b64 s[22:23], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v78, v71, v86 | |
s_or_b64 s[18:19], s[22:23], s[18:19] | |
v_subrev_f32_e32 v76, v70, v85 | |
v_mul_f32_e32 v81, v78, v78 | |
v_cndmask_b32_e64 v82, 0, 1.0, s[18:19] | |
v_subrev_f32_e32 v79, v72, v87 | |
v_mac_f32_e32 v81, v76, v76 | |
v_mac_f32_e32 v81, v79, v79 | |
v_mul_f32_e32 v82, s11, v82 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_cmp_lt_f32_e32 vcc, v81, v82 | |
s_and_saveexec_b64 s[18:19], vcc | |
s_xor_b64 s[18:19], exec, s[18:19] | |
; mask branch BB5_98 | |
s_cbranch_execz BB5_98 | |
BB5_97: ; in Loop: Header=BB5_7 Depth=1 | |
v_lshrrev_b32_e32 v82, 17, v63 | |
v_max_f32_e32 v85, 0x34cd15ae, v81 | |
v_mul_f32_e32 v83, v73, v88 | |
v_and_b32_e32 v82, 1, v82 | |
v_mul_f32_e32 v88, v47, v85 | |
v_cmp_eq_u32_e32 vcc, 1, v82 | |
v_mul_f32_e32 v81, v88, v88 | |
v_mov_b32_e32 v82, 0x3a92b707 | |
v_mov_b32_e32 v89, 0x3c739487 | |
v_madak_f32_e32 v82, v82, v81, 0x3ded3cb2 | |
v_mad_f32 v90, v82, v81, 1.0 | |
v_mov_b32_e32 v82, 0xb2951928 | |
v_madak_f32_e32 v89, v89, v81, 0x3f01e2bc | |
v_mac_f32_e32 v90, v88, v89 | |
v_madak_f32_e32 v82, v82, v81, 0xb85ffb93 | |
v_mov_b32_e32 v89, 0x35c55945 | |
v_rsq_f32_e32 v87, v85 | |
v_madak_f32_e32 v89, v89, v81, 0x3a83ca0c | |
v_madak_f32_e32 v82, v82, v81, 0xbc9ded90 | |
s_mov_b32 m0, -1 | |
v_madak_f32_e32 v89, v89, v81, 0x3d8eaf3b | |
v_madak_f32_e32 v91, v82, v81, 0xbf409397 | |
ds_read_b64 v[81:82], v54 offset:64 | |
v_mac_f32_e32 v91, v88, v89 | |
v_mul_f32_e32 v88, v87, v87 | |
v_rcp_f32_e32 v90, v90 | |
v_cndmask_b32_e64 v86, 0, 1.0, vcc | |
v_mul_f32_e32 v89, v88, v88 | |
v_mul_f32_e32 v89, v86, v89 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v82, v69, v82 | |
v_mul_f32_e64 v81, v68, -v81 | |
v_mul_f32_e32 v89, v88, v89 | |
v_cmp_gt_f32_e32 vcc, s26, v85 | |
v_mac_f32_e32 v81, v89, v82 | |
v_cndmask_b32_e64 v82, 0, 1.0, vcc | |
v_mul_f32_e32 v90, v51, v90 | |
v_mul_f32_e32 v82, v82, v88 | |
v_mul_f32_e32 v82, v89, v82 | |
v_mul_f32_e32 v86, v86, v88 | |
v_mul_f32_e32 v90, v91, v90 | |
v_mac_f32_e32 v90, v87, v86 | |
v_mul_f32_e32 v81, v81, v82 | |
v_mac_f32_e32 v81, v90, v83 | |
v_mad_f32 v80, -v81, v79, v80 | |
v_mad_f32 v37, v79, v81, v37 | |
v_mad_f32 v77, -v81, v78, v77 | |
v_mad_f32 v36, v78, v81, v36 | |
v_mad_f32 v75, -v81, v76, v75 | |
v_mad_f32 v35, v76, v81, v35 | |
v_mad_f32 v84, -v65, v0, v84 | |
v_mac_f32_e32 v38, v0, v65 | |
BB5_98: ; %Flow1163 | |
; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
BB5_99: ; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[4:5] | |
v_lshrrev_b32_e32 v65, 18, v67 | |
v_and_b32_e32 v65, 1, v65 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[4:5], exec, s[4:5] | |
; mask branch BB5_103 | |
s_cbranch_execz BB5_103 | |
BB5_100: ; in Loop: Header=BB5_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset0:32 offset1:33 | |
v_cmp_ne_u32_e32 vcc, v61, v74 | |
s_and_b64 s[18:19], exec, s[2:3] | |
s_or_b64 s[18:19], s[18:19], vcc | |
s_and_b64 s[22:23], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v78, v71, v86 | |
s_or_b64 s[18:19], s[22:23], s[18:19] | |
v_subrev_f32_e32 v76, v70, v85 | |
v_mul_f32_e32 v81, v78, v78 | |
v_cndmask_b32_e64 v82, 0, 1.0, s[18:19] | |
v_subrev_f32_e32 v79, v72, v87 | |
v_mac_f32_e32 v81, v76, v76 | |
v_mac_f32_e32 v81, v79, v79 | |
v_mul_f32_e32 v82, s11, v82 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_cmp_lt_f32_e32 vcc, v81, v82 | |
s_and_saveexec_b64 s[18:19], vcc | |
s_xor_b64 s[18:19], exec, s[18:19] | |
; mask branch BB5_102 | |
s_cbranch_execz BB5_102 | |
BB5_101: ; in Loop: Header=BB5_7 Depth=1 | |
v_lshrrev_b32_e32 v82, 18, v63 | |
v_max_f32_e32 v85, 0x34cd15ae, v81 | |
v_mul_f32_e32 v83, v73, v88 | |
v_and_b32_e32 v82, 1, v82 | |
v_mul_f32_e32 v88, v47, v85 | |
v_cmp_eq_u32_e32 vcc, 1, v82 | |
v_mul_f32_e32 v81, v88, v88 | |
v_mov_b32_e32 v82, 0x3a92b707 | |
v_mov_b32_e32 v89, 0x3c739487 | |
v_madak_f32_e32 v82, v82, v81, 0x3ded3cb2 | |
v_mad_f32 v90, v82, v81, 1.0 | |
v_mov_b32_e32 v82, 0xb2951928 | |
v_madak_f32_e32 v89, v89, v81, 0x3f01e2bc | |
v_mac_f32_e32 v90, v88, v89 | |
v_madak_f32_e32 v82, v82, v81, 0xb85ffb93 | |
v_mov_b32_e32 v89, 0x35c55945 | |
v_rsq_f32_e32 v87, v85 | |
v_madak_f32_e32 v89, v89, v81, 0x3a83ca0c | |
v_madak_f32_e32 v82, v82, v81, 0xbc9ded90 | |
s_mov_b32 m0, -1 | |
v_madak_f32_e32 v89, v89, v81, 0x3d8eaf3b | |
v_madak_f32_e32 v91, v82, v81, 0xbf409397 | |
ds_read_b64 v[81:82], v54 offset:128 | |
v_mac_f32_e32 v91, v88, v89 | |
v_mul_f32_e32 v88, v87, v87 | |
v_rcp_f32_e32 v90, v90 | |
v_cndmask_b32_e64 v86, 0, 1.0, vcc | |
v_mul_f32_e32 v89, v88, v88 | |
v_mul_f32_e32 v89, v86, v89 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v82, v69, v82 | |
v_mul_f32_e64 v81, v68, -v81 | |
v_mul_f32_e32 v89, v88, v89 | |
v_cmp_gt_f32_e32 vcc, s26, v85 | |
v_mac_f32_e32 v81, v89, v82 | |
v_cndmask_b32_e64 v82, 0, 1.0, vcc | |
v_mul_f32_e32 v90, v51, v90 | |
v_mul_f32_e32 v82, v82, v88 | |
v_mul_f32_e32 v82, v89, v82 | |
v_mul_f32_e32 v86, v86, v88 | |
v_mul_f32_e32 v90, v91, v90 | |
v_mac_f32_e32 v90, v87, v86 | |
v_mul_f32_e32 v81, v81, v82 | |
v_mac_f32_e32 v81, v90, v83 | |
v_mad_f32 v80, -v81, v79, v80 | |
v_mad_f32 v29, v79, v81, v29 | |
v_mad_f32 v77, -v81, v78, v77 | |
v_mad_f32 v28, v78, v81, v28 | |
v_mad_f32 v75, -v81, v76, v75 | |
v_mad_f32 v27, v76, v81, v27 | |
v_mad_f32 v84, -v65, v0, v84 | |
v_mac_f32_e32 v30, v0, v65 | |
BB5_102: ; %Flow1162 | |
; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
BB5_103: ; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[4:5] | |
v_lshrrev_b32_e32 v65, 19, v67 | |
v_and_b32_e32 v65, 1, v65 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[4:5], exec, s[4:5] | |
; mask branch BB5_107 | |
s_cbranch_execz BB5_107 | |
BB5_104: ; in Loop: Header=BB5_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset0:48 offset1:49 | |
v_cmp_ne_u32_e32 vcc, v60, v74 | |
s_and_b64 s[18:19], exec, s[2:3] | |
s_or_b64 s[18:19], s[18:19], vcc | |
s_and_b64 s[22:23], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v78, v71, v86 | |
s_or_b64 s[18:19], s[22:23], s[18:19] | |
v_subrev_f32_e32 v76, v70, v85 | |
v_mul_f32_e32 v81, v78, v78 | |
v_cndmask_b32_e64 v82, 0, 1.0, s[18:19] | |
v_subrev_f32_e32 v79, v72, v87 | |
v_mac_f32_e32 v81, v76, v76 | |
v_mac_f32_e32 v81, v79, v79 | |
v_mul_f32_e32 v82, s11, v82 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_cmp_lt_f32_e32 vcc, v81, v82 | |
s_and_saveexec_b64 s[18:19], vcc | |
s_xor_b64 s[18:19], exec, s[18:19] | |
; mask branch BB5_106 | |
s_cbranch_execz BB5_106 | |
BB5_105: ; in Loop: Header=BB5_7 Depth=1 | |
v_lshrrev_b32_e32 v82, 19, v63 | |
v_max_f32_e32 v85, 0x34cd15ae, v81 | |
v_mul_f32_e32 v83, v73, v88 | |
v_and_b32_e32 v82, 1, v82 | |
v_mul_f32_e32 v88, v47, v85 | |
v_cmp_eq_u32_e32 vcc, 1, v82 | |
v_mul_f32_e32 v81, v88, v88 | |
v_mov_b32_e32 v82, 0x3a92b707 | |
v_mov_b32_e32 v89, 0x3c739487 | |
v_madak_f32_e32 v82, v82, v81, 0x3ded3cb2 | |
v_mad_f32 v90, v82, v81, 1.0 | |
v_mov_b32_e32 v82, 0xb2951928 | |
v_madak_f32_e32 v89, v89, v81, 0x3f01e2bc | |
v_mac_f32_e32 v90, v88, v89 | |
v_madak_f32_e32 v82, v82, v81, 0xb85ffb93 | |
v_mov_b32_e32 v89, 0x35c55945 | |
v_rsq_f32_e32 v87, v85 | |
v_madak_f32_e32 v89, v89, v81, 0x3a83ca0c | |
v_madak_f32_e32 v82, v82, v81, 0xbc9ded90 | |
s_mov_b32 m0, -1 | |
v_madak_f32_e32 v89, v89, v81, 0x3d8eaf3b | |
v_madak_f32_e32 v91, v82, v81, 0xbf409397 | |
ds_read_b64 v[81:82], v54 offset:192 | |
v_mac_f32_e32 v91, v88, v89 | |
v_mul_f32_e32 v88, v87, v87 | |
v_rcp_f32_e32 v90, v90 | |
v_cndmask_b32_e64 v86, 0, 1.0, vcc | |
v_mul_f32_e32 v89, v88, v88 | |
v_mul_f32_e32 v89, v86, v89 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v82, v69, v82 | |
v_mul_f32_e64 v81, v68, -v81 | |
v_mul_f32_e32 v89, v88, v89 | |
v_cmp_gt_f32_e32 vcc, s26, v85 | |
v_mac_f32_e32 v81, v89, v82 | |
v_cndmask_b32_e64 v82, 0, 1.0, vcc | |
v_mul_f32_e32 v90, v51, v90 | |
v_mul_f32_e32 v82, v82, v88 | |
v_mul_f32_e32 v82, v89, v82 | |
v_mul_f32_e32 v86, v86, v88 | |
v_mul_f32_e32 v90, v91, v90 | |
v_mac_f32_e32 v90, v87, v86 | |
v_mul_f32_e32 v81, v81, v82 | |
v_mac_f32_e32 v81, v90, v83 | |
v_mad_f32 v80, -v81, v79, v80 | |
v_mad_f32 v25, v79, v81, v25 | |
v_mad_f32 v77, -v81, v78, v77 | |
v_mad_f32 v24, v78, v81, v24 | |
v_mad_f32 v75, -v81, v76, v75 | |
v_mad_f32 v23, v76, v81, v23 | |
v_mad_f32 v84, -v65, v0, v84 | |
v_mac_f32_e32 v26, v0, v65 | |
BB5_106: ; %Flow1161 | |
; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
BB5_107: ; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[4:5] | |
v_lshrrev_b32_e32 v65, 20, v67 | |
v_and_b32_e32 v65, 1, v65 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[4:5], exec, s[4:5] | |
; mask branch BB5_111 | |
s_cbranch_execz BB5_111 | |
BB5_108: ; in Loop: Header=BB5_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset0:64 offset1:65 | |
v_cmp_ne_u32_e32 vcc, v59, v74 | |
s_and_b64 s[18:19], exec, s[2:3] | |
s_or_b64 s[18:19], s[18:19], vcc | |
s_and_b64 s[22:23], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v78, v71, v86 | |
s_or_b64 s[18:19], s[22:23], s[18:19] | |
v_subrev_f32_e32 v76, v70, v85 | |
v_mul_f32_e32 v81, v78, v78 | |
v_cndmask_b32_e64 v82, 0, 1.0, s[18:19] | |
v_subrev_f32_e32 v79, v72, v87 | |
v_mac_f32_e32 v81, v76, v76 | |
v_mac_f32_e32 v81, v79, v79 | |
v_mul_f32_e32 v82, s11, v82 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_cmp_lt_f32_e32 vcc, v81, v82 | |
s_and_saveexec_b64 s[18:19], vcc | |
s_xor_b64 s[18:19], exec, s[18:19] | |
; mask branch BB5_110 | |
s_cbranch_execz BB5_110 | |
BB5_109: ; in Loop: Header=BB5_7 Depth=1 | |
v_lshrrev_b32_e32 v82, 20, v63 | |
v_max_f32_e32 v85, 0x34cd15ae, v81 | |
v_mul_f32_e32 v83, v73, v88 | |
v_and_b32_e32 v82, 1, v82 | |
v_mul_f32_e32 v88, v47, v85 | |
v_cmp_eq_u32_e32 vcc, 1, v82 | |
v_mul_f32_e32 v81, v88, v88 | |
v_mov_b32_e32 v82, 0x3a92b707 | |
v_mov_b32_e32 v89, 0x3c739487 | |
v_madak_f32_e32 v82, v82, v81, 0x3ded3cb2 | |
v_mad_f32 v90, v82, v81, 1.0 | |
v_mov_b32_e32 v82, 0xb2951928 | |
v_madak_f32_e32 v89, v89, v81, 0x3f01e2bc | |
v_mac_f32_e32 v90, v88, v89 | |
v_madak_f32_e32 v82, v82, v81, 0xb85ffb93 | |
v_mov_b32_e32 v89, 0x35c55945 | |
v_rsq_f32_e32 v87, v85 | |
v_madak_f32_e32 v89, v89, v81, 0x3a83ca0c | |
v_madak_f32_e32 v82, v82, v81, 0xbc9ded90 | |
s_mov_b32 m0, -1 | |
v_madak_f32_e32 v89, v89, v81, 0x3d8eaf3b | |
v_madak_f32_e32 v91, v82, v81, 0xbf409397 | |
ds_read_b64 v[81:82], v54 offset:256 | |
v_mac_f32_e32 v91, v88, v89 | |
v_mul_f32_e32 v88, v87, v87 | |
v_rcp_f32_e32 v90, v90 | |
v_cndmask_b32_e64 v86, 0, 1.0, vcc | |
v_mul_f32_e32 v89, v88, v88 | |
v_mul_f32_e32 v89, v86, v89 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v82, v69, v82 | |
v_mul_f32_e64 v81, v68, -v81 | |
v_mul_f32_e32 v89, v88, v89 | |
v_cmp_gt_f32_e32 vcc, s26, v85 | |
v_mac_f32_e32 v81, v89, v82 | |
v_cndmask_b32_e64 v82, 0, 1.0, vcc | |
v_mul_f32_e32 v90, v51, v90 | |
v_mul_f32_e32 v82, v82, v88 | |
v_mul_f32_e32 v82, v89, v82 | |
v_mul_f32_e32 v86, v86, v88 | |
v_mul_f32_e32 v90, v91, v90 | |
v_mac_f32_e32 v90, v87, v86 | |
v_mul_f32_e32 v81, v81, v82 | |
v_mac_f32_e32 v81, v90, v83 | |
v_mad_f32 v80, -v81, v79, v80 | |
v_mad_f32 v21, v79, v81, v21 | |
v_mad_f32 v77, -v81, v78, v77 | |
v_mad_f32 v20, v78, v81, v20 | |
v_mad_f32 v75, -v81, v76, v75 | |
v_mad_f32 v19, v76, v81, v19 | |
v_mad_f32 v84, -v65, v0, v84 | |
v_mac_f32_e32 v22, v0, v65 | |
BB5_110: ; %Flow1160 | |
; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
BB5_111: ; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[4:5] | |
v_lshrrev_b32_e32 v65, 21, v67 | |
v_and_b32_e32 v65, 1, v65 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[4:5], exec, s[4:5] | |
; mask branch BB5_115 | |
s_cbranch_execz BB5_115 | |
BB5_112: ; in Loop: Header=BB5_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset0:80 offset1:81 | |
v_cmp_ne_u32_e32 vcc, v58, v74 | |
s_and_b64 s[18:19], exec, s[2:3] | |
s_or_b64 s[18:19], s[18:19], vcc | |
s_and_b64 s[22:23], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v78, v71, v86 | |
s_or_b64 s[18:19], s[22:23], s[18:19] | |
v_subrev_f32_e32 v76, v70, v85 | |
v_mul_f32_e32 v81, v78, v78 | |
v_cndmask_b32_e64 v82, 0, 1.0, s[18:19] | |
v_subrev_f32_e32 v79, v72, v87 | |
v_mac_f32_e32 v81, v76, v76 | |
v_mac_f32_e32 v81, v79, v79 | |
v_mul_f32_e32 v82, s11, v82 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_cmp_lt_f32_e32 vcc, v81, v82 | |
s_and_saveexec_b64 s[18:19], vcc | |
s_xor_b64 s[18:19], exec, s[18:19] | |
; mask branch BB5_114 | |
s_cbranch_execz BB5_114 | |
BB5_113: ; in Loop: Header=BB5_7 Depth=1 | |
v_lshrrev_b32_e32 v82, 21, v63 | |
v_max_f32_e32 v85, 0x34cd15ae, v81 | |
v_mul_f32_e32 v83, v73, v88 | |
v_and_b32_e32 v82, 1, v82 | |
v_mul_f32_e32 v88, v47, v85 | |
v_cmp_eq_u32_e32 vcc, 1, v82 | |
v_mul_f32_e32 v81, v88, v88 | |
v_mov_b32_e32 v82, 0x3a92b707 | |
v_mov_b32_e32 v89, 0x3c739487 | |
v_madak_f32_e32 v82, v82, v81, 0x3ded3cb2 | |
v_mad_f32 v90, v82, v81, 1.0 | |
v_mov_b32_e32 v82, 0xb2951928 | |
v_madak_f32_e32 v89, v89, v81, 0x3f01e2bc | |
v_mac_f32_e32 v90, v88, v89 | |
v_madak_f32_e32 v82, v82, v81, 0xb85ffb93 | |
v_mov_b32_e32 v89, 0x35c55945 | |
v_rsq_f32_e32 v87, v85 | |
v_madak_f32_e32 v89, v89, v81, 0x3a83ca0c | |
v_madak_f32_e32 v82, v82, v81, 0xbc9ded90 | |
s_mov_b32 m0, -1 | |
v_madak_f32_e32 v89, v89, v81, 0x3d8eaf3b | |
v_madak_f32_e32 v91, v82, v81, 0xbf409397 | |
ds_read_b64 v[81:82], v54 offset:320 | |
v_mac_f32_e32 v91, v88, v89 | |
v_mul_f32_e32 v88, v87, v87 | |
v_rcp_f32_e32 v90, v90 | |
v_cndmask_b32_e64 v86, 0, 1.0, vcc | |
v_mul_f32_e32 v89, v88, v88 | |
v_mul_f32_e32 v89, v86, v89 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v82, v69, v82 | |
v_mul_f32_e64 v81, v68, -v81 | |
v_mul_f32_e32 v89, v88, v89 | |
v_cmp_gt_f32_e32 vcc, s26, v85 | |
v_mac_f32_e32 v81, v89, v82 | |
v_cndmask_b32_e64 v82, 0, 1.0, vcc | |
v_mul_f32_e32 v90, v51, v90 | |
v_mul_f32_e32 v82, v82, v88 | |
v_mul_f32_e32 v82, v89, v82 | |
v_mul_f32_e32 v86, v86, v88 | |
v_mul_f32_e32 v90, v91, v90 | |
v_mac_f32_e32 v90, v87, v86 | |
v_mul_f32_e32 v81, v81, v82 | |
v_mac_f32_e32 v81, v90, v83 | |
v_mad_f32 v80, -v81, v79, v80 | |
v_mad_f32 v17, v79, v81, v17 | |
v_mad_f32 v77, -v81, v78, v77 | |
v_mad_f32 v16, v78, v81, v16 | |
v_mad_f32 v75, -v81, v76, v75 | |
v_mad_f32 v15, v76, v81, v15 | |
v_mad_f32 v84, -v65, v0, v84 | |
v_mac_f32_e32 v18, v0, v65 | |
BB5_114: ; %Flow1159 | |
; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
BB5_115: ; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[4:5] | |
v_lshrrev_b32_e32 v65, 22, v67 | |
v_and_b32_e32 v65, 1, v65 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[4:5], exec, s[4:5] | |
; mask branch BB5_119 | |
s_cbranch_execz BB5_119 | |
BB5_116: ; in Loop: Header=BB5_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset0:96 offset1:97 | |
v_cmp_ne_u32_e32 vcc, v57, v74 | |
s_and_b64 s[18:19], exec, s[2:3] | |
s_or_b64 s[18:19], s[18:19], vcc | |
s_and_b64 s[22:23], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v78, v71, v86 | |
s_or_b64 s[18:19], s[22:23], s[18:19] | |
v_subrev_f32_e32 v76, v70, v85 | |
v_mul_f32_e32 v81, v78, v78 | |
v_cndmask_b32_e64 v82, 0, 1.0, s[18:19] | |
v_subrev_f32_e32 v79, v72, v87 | |
v_mac_f32_e32 v81, v76, v76 | |
v_mac_f32_e32 v81, v79, v79 | |
v_mul_f32_e32 v82, s11, v82 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_cmp_lt_f32_e32 vcc, v81, v82 | |
s_and_saveexec_b64 s[18:19], vcc | |
s_xor_b64 s[18:19], exec, s[18:19] | |
; mask branch BB5_118 | |
s_cbranch_execz BB5_118 | |
BB5_117: ; in Loop: Header=BB5_7 Depth=1 | |
v_lshrrev_b32_e32 v82, 22, v63 | |
v_max_f32_e32 v85, 0x34cd15ae, v81 | |
v_mul_f32_e32 v83, v73, v88 | |
v_and_b32_e32 v82, 1, v82 | |
v_mul_f32_e32 v88, v47, v85 | |
v_cmp_eq_u32_e32 vcc, 1, v82 | |
v_mul_f32_e32 v81, v88, v88 | |
v_mov_b32_e32 v82, 0x3a92b707 | |
v_mov_b32_e32 v89, 0x3c739487 | |
v_madak_f32_e32 v82, v82, v81, 0x3ded3cb2 | |
v_mad_f32 v90, v82, v81, 1.0 | |
v_mov_b32_e32 v82, 0xb2951928 | |
v_madak_f32_e32 v89, v89, v81, 0x3f01e2bc | |
v_mac_f32_e32 v90, v88, v89 | |
v_madak_f32_e32 v82, v82, v81, 0xb85ffb93 | |
v_mov_b32_e32 v89, 0x35c55945 | |
v_rsq_f32_e32 v87, v85 | |
v_madak_f32_e32 v89, v89, v81, 0x3a83ca0c | |
v_madak_f32_e32 v82, v82, v81, 0xbc9ded90 | |
s_mov_b32 m0, -1 | |
v_madak_f32_e32 v89, v89, v81, 0x3d8eaf3b | |
v_madak_f32_e32 v91, v82, v81, 0xbf409397 | |
ds_read_b64 v[81:82], v54 offset:384 | |
v_mac_f32_e32 v91, v88, v89 | |
v_mul_f32_e32 v88, v87, v87 | |
v_rcp_f32_e32 v90, v90 | |
v_cndmask_b32_e64 v86, 0, 1.0, vcc | |
v_mul_f32_e32 v89, v88, v88 | |
v_mul_f32_e32 v89, v86, v89 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v82, v69, v82 | |
v_mul_f32_e64 v81, v68, -v81 | |
v_mul_f32_e32 v89, v88, v89 | |
v_cmp_gt_f32_e32 vcc, s26, v85 | |
v_mac_f32_e32 v81, v89, v82 | |
v_cndmask_b32_e64 v82, 0, 1.0, vcc | |
v_mul_f32_e32 v90, v51, v90 | |
v_mul_f32_e32 v82, v82, v88 | |
v_mul_f32_e32 v82, v89, v82 | |
v_mul_f32_e32 v86, v86, v88 | |
v_mul_f32_e32 v90, v91, v90 | |
v_mac_f32_e32 v90, v87, v86 | |
v_mul_f32_e32 v81, v81, v82 | |
v_mac_f32_e32 v81, v90, v83 | |
v_mad_f32 v80, -v81, v79, v80 | |
v_mad_f32 v13, v79, v81, v13 | |
v_mad_f32 v77, -v81, v78, v77 | |
v_mad_f32 v12, v78, v81, v12 | |
v_mad_f32 v75, -v81, v76, v75 | |
v_mad_f32 v11, v76, v81, v11 | |
v_mad_f32 v84, -v65, v0, v84 | |
v_mac_f32_e32 v14, v0, v65 | |
BB5_118: ; %Flow1158 | |
; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
BB5_119: ; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[4:5] | |
v_lshrrev_b32_e32 v65, 23, v67 | |
v_and_b32_e32 v65, 1, v65 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[4:5], exec, s[4:5] | |
; mask branch BB5_123 | |
s_cbranch_execz BB5_123 | |
BB5_120: ; in Loop: Header=BB5_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset0:112 offset1:113 | |
v_cmp_ne_u32_e32 vcc, v33, v74 | |
s_and_b64 s[18:19], exec, s[2:3] | |
s_or_b64 s[18:19], s[18:19], vcc | |
s_and_b64 s[22:23], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v71, v71, v86 | |
s_or_b64 s[18:19], s[22:23], s[18:19] | |
v_subrev_f32_e32 v70, v70, v85 | |
v_mul_f32_e32 v74, v71, v71 | |
v_cndmask_b32_e64 v76, 0, 1.0, s[18:19] | |
v_subrev_f32_e32 v72, v72, v87 | |
v_mac_f32_e32 v74, v70, v70 | |
v_mac_f32_e32 v74, v72, v72 | |
v_mul_f32_e32 v76, s11, v76 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_cmp_lt_f32_e32 vcc, v74, v76 | |
s_and_saveexec_b64 s[18:19], vcc | |
s_xor_b64 s[18:19], exec, s[18:19] | |
; mask branch BB5_122 | |
s_cbranch_execz BB5_122 | |
BB5_121: ; in Loop: Header=BB5_7 Depth=1 | |
v_mul_f32_e32 v79, v73, v88 | |
s_mov_b32 m0, -1 | |
v_mad_f32 v73, -v65, v0, v84 | |
v_max_f32_e32 v78, 0x34cd15ae, v74 | |
ds_read_b64 v[73:74], v54 offset:448 | |
v_mul_f32_e32 v81, v47, v78 | |
v_mul_f32_e32 v82, v81, v81 | |
v_mov_b32_e32 v83, 0x3a92b707 | |
v_lshrrev_b32_e32 v76, 23, v63 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v69, v69, v74 | |
v_mov_b32_e32 v74, 0x3c739487 | |
v_mul_f32_e64 v68, v68, -v73 | |
v_madak_f32_e32 v73, v83, v82, 0x3ded3cb2 | |
v_madak_f32_e32 v74, v74, v82, 0x3f01e2bc | |
v_mad_f32 v73, v73, v82, 1.0 | |
v_mac_f32_e32 v73, v81, v74 | |
v_mov_b32_e32 v74, 0xb2951928 | |
v_rcp_f32_e32 v73, v73 | |
v_madak_f32_e32 v74, v74, v82, 0xb85ffb93 | |
v_mov_b32_e32 v83, 0x35c55945 | |
v_madak_f32_e32 v83, v83, v82, 0x3a83ca0c | |
v_madak_f32_e32 v74, v74, v82, 0xbc9ded90 | |
v_madak_f32_e32 v83, v83, v82, 0x3d8eaf3b | |
v_madak_f32_e32 v74, v74, v82, 0xbf409397 | |
v_mac_f32_e32 v74, v81, v83 | |
v_mul_f32_e32 v73, v51, v73 | |
v_mul_f32_e32 v73, v74, v73 | |
v_rsq_f32_e32 v74, v78 | |
v_and_b32_e32 v76, 1, v76 | |
v_cmp_eq_u32_e32 vcc, 1, v76 | |
v_cndmask_b32_e64 v76, 0, 1.0, vcc | |
v_cmp_gt_f32_e32 vcc, s26, v78 | |
v_mul_f32_e32 v78, v74, v74 | |
v_mul_f32_e32 v81, v76, v78 | |
v_mac_f32_e32 v73, v74, v81 | |
v_mul_f32_e32 v74, v78, v78 | |
v_mul_f32_e32 v74, v76, v74 | |
v_mul_f32_e32 v74, v78, v74 | |
v_mac_f32_e32 v68, v74, v69 | |
v_cndmask_b32_e64 v69, 0, 1.0, vcc | |
v_mul_f32_e32 v69, v69, v78 | |
v_mul_f32_e32 v69, v74, v69 | |
v_mul_f32_e32 v68, v68, v69 | |
v_mac_f32_e32 v68, v73, v79 | |
v_mad_f32 v80, -v68, v72, v80 | |
v_mad_f32 v10, v72, v68, v10 | |
v_mad_f32 v77, -v68, v71, v77 | |
v_mad_f32 v9, v71, v68, v9 | |
v_mad_f32 v75, -v68, v70, v75 | |
v_mad_f32 v8, v70, v68, v8 | |
v_mac_f32_e32 v66, v0, v65 | |
BB5_122: ; %Flow1157 | |
; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
BB5_123: ; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[4:5] | |
s_mov_b32 m0, -1 | |
v_cmp_gt_i32_e32 vcc, 3, v2 | |
ds_write_b32 v5, v75 | |
ds_write_b32 v6, v77 | |
ds_write_b32 v7, v80 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[4:5], exec, s[4:5] | |
s_waitcnt lgkmcnt(0) | |
; mask branch BB5_129 | |
s_cbranch_execz BB5_129 | |
BB5_124: ; in Loop: Header=BB5_7 Depth=1 | |
v_lshlrev_b32_e32 v68, 6, v2 | |
v_add_i32_e32 v65, vcc, v0, v68 | |
v_lshlrev_b32_e32 v65, 2, v65 | |
v_add_i32_e32 v69, vcc, s10, v65 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v65, v69 | |
v_add_i32_e32 v70, vcc, 8, v0 | |
v_or_b32_e32 v71, 1, v0 | |
v_cmp_lt_i32_e32 vcc, v71, v70 | |
s_and_saveexec_b64 s[18:19], vcc | |
s_xor_b64 s[18:19], exec, s[18:19] | |
s_waitcnt lgkmcnt(0) | |
; mask branch BB5_126 | |
s_cbranch_execz BB5_126 | |
BB5_125: ; in Loop: Header=BB5_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b32 v[70:71], v69 offset0:1 offset1:2 | |
v_or_b32_e32 v74, 3, v0 | |
v_add_i32_e32 v68, vcc, v74, v68 | |
v_lshlrev_b32_e32 v68, 2, v68 | |
ds_read2_b32 v[72:73], v69 offset0:3 offset1:4 | |
v_add_i32_e32 v68, vcc, s10, v68 | |
ds_read_b32 v75, v69 offset:28 | |
ds_read2_b32 v[68:69], v68 offset0:2 offset1:3 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v65, v65, v70 | |
v_add_f32_e32 v65, v71, v65 | |
v_add_f32_e32 v65, v72, v65 | |
v_add_f32_e32 v65, v73, v65 | |
v_add_f32_e32 v65, v68, v65 | |
v_add_f32_e32 v65, v69, v65 | |
v_add_f32_e32 v65, v75, v65 | |
BB5_126: ; %._crit_edge.i26 | |
; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
v_mul_lo_i32 v64, v64, 3 | |
v_mov_b32_e32 v72, s13 | |
s_mov_b64 s[40:41], s[12:13] | |
s_mov_b64 s[42:43], s[34:35] | |
v_add_i32_e32 v68, vcc, v64, v2 | |
v_ashrrev_i32_e32 v69, 31, v68 | |
v_lshl_b64 v[70:71], v[68:69], 2 | |
v_add_i32_e32 v68, vcc, s12, v70 | |
v_addc_u32_e32 v69, vcc, v71, v72, vcc | |
buffer_load_dword v71, v[70:71], s[40:43], 0 addr64 | |
s_mov_b64 s[18:19], 0 | |
s_waitcnt vmcnt(0) | |
BB5_127: ; Parent Loop BB5_7 Depth=1 | |
; => This Inner Loop Header: Depth=2 | |
v_add_f32_e32 v70, v65, v71 | |
v_mov_b32_e32 v73, v71 | |
v_mov_b32_e32 v72, v70 | |
buffer_atomic_cmpswap v[72:73], v[68:69], s[32:35], 0 addr64 glc | |
v_mov_b32_e32 v64, -1 | |
v_mov_b32_e32 v64, 0xf000 | |
s_waitcnt vmcnt(0) expcnt(0) | |
v_cmp_eq_u32_e32 vcc, v72, v71 | |
s_or_b64 s[18:19], vcc, s[18:19] | |
v_mov_b32_e32 v71, v72 | |
s_andn2_b64 exec, exec, s[18:19] | |
s_cbranch_execnz BB5_127 | |
; BB#128: ; %Flow1155 | |
; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
BB5_129: ; %Flow1156 | |
; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[4:5] | |
BB5_130: ; %Flow1165 | |
; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[14:15] | |
v_mov_b32_e32 v64, 0xffffff | |
v_cmp_lt_u32_e32 vcc, v64, v67 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[14:15], exec, s[4:5] | |
; mask branch BB5_170 | |
s_cbranch_execz BB5_170 | |
BB5_131: ; in Loop: Header=BB5_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v74, v52 offset:12 | |
s_mov_b64 s[40:41], s[16:17] | |
s_mov_b64 s[42:43], s[34:35] | |
s_waitcnt lgkmcnt(0) | |
v_lshlrev_b32_e32 v64, 3, v74 | |
v_add_i32_e32 v64, vcc, v64, v1 | |
v_ashrrev_i32_e32 v65, 31, v64 | |
v_lshl_b64 v[68:69], v[64:65], 4 | |
v_lshl_b64 v[75:76], v[64:65], 3 | |
buffer_load_dwordx4 v[70:73], v[68:69], s[40:43], 0 addr64 | |
s_mov_b64 s[40:41], s[20:21] | |
buffer_load_dwordx2 v[68:69], v[75:76], s[40:43], 0 addr64 | |
v_lshrrev_b32_e32 v65, 24, v67 | |
v_mov_b32_e32 v75, 0 | |
v_and_b32_e32 v65, 1, v65 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
v_mov_b32_e32 v77, v75 | |
v_mov_b32_e32 v80, v75 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[18:19], exec, s[4:5] | |
; implicit-def: %VGPR81_VGPR82_VGPR83_VGPR84 | |
s_waitcnt vmcnt(0) | |
; mask branch BB5_135 | |
s_cbranch_execz BB5_135 | |
BB5_132: ; in Loop: Header=BB5_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset1:1 | |
v_cmp_ne_u32_e32 vcc, v40, v74 | |
s_and_b64 s[4:5], exec, s[2:3] | |
s_or_b64 s[4:5], s[4:5], vcc | |
s_and_b64 s[22:23], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v78, v71, v86 | |
s_or_b64 s[4:5], s[22:23], s[4:5] | |
v_subrev_f32_e32 v79, v70, v85 | |
v_mul_f32_e32 v81, v78, v78 | |
v_cndmask_b32_e64 v75, 0, 1.0, s[4:5] | |
v_subrev_f32_e32 v76, v72, v87 | |
v_mac_f32_e32 v81, v79, v79 | |
v_mac_f32_e32 v81, v76, v76 | |
v_mul_f32_e32 v75, s11, v75 | |
v_cmp_lt_f32_e32 vcc, v81, v75 | |
v_mov_b32_e32 v75, 0 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_mov_b32_e32 v77, v75 | |
v_mov_b32_e32 v80, v75 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[22:23], exec, s[4:5] | |
; implicit-def: %VGPR81_VGPR82_VGPR83_VGPR84 | |
; mask branch BB5_134 | |
s_cbranch_execz BB5_134 | |
BB5_133: ; in Loop: Header=BB5_7 Depth=1 | |
v_max_f32_e32 v77, 0x34cd15ae, v81 | |
v_mul_f32_e32 v81, v47, v77 | |
v_mul_f32_e32 v82, v81, v81 | |
v_mov_b32_e32 v83, 0x3a92b707 | |
v_madak_f32_e32 v83, v83, v82, 0x3ded3cb2 | |
v_mov_b32_e32 v84, 0x3c739487 | |
v_madak_f32_e32 v84, v84, v82, 0x3f01e2bc | |
v_mad_f32 v83, v83, v82, 1.0 | |
v_mac_f32_e32 v83, v81, v84 | |
v_mov_b32_e32 v84, 0xb2951928 | |
v_madak_f32_e32 v84, v84, v82, 0xb85ffb93 | |
v_mov_b32_e32 v85, 0x35c55945 | |
v_madak_f32_e32 v85, v85, v82, 0x3a83ca0c | |
v_madak_f32_e32 v84, v84, v82, 0xbc9ded90 | |
v_madak_f32_e32 v85, v85, v82, 0x3d8eaf3b | |
v_madak_f32_e32 v82, v84, v82, 0xbf409397 | |
v_cmp_gt_f32_e32 vcc, s26, v77 | |
v_mac_f32_e32 v82, v81, v85 | |
v_rcp_f32_e32 v81, v83 | |
v_rsq_f32_e32 v77, v77 | |
v_lshrrev_b32_e32 v80, 24, v63 | |
v_and_b32_e32 v80, 1, v80 | |
v_cmp_eq_u32_e64 s[4:5], 1, v80 | |
v_mul_f32_e32 v81, v51, v81 | |
v_cndmask_b32_e64 v80, 0, 1.0, s[4:5] | |
v_mul_f32_e32 v83, v77, v77 | |
v_mul_f32_e32 v82, v82, v81 | |
v_mul_f32_e32 v81, v80, v83 | |
v_mac_f32_e32 v82, v77, v81 | |
v_mul_f32_e32 v77, v83, v83 | |
s_mov_b32 m0, -1 | |
v_mul_f32_e32 v77, v80, v77 | |
ds_read_b64 v[80:81], v54 | |
v_mul_f32_e32 v77, v83, v77 | |
v_mul_f32_e32 v75, v73, v88 | |
v_mac_f32_e32 v46, v0, v65 | |
v_mul_f32_e64 v84, v65, -v0 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v81, v69, v81 | |
v_mul_f32_e64 v80, v68, -v80 | |
v_mac_f32_e32 v80, v77, v81 | |
v_cndmask_b32_e64 v81, 0, 1.0, vcc | |
v_mul_f32_e32 v81, v81, v83 | |
v_mul_f32_e32 v77, v77, v81 | |
v_mul_f32_e32 v81, v80, v77 | |
v_mac_f32_e32 v81, v82, v75 | |
v_mad_f32 v45, v76, v81, v45 | |
v_mad_f32 v44, v78, v81, v44 | |
v_mad_f32 v43, v79, v81, v43 | |
v_mul_f32_e64 v80, v81, -v76 | |
v_mul_f32_e64 v77, v81, -v78 | |
v_mul_f32_e64 v75, v81, -v79 | |
BB5_134: ; %Flow1153 | |
; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[22:23] | |
BB5_135: ; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
v_lshrrev_b32_e32 v65, 25, v67 | |
v_and_b32_e32 v65, 1, v65 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[4:5], exec, s[4:5] | |
; mask branch BB5_139 | |
s_cbranch_execz BB5_139 | |
BB5_136: ; in Loop: Header=BB5_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset0:16 offset1:17 | |
v_cmp_ne_u32_e32 vcc, v62, v74 | |
s_and_b64 s[18:19], exec, s[2:3] | |
s_or_b64 s[18:19], s[18:19], vcc | |
s_and_b64 s[22:23], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v78, v71, v86 | |
s_or_b64 s[18:19], s[22:23], s[18:19] | |
v_subrev_f32_e32 v76, v70, v85 | |
v_mul_f32_e32 v81, v78, v78 | |
v_cndmask_b32_e64 v82, 0, 1.0, s[18:19] | |
v_subrev_f32_e32 v79, v72, v87 | |
v_mac_f32_e32 v81, v76, v76 | |
v_mac_f32_e32 v81, v79, v79 | |
v_mul_f32_e32 v82, s11, v82 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_cmp_lt_f32_e32 vcc, v81, v82 | |
s_and_saveexec_b64 s[18:19], vcc | |
s_xor_b64 s[18:19], exec, s[18:19] | |
; mask branch BB5_138 | |
s_cbranch_execz BB5_138 | |
BB5_137: ; in Loop: Header=BB5_7 Depth=1 | |
v_lshrrev_b32_e32 v82, 25, v63 | |
v_max_f32_e32 v85, 0x34cd15ae, v81 | |
v_mul_f32_e32 v83, v73, v88 | |
v_and_b32_e32 v82, 1, v82 | |
v_mul_f32_e32 v88, v47, v85 | |
v_cmp_eq_u32_e32 vcc, 1, v82 | |
v_mul_f32_e32 v81, v88, v88 | |
v_mov_b32_e32 v82, 0x3a92b707 | |
v_mov_b32_e32 v89, 0x3c739487 | |
v_madak_f32_e32 v82, v82, v81, 0x3ded3cb2 | |
v_mad_f32 v90, v82, v81, 1.0 | |
v_mov_b32_e32 v82, 0xb2951928 | |
v_madak_f32_e32 v89, v89, v81, 0x3f01e2bc | |
v_mac_f32_e32 v90, v88, v89 | |
v_madak_f32_e32 v82, v82, v81, 0xb85ffb93 | |
v_mov_b32_e32 v89, 0x35c55945 | |
v_rsq_f32_e32 v87, v85 | |
v_madak_f32_e32 v89, v89, v81, 0x3a83ca0c | |
v_madak_f32_e32 v82, v82, v81, 0xbc9ded90 | |
s_mov_b32 m0, -1 | |
v_madak_f32_e32 v89, v89, v81, 0x3d8eaf3b | |
v_madak_f32_e32 v91, v82, v81, 0xbf409397 | |
ds_read_b64 v[81:82], v54 offset:64 | |
v_mac_f32_e32 v91, v88, v89 | |
v_mul_f32_e32 v88, v87, v87 | |
v_rcp_f32_e32 v90, v90 | |
v_cndmask_b32_e64 v86, 0, 1.0, vcc | |
v_mul_f32_e32 v89, v88, v88 | |
v_mul_f32_e32 v89, v86, v89 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v82, v69, v82 | |
v_mul_f32_e64 v81, v68, -v81 | |
v_mul_f32_e32 v89, v88, v89 | |
v_cmp_gt_f32_e32 vcc, s26, v85 | |
v_mac_f32_e32 v81, v89, v82 | |
v_cndmask_b32_e64 v82, 0, 1.0, vcc | |
v_mul_f32_e32 v90, v51, v90 | |
v_mul_f32_e32 v82, v82, v88 | |
v_mul_f32_e32 v82, v89, v82 | |
v_mul_f32_e32 v86, v86, v88 | |
v_mul_f32_e32 v90, v91, v90 | |
v_mac_f32_e32 v90, v87, v86 | |
v_mul_f32_e32 v81, v81, v82 | |
v_mac_f32_e32 v81, v90, v83 | |
v_mad_f32 v80, -v81, v79, v80 | |
v_mad_f32 v37, v79, v81, v37 | |
v_mad_f32 v77, -v81, v78, v77 | |
v_mad_f32 v36, v78, v81, v36 | |
v_mad_f32 v75, -v81, v76, v75 | |
v_mad_f32 v35, v76, v81, v35 | |
v_mad_f32 v84, -v65, v0, v84 | |
v_mac_f32_e32 v38, v0, v65 | |
BB5_138: ; %Flow1152 | |
; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
BB5_139: ; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[4:5] | |
v_lshrrev_b32_e32 v65, 26, v67 | |
v_and_b32_e32 v65, 1, v65 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[4:5], exec, s[4:5] | |
; mask branch BB5_143 | |
s_cbranch_execz BB5_143 | |
BB5_140: ; in Loop: Header=BB5_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset0:32 offset1:33 | |
v_cmp_ne_u32_e32 vcc, v61, v74 | |
s_and_b64 s[18:19], exec, s[2:3] | |
s_or_b64 s[18:19], s[18:19], vcc | |
s_and_b64 s[22:23], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v78, v71, v86 | |
s_or_b64 s[18:19], s[22:23], s[18:19] | |
v_subrev_f32_e32 v76, v70, v85 | |
v_mul_f32_e32 v81, v78, v78 | |
v_cndmask_b32_e64 v82, 0, 1.0, s[18:19] | |
v_subrev_f32_e32 v79, v72, v87 | |
v_mac_f32_e32 v81, v76, v76 | |
v_mac_f32_e32 v81, v79, v79 | |
v_mul_f32_e32 v82, s11, v82 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_cmp_lt_f32_e32 vcc, v81, v82 | |
s_and_saveexec_b64 s[18:19], vcc | |
s_xor_b64 s[18:19], exec, s[18:19] | |
; mask branch BB5_142 | |
s_cbranch_execz BB5_142 | |
BB5_141: ; in Loop: Header=BB5_7 Depth=1 | |
v_lshrrev_b32_e32 v82, 26, v63 | |
v_max_f32_e32 v85, 0x34cd15ae, v81 | |
v_mul_f32_e32 v83, v73, v88 | |
v_and_b32_e32 v82, 1, v82 | |
v_mul_f32_e32 v88, v47, v85 | |
v_cmp_eq_u32_e32 vcc, 1, v82 | |
v_mul_f32_e32 v81, v88, v88 | |
v_mov_b32_e32 v82, 0x3a92b707 | |
v_mov_b32_e32 v89, 0x3c739487 | |
v_madak_f32_e32 v82, v82, v81, 0x3ded3cb2 | |
v_mad_f32 v90, v82, v81, 1.0 | |
v_mov_b32_e32 v82, 0xb2951928 | |
v_madak_f32_e32 v89, v89, v81, 0x3f01e2bc | |
v_mac_f32_e32 v90, v88, v89 | |
v_madak_f32_e32 v82, v82, v81, 0xb85ffb93 | |
v_mov_b32_e32 v89, 0x35c55945 | |
v_rsq_f32_e32 v87, v85 | |
v_madak_f32_e32 v89, v89, v81, 0x3a83ca0c | |
v_madak_f32_e32 v82, v82, v81, 0xbc9ded90 | |
s_mov_b32 m0, -1 | |
v_madak_f32_e32 v89, v89, v81, 0x3d8eaf3b | |
v_madak_f32_e32 v91, v82, v81, 0xbf409397 | |
ds_read_b64 v[81:82], v54 offset:128 | |
v_mac_f32_e32 v91, v88, v89 | |
v_mul_f32_e32 v88, v87, v87 | |
v_rcp_f32_e32 v90, v90 | |
v_cndmask_b32_e64 v86, 0, 1.0, vcc | |
v_mul_f32_e32 v89, v88, v88 | |
v_mul_f32_e32 v89, v86, v89 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v82, v69, v82 | |
v_mul_f32_e64 v81, v68, -v81 | |
v_mul_f32_e32 v89, v88, v89 | |
v_cmp_gt_f32_e32 vcc, s26, v85 | |
v_mac_f32_e32 v81, v89, v82 | |
v_cndmask_b32_e64 v82, 0, 1.0, vcc | |
v_mul_f32_e32 v90, v51, v90 | |
v_mul_f32_e32 v82, v82, v88 | |
v_mul_f32_e32 v82, v89, v82 | |
v_mul_f32_e32 v86, v86, v88 | |
v_mul_f32_e32 v90, v91, v90 | |
v_mac_f32_e32 v90, v87, v86 | |
v_mul_f32_e32 v81, v81, v82 | |
v_mac_f32_e32 v81, v90, v83 | |
v_mad_f32 v80, -v81, v79, v80 | |
v_mad_f32 v29, v79, v81, v29 | |
v_mad_f32 v77, -v81, v78, v77 | |
v_mad_f32 v28, v78, v81, v28 | |
v_mad_f32 v75, -v81, v76, v75 | |
v_mad_f32 v27, v76, v81, v27 | |
v_mad_f32 v84, -v65, v0, v84 | |
v_mac_f32_e32 v30, v0, v65 | |
BB5_142: ; %Flow1151 | |
; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
BB5_143: ; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[4:5] | |
v_lshrrev_b32_e32 v65, 27, v67 | |
v_and_b32_e32 v65, 1, v65 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[4:5], exec, s[4:5] | |
; mask branch BB5_147 | |
s_cbranch_execz BB5_147 | |
BB5_144: ; in Loop: Header=BB5_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset0:48 offset1:49 | |
v_cmp_ne_u32_e32 vcc, v60, v74 | |
s_and_b64 s[18:19], exec, s[2:3] | |
s_or_b64 s[18:19], s[18:19], vcc | |
s_and_b64 s[22:23], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v78, v71, v86 | |
s_or_b64 s[18:19], s[22:23], s[18:19] | |
v_subrev_f32_e32 v76, v70, v85 | |
v_mul_f32_e32 v81, v78, v78 | |
v_cndmask_b32_e64 v82, 0, 1.0, s[18:19] | |
v_subrev_f32_e32 v79, v72, v87 | |
v_mac_f32_e32 v81, v76, v76 | |
v_mac_f32_e32 v81, v79, v79 | |
v_mul_f32_e32 v82, s11, v82 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_cmp_lt_f32_e32 vcc, v81, v82 | |
s_and_saveexec_b64 s[18:19], vcc | |
s_xor_b64 s[18:19], exec, s[18:19] | |
; mask branch BB5_146 | |
s_cbranch_execz BB5_146 | |
BB5_145: ; in Loop: Header=BB5_7 Depth=1 | |
v_lshrrev_b32_e32 v82, 27, v63 | |
v_max_f32_e32 v85, 0x34cd15ae, v81 | |
v_mul_f32_e32 v83, v73, v88 | |
v_and_b32_e32 v82, 1, v82 | |
v_mul_f32_e32 v88, v47, v85 | |
v_cmp_eq_u32_e32 vcc, 1, v82 | |
v_mul_f32_e32 v81, v88, v88 | |
v_mov_b32_e32 v82, 0x3a92b707 | |
v_mov_b32_e32 v89, 0x3c739487 | |
v_madak_f32_e32 v82, v82, v81, 0x3ded3cb2 | |
v_mad_f32 v90, v82, v81, 1.0 | |
v_mov_b32_e32 v82, 0xb2951928 | |
v_madak_f32_e32 v89, v89, v81, 0x3f01e2bc | |
v_mac_f32_e32 v90, v88, v89 | |
v_madak_f32_e32 v82, v82, v81, 0xb85ffb93 | |
v_mov_b32_e32 v89, 0x35c55945 | |
v_rsq_f32_e32 v87, v85 | |
v_madak_f32_e32 v89, v89, v81, 0x3a83ca0c | |
v_madak_f32_e32 v82, v82, v81, 0xbc9ded90 | |
s_mov_b32 m0, -1 | |
v_madak_f32_e32 v89, v89, v81, 0x3d8eaf3b | |
v_madak_f32_e32 v91, v82, v81, 0xbf409397 | |
ds_read_b64 v[81:82], v54 offset:192 | |
v_mac_f32_e32 v91, v88, v89 | |
v_mul_f32_e32 v88, v87, v87 | |
v_rcp_f32_e32 v90, v90 | |
v_cndmask_b32_e64 v86, 0, 1.0, vcc | |
v_mul_f32_e32 v89, v88, v88 | |
v_mul_f32_e32 v89, v86, v89 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v82, v69, v82 | |
v_mul_f32_e64 v81, v68, -v81 | |
v_mul_f32_e32 v89, v88, v89 | |
v_cmp_gt_f32_e32 vcc, s26, v85 | |
v_mac_f32_e32 v81, v89, v82 | |
v_cndmask_b32_e64 v82, 0, 1.0, vcc | |
v_mul_f32_e32 v90, v51, v90 | |
v_mul_f32_e32 v82, v82, v88 | |
v_mul_f32_e32 v82, v89, v82 | |
v_mul_f32_e32 v86, v86, v88 | |
v_mul_f32_e32 v90, v91, v90 | |
v_mac_f32_e32 v90, v87, v86 | |
v_mul_f32_e32 v81, v81, v82 | |
v_mac_f32_e32 v81, v90, v83 | |
v_mad_f32 v80, -v81, v79, v80 | |
v_mad_f32 v25, v79, v81, v25 | |
v_mad_f32 v77, -v81, v78, v77 | |
v_mad_f32 v24, v78, v81, v24 | |
v_mad_f32 v75, -v81, v76, v75 | |
v_mad_f32 v23, v76, v81, v23 | |
v_mad_f32 v84, -v65, v0, v84 | |
v_mac_f32_e32 v26, v0, v65 | |
BB5_146: ; %Flow1150 | |
; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
BB5_147: ; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[4:5] | |
v_lshrrev_b32_e32 v65, 28, v67 | |
v_and_b32_e32 v65, 1, v65 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[4:5], exec, s[4:5] | |
; mask branch BB5_151 | |
s_cbranch_execz BB5_151 | |
BB5_148: ; in Loop: Header=BB5_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset0:64 offset1:65 | |
v_cmp_ne_u32_e32 vcc, v59, v74 | |
s_and_b64 s[18:19], exec, s[2:3] | |
s_or_b64 s[18:19], s[18:19], vcc | |
s_and_b64 s[22:23], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v78, v71, v86 | |
s_or_b64 s[18:19], s[22:23], s[18:19] | |
v_subrev_f32_e32 v76, v70, v85 | |
v_mul_f32_e32 v81, v78, v78 | |
v_cndmask_b32_e64 v82, 0, 1.0, s[18:19] | |
v_subrev_f32_e32 v79, v72, v87 | |
v_mac_f32_e32 v81, v76, v76 | |
v_mac_f32_e32 v81, v79, v79 | |
v_mul_f32_e32 v82, s11, v82 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_cmp_lt_f32_e32 vcc, v81, v82 | |
s_and_saveexec_b64 s[18:19], vcc | |
s_xor_b64 s[18:19], exec, s[18:19] | |
; mask branch BB5_150 | |
s_cbranch_execz BB5_150 | |
BB5_149: ; in Loop: Header=BB5_7 Depth=1 | |
v_lshrrev_b32_e32 v82, 28, v63 | |
v_max_f32_e32 v85, 0x34cd15ae, v81 | |
v_mul_f32_e32 v83, v73, v88 | |
v_and_b32_e32 v82, 1, v82 | |
v_mul_f32_e32 v88, v47, v85 | |
v_cmp_eq_u32_e32 vcc, 1, v82 | |
v_mul_f32_e32 v81, v88, v88 | |
v_mov_b32_e32 v82, 0x3a92b707 | |
v_mov_b32_e32 v89, 0x3c739487 | |
v_madak_f32_e32 v82, v82, v81, 0x3ded3cb2 | |
v_mad_f32 v90, v82, v81, 1.0 | |
v_mov_b32_e32 v82, 0xb2951928 | |
v_madak_f32_e32 v89, v89, v81, 0x3f01e2bc | |
v_mac_f32_e32 v90, v88, v89 | |
v_madak_f32_e32 v82, v82, v81, 0xb85ffb93 | |
v_mov_b32_e32 v89, 0x35c55945 | |
v_rsq_f32_e32 v87, v85 | |
v_madak_f32_e32 v89, v89, v81, 0x3a83ca0c | |
v_madak_f32_e32 v82, v82, v81, 0xbc9ded90 | |
s_mov_b32 m0, -1 | |
v_madak_f32_e32 v89, v89, v81, 0x3d8eaf3b | |
v_madak_f32_e32 v91, v82, v81, 0xbf409397 | |
ds_read_b64 v[81:82], v54 offset:256 | |
v_mac_f32_e32 v91, v88, v89 | |
v_mul_f32_e32 v88, v87, v87 | |
v_rcp_f32_e32 v90, v90 | |
v_cndmask_b32_e64 v86, 0, 1.0, vcc | |
v_mul_f32_e32 v89, v88, v88 | |
v_mul_f32_e32 v89, v86, v89 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v82, v69, v82 | |
v_mul_f32_e64 v81, v68, -v81 | |
v_mul_f32_e32 v89, v88, v89 | |
v_cmp_gt_f32_e32 vcc, s26, v85 | |
v_mac_f32_e32 v81, v89, v82 | |
v_cndmask_b32_e64 v82, 0, 1.0, vcc | |
v_mul_f32_e32 v90, v51, v90 | |
v_mul_f32_e32 v82, v82, v88 | |
v_mul_f32_e32 v82, v89, v82 | |
v_mul_f32_e32 v86, v86, v88 | |
v_mul_f32_e32 v90, v91, v90 | |
v_mac_f32_e32 v90, v87, v86 | |
v_mul_f32_e32 v81, v81, v82 | |
v_mac_f32_e32 v81, v90, v83 | |
v_mad_f32 v80, -v81, v79, v80 | |
v_mad_f32 v21, v79, v81, v21 | |
v_mad_f32 v77, -v81, v78, v77 | |
v_mad_f32 v20, v78, v81, v20 | |
v_mad_f32 v75, -v81, v76, v75 | |
v_mad_f32 v19, v76, v81, v19 | |
v_mad_f32 v84, -v65, v0, v84 | |
v_mac_f32_e32 v22, v0, v65 | |
BB5_150: ; %Flow1149 | |
; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
BB5_151: ; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[4:5] | |
v_lshrrev_b32_e32 v65, 29, v67 | |
v_and_b32_e32 v65, 1, v65 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[4:5], exec, s[4:5] | |
; mask branch BB5_155 | |
s_cbranch_execz BB5_155 | |
BB5_152: ; in Loop: Header=BB5_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset0:80 offset1:81 | |
v_cmp_ne_u32_e32 vcc, v58, v74 | |
s_and_b64 s[18:19], exec, s[2:3] | |
s_or_b64 s[18:19], s[18:19], vcc | |
s_and_b64 s[22:23], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v78, v71, v86 | |
s_or_b64 s[18:19], s[22:23], s[18:19] | |
v_subrev_f32_e32 v76, v70, v85 | |
v_mul_f32_e32 v81, v78, v78 | |
v_cndmask_b32_e64 v82, 0, 1.0, s[18:19] | |
v_subrev_f32_e32 v79, v72, v87 | |
v_mac_f32_e32 v81, v76, v76 | |
v_mac_f32_e32 v81, v79, v79 | |
v_mul_f32_e32 v82, s11, v82 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_cmp_lt_f32_e32 vcc, v81, v82 | |
s_and_saveexec_b64 s[18:19], vcc | |
s_xor_b64 s[18:19], exec, s[18:19] | |
; mask branch BB5_154 | |
s_cbranch_execz BB5_154 | |
BB5_153: ; in Loop: Header=BB5_7 Depth=1 | |
v_lshrrev_b32_e32 v82, 29, v63 | |
v_max_f32_e32 v85, 0x34cd15ae, v81 | |
v_mul_f32_e32 v83, v73, v88 | |
v_and_b32_e32 v82, 1, v82 | |
v_mul_f32_e32 v88, v47, v85 | |
v_cmp_eq_u32_e32 vcc, 1, v82 | |
v_mul_f32_e32 v81, v88, v88 | |
v_mov_b32_e32 v82, 0x3a92b707 | |
v_mov_b32_e32 v89, 0x3c739487 | |
v_madak_f32_e32 v82, v82, v81, 0x3ded3cb2 | |
v_mad_f32 v90, v82, v81, 1.0 | |
v_mov_b32_e32 v82, 0xb2951928 | |
v_madak_f32_e32 v89, v89, v81, 0x3f01e2bc | |
v_mac_f32_e32 v90, v88, v89 | |
v_madak_f32_e32 v82, v82, v81, 0xb85ffb93 | |
v_mov_b32_e32 v89, 0x35c55945 | |
v_rsq_f32_e32 v87, v85 | |
v_madak_f32_e32 v89, v89, v81, 0x3a83ca0c | |
v_madak_f32_e32 v82, v82, v81, 0xbc9ded90 | |
s_mov_b32 m0, -1 | |
v_madak_f32_e32 v89, v89, v81, 0x3d8eaf3b | |
v_madak_f32_e32 v91, v82, v81, 0xbf409397 | |
ds_read_b64 v[81:82], v54 offset:320 | |
v_mac_f32_e32 v91, v88, v89 | |
v_mul_f32_e32 v88, v87, v87 | |
v_rcp_f32_e32 v90, v90 | |
v_cndmask_b32_e64 v86, 0, 1.0, vcc | |
v_mul_f32_e32 v89, v88, v88 | |
v_mul_f32_e32 v89, v86, v89 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v82, v69, v82 | |
v_mul_f32_e64 v81, v68, -v81 | |
v_mul_f32_e32 v89, v88, v89 | |
v_cmp_gt_f32_e32 vcc, s26, v85 | |
v_mac_f32_e32 v81, v89, v82 | |
v_cndmask_b32_e64 v82, 0, 1.0, vcc | |
v_mul_f32_e32 v90, v51, v90 | |
v_mul_f32_e32 v82, v82, v88 | |
v_mul_f32_e32 v82, v89, v82 | |
v_mul_f32_e32 v86, v86, v88 | |
v_mul_f32_e32 v90, v91, v90 | |
v_mac_f32_e32 v90, v87, v86 | |
v_mul_f32_e32 v81, v81, v82 | |
v_mac_f32_e32 v81, v90, v83 | |
v_mad_f32 v80, -v81, v79, v80 | |
v_mad_f32 v17, v79, v81, v17 | |
v_mad_f32 v77, -v81, v78, v77 | |
v_mad_f32 v16, v78, v81, v16 | |
v_mad_f32 v75, -v81, v76, v75 | |
v_mad_f32 v15, v76, v81, v15 | |
v_mad_f32 v84, -v65, v0, v84 | |
v_mac_f32_e32 v18, v0, v65 | |
BB5_154: ; %Flow1148 | |
; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
BB5_155: ; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[4:5] | |
v_lshrrev_b32_e32 v65, 30, v67 | |
v_and_b32_e32 v65, 1, v65 | |
v_cmp_eq_u32_e32 vcc, 1, v65 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[4:5], exec, s[4:5] | |
; mask branch BB5_159 | |
s_cbranch_execz BB5_159 | |
BB5_156: ; in Loop: Header=BB5_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset0:96 offset1:97 | |
v_cmp_ne_u32_e32 vcc, v57, v74 | |
s_and_b64 s[18:19], exec, s[2:3] | |
s_or_b64 s[18:19], s[18:19], vcc | |
s_and_b64 s[22:23], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v78, v71, v86 | |
s_or_b64 s[18:19], s[22:23], s[18:19] | |
v_subrev_f32_e32 v76, v70, v85 | |
v_mul_f32_e32 v81, v78, v78 | |
v_cndmask_b32_e64 v82, 0, 1.0, s[18:19] | |
v_subrev_f32_e32 v79, v72, v87 | |
v_mac_f32_e32 v81, v76, v76 | |
v_mac_f32_e32 v81, v79, v79 | |
v_mul_f32_e32 v82, s11, v82 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_cmp_lt_f32_e32 vcc, v81, v82 | |
s_and_saveexec_b64 s[18:19], vcc | |
s_xor_b64 s[18:19], exec, s[18:19] | |
; mask branch BB5_158 | |
s_cbranch_execz BB5_158 | |
BB5_157: ; in Loop: Header=BB5_7 Depth=1 | |
v_lshrrev_b32_e32 v82, 30, v63 | |
v_max_f32_e32 v85, 0x34cd15ae, v81 | |
v_mul_f32_e32 v83, v73, v88 | |
v_and_b32_e32 v82, 1, v82 | |
v_mul_f32_e32 v88, v47, v85 | |
v_cmp_eq_u32_e32 vcc, 1, v82 | |
v_mul_f32_e32 v81, v88, v88 | |
v_mov_b32_e32 v82, 0x3a92b707 | |
v_mov_b32_e32 v89, 0x3c739487 | |
v_madak_f32_e32 v82, v82, v81, 0x3ded3cb2 | |
v_mad_f32 v90, v82, v81, 1.0 | |
v_mov_b32_e32 v82, 0xb2951928 | |
v_madak_f32_e32 v89, v89, v81, 0x3f01e2bc | |
v_mac_f32_e32 v90, v88, v89 | |
v_madak_f32_e32 v82, v82, v81, 0xb85ffb93 | |
v_mov_b32_e32 v89, 0x35c55945 | |
v_rsq_f32_e32 v87, v85 | |
v_madak_f32_e32 v89, v89, v81, 0x3a83ca0c | |
v_madak_f32_e32 v82, v82, v81, 0xbc9ded90 | |
s_mov_b32 m0, -1 | |
v_madak_f32_e32 v89, v89, v81, 0x3d8eaf3b | |
v_madak_f32_e32 v91, v82, v81, 0xbf409397 | |
ds_read_b64 v[81:82], v54 offset:384 | |
v_mac_f32_e32 v91, v88, v89 | |
v_mul_f32_e32 v88, v87, v87 | |
v_rcp_f32_e32 v90, v90 | |
v_cndmask_b32_e64 v86, 0, 1.0, vcc | |
v_mul_f32_e32 v89, v88, v88 | |
v_mul_f32_e32 v89, v86, v89 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v82, v69, v82 | |
v_mul_f32_e64 v81, v68, -v81 | |
v_mul_f32_e32 v89, v88, v89 | |
v_cmp_gt_f32_e32 vcc, s26, v85 | |
v_mac_f32_e32 v81, v89, v82 | |
v_cndmask_b32_e64 v82, 0, 1.0, vcc | |
v_mul_f32_e32 v90, v51, v90 | |
v_mul_f32_e32 v82, v82, v88 | |
v_mul_f32_e32 v82, v89, v82 | |
v_mul_f32_e32 v86, v86, v88 | |
v_mul_f32_e32 v90, v91, v90 | |
v_mac_f32_e32 v90, v87, v86 | |
v_mul_f32_e32 v81, v81, v82 | |
v_mac_f32_e32 v81, v90, v83 | |
v_mad_f32 v80, -v81, v79, v80 | |
v_mad_f32 v13, v79, v81, v13 | |
v_mad_f32 v77, -v81, v78, v77 | |
v_mad_f32 v12, v78, v81, v12 | |
v_mad_f32 v75, -v81, v76, v75 | |
v_mad_f32 v11, v76, v81, v11 | |
v_mad_f32 v84, -v65, v0, v84 | |
v_mac_f32_e32 v14, v0, v65 | |
BB5_158: ; %Flow1147 | |
; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
BB5_159: ; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[4:5] | |
v_cmp_gt_i32_e32 vcc, 0, v67 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[4:5], exec, s[4:5] | |
; mask branch BB5_163 | |
s_cbranch_execz BB5_163 | |
BB5_160: ; in Loop: Header=BB5_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[85:88], v53 offset0:112 offset1:113 | |
v_cmp_ne_u32_e32 vcc, v33, v74 | |
s_and_b64 s[18:19], exec, s[2:3] | |
s_or_b64 s[18:19], s[18:19], vcc | |
s_and_b64 s[22:23], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v71, v71, v86 | |
v_subrev_f32_e32 v67, v70, v85 | |
v_mul_f32_e32 v70, v71, v71 | |
s_or_b64 s[18:19], s[22:23], s[18:19] | |
v_cndmask_b32_e64 v74, 0, 1.0, s[18:19] | |
v_subrev_f32_e32 v72, v72, v87 | |
v_mac_f32_e32 v70, v67, v67 | |
v_mac_f32_e32 v70, v72, v72 | |
v_mul_f32_e32 v74, s11, v74 | |
v_subrev_f32_e32 v65, v73, v88 | |
v_cmp_lt_f32_e32 vcc, v70, v74 | |
s_and_saveexec_b64 s[18:19], vcc | |
s_xor_b64 s[18:19], exec, s[18:19] | |
; mask branch BB5_162 | |
s_cbranch_execz BB5_162 | |
BB5_161: ; in Loop: Header=BB5_7 Depth=1 | |
v_mul_f32_e32 v78, v73, v88 | |
s_mov_b32 m0, -1 | |
v_mad_f32 v73, -v65, v0, v84 | |
ds_read_b64 v[73:74], v54 offset:448 | |
v_max_f32_e32 v70, 0x34cd15ae, v70 | |
v_mul_f32_e32 v79, v47, v70 | |
v_mul_f32_e32 v81, v79, v79 | |
v_mov_b32_e32 v82, 0x3a92b707 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v69, v69, v74 | |
v_mov_b32_e32 v74, 0x3c739487 | |
v_mul_f32_e64 v68, v68, -v73 | |
v_madak_f32_e32 v73, v82, v81, 0x3ded3cb2 | |
v_madak_f32_e32 v74, v74, v81, 0x3f01e2bc | |
v_mad_f32 v73, v73, v81, 1.0 | |
v_mac_f32_e32 v73, v79, v74 | |
v_mov_b32_e32 v74, 0xb2951928 | |
v_rcp_f32_e32 v73, v73 | |
v_madak_f32_e32 v74, v74, v81, 0xb85ffb93 | |
v_mov_b32_e32 v82, 0x35c55945 | |
v_rsq_f32_e32 v76, v70 | |
v_madak_f32_e32 v82, v82, v81, 0x3a83ca0c | |
v_madak_f32_e32 v74, v74, v81, 0xbc9ded90 | |
v_madak_f32_e32 v82, v82, v81, 0x3d8eaf3b | |
v_madak_f32_e32 v74, v74, v81, 0xbf409397 | |
v_cmp_gt_i32_e32 vcc, 0, v63 | |
v_mac_f32_e32 v74, v79, v82 | |
v_mul_f32_e32 v73, v51, v73 | |
v_cndmask_b32_e64 v63, 0, 1.0, vcc | |
v_mul_f32_e32 v73, v74, v73 | |
v_mul_f32_e32 v74, v76, v76 | |
v_mul_f32_e32 v79, v63, v74 | |
v_mac_f32_e32 v73, v76, v79 | |
v_mul_f32_e32 v76, v74, v74 | |
v_mul_f32_e32 v63, v63, v76 | |
v_mul_f32_e32 v63, v74, v63 | |
v_cmp_gt_f32_e32 vcc, s26, v70 | |
v_mac_f32_e32 v68, v63, v69 | |
v_cndmask_b32_e64 v69, 0, 1.0, vcc | |
v_mul_f32_e32 v69, v69, v74 | |
v_mul_f32_e32 v63, v63, v69 | |
v_mul_f32_e32 v63, v68, v63 | |
v_mac_f32_e32 v63, v73, v78 | |
v_mad_f32 v80, -v63, v72, v80 | |
v_mad_f32 v10, v72, v63, v10 | |
v_mad_f32 v77, -v63, v71, v77 | |
v_mad_f32 v9, v71, v63, v9 | |
v_mad_f32 v75, -v63, v67, v75 | |
v_mad_f32 v8, v67, v63, v8 | |
v_mac_f32_e32 v66, v0, v65 | |
BB5_162: ; %Flow1146 | |
; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
BB5_163: ; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[4:5] | |
s_mov_b32 m0, -1 | |
v_cmp_gt_i32_e32 vcc, 3, v2 | |
ds_write_b32 v5, v75 | |
ds_write_b32 v6, v77 | |
ds_write_b32 v7, v80 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[4:5], exec, s[4:5] | |
s_waitcnt lgkmcnt(0) | |
; mask branch BB5_169 | |
s_cbranch_execz BB5_169 | |
BB5_164: ; in Loop: Header=BB5_7 Depth=1 | |
v_lshlrev_b32_e32 v65, 6, v2 | |
v_add_i32_e32 v63, vcc, v0, v65 | |
v_lshlrev_b32_e32 v63, 2, v63 | |
v_add_i32_e32 v67, vcc, s10, v63 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v63, v67 | |
v_add_i32_e32 v68, vcc, 8, v0 | |
v_or_b32_e32 v69, 1, v0 | |
v_cmp_lt_i32_e32 vcc, v69, v68 | |
s_and_saveexec_b64 s[18:19], vcc | |
s_xor_b64 s[18:19], exec, s[18:19] | |
s_waitcnt lgkmcnt(0) | |
; mask branch BB5_166 | |
s_cbranch_execz BB5_166 | |
BB5_165: ; in Loop: Header=BB5_7 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b32 v[68:69], v67 offset0:1 offset1:2 | |
v_or_b32_e32 v72, 3, v0 | |
v_add_i32_e32 v65, vcc, v72, v65 | |
v_lshlrev_b32_e32 v65, 2, v65 | |
ds_read2_b32 v[70:71], v67 offset0:3 offset1:4 | |
v_add_i32_e32 v65, vcc, s10, v65 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v63, v63, v68 | |
ds_read_b32 v73, v67 offset:28 | |
ds_read2_b32 v[67:68], v65 offset0:2 offset1:3 | |
v_add_f32_e32 v63, v69, v63 | |
v_add_f32_e32 v63, v70, v63 | |
v_add_f32_e32 v63, v71, v63 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v63, v67, v63 | |
v_add_f32_e32 v63, v68, v63 | |
v_add_f32_e32 v63, v73, v63 | |
BB5_166: ; %._crit_edge.i | |
; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
v_mul_lo_i32 v64, v64, 3 | |
v_mov_b32_e32 v68, s13 | |
s_mov_b64 s[40:41], s[12:13] | |
s_mov_b64 s[42:43], s[34:35] | |
v_add_i32_e32 v64, vcc, v64, v2 | |
v_ashrrev_i32_e32 v65, 31, v64 | |
v_lshl_b64 v[64:65], v[64:65], 2 | |
v_add_i32_e32 v67, vcc, s12, v64 | |
v_addc_u32_e32 v68, vcc, v65, v68, vcc | |
buffer_load_dword v65, v[64:65], s[40:43], 0 addr64 | |
s_mov_b64 s[18:19], 0 | |
s_waitcnt vmcnt(0) | |
BB5_167: ; Parent Loop BB5_7 Depth=1 | |
; => This Inner Loop Header: Depth=2 | |
v_mov_b32_e32 v64, -1 | |
v_add_f32_e32 v64, v63, v65 | |
v_mov_b32_e32 v70, v65 | |
v_mov_b32_e32 v69, v64 | |
buffer_atomic_cmpswap v[69:70], v[67:68], s[32:35], 0 addr64 glc | |
v_mov_b32_e32 v64, 0xf000 | |
s_waitcnt vmcnt(0) expcnt(0) | |
v_cmp_eq_u32_e32 vcc, v69, v65 | |
s_or_b64 s[18:19], vcc, s[18:19] | |
v_mov_b32_e32 v65, v69 | |
s_andn2_b64 exec, exec, s[18:19] | |
s_cbranch_execnz BB5_167 | |
; BB#168: ; %Flow1144 | |
; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[18:19] | |
BB5_169: ; %Flow1145 | |
; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[4:5] | |
BB5_170: ; %Flow1154 | |
; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[14:15] | |
BB5_171: ; %Flow1188 | |
; in Loop: Header=BB5_7 Depth=1 | |
s_or_b64 exec, exec, s[36:37] | |
v_add_i32_e32 v55, vcc, 1, v55 | |
v_addc_u32_e32 v56, vcc, 0, v56, vcc | |
v_cmp_ne_u32_e32 vcc, v55, v34 | |
s_and_b64 vcc, exec, vcc | |
s_cbranch_vccnz BB5_7 | |
BB5_172: ; %Flow1191 | |
s_mov_b32 m0, -1 | |
ds_write_b32 v5, v43 | |
ds_write_b32 v6, v44 | |
ds_write_b32 v7, v45 | |
s_waitcnt lgkmcnt(0) | |
s_barrier | |
s_load_dword s0, s[6:7], 0x32 | |
v_cmp_ne_u32_e32 vcc, 22, v32 | |
v_lshlrev_b32_e32 v18, 2, v39 | |
v_mov_b32_e32 v3, 0 | |
v_lshlrev_b32_e32 v14, 6, v31 | |
s_waitcnt lgkmcnt(0) | |
v_cmp_ne_u32_e64 s[0:1], s0, 0 | |
s_and_b64 s[2:3], s[0:1], vcc | |
v_add_i32_e32 v18, vcc, s10, v18 | |
v_add_i32_e32 v26, vcc, 64, v2 | |
v_add_i32_e32 v22, vcc, 0x80, v2 | |
v_cmp_gt_i32_e64 s[0:1], 4, v1 | |
s_and_saveexec_b64 s[4:5], s[0:1] | |
s_xor_b64 s[4:5], exec, s[4:5] | |
; mask branch BB5_183 | |
s_cbranch_execz BB5_183 | |
BB5_173: | |
s_mov_b32 m0, -1 | |
ds_read_b32 v3, v18 offset:128 | |
ds_read_b32 v30, v18 | |
v_add_i32_e32 v31, vcc, v0, v26 | |
v_lshlrev_b32_e32 v31, 2, v31 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v3, v3, v30 | |
ds_write_b32 v18, v3 | |
v_add_i32_e32 v30, vcc, s10, v31 | |
s_waitcnt lgkmcnt(0) | |
ds_read_b32 v3, v30 offset:128 | |
ds_read_b32 v31, v18 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v3, v3, v31 | |
ds_write_b32 v18, v3 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_i32_e32 v3, vcc, v0, v22 | |
v_lshlrev_b32_e32 v3, 2, v3 | |
v_add_i32_e32 v31, vcc, s10, v3 | |
ds_read_b32 v3, v31 offset:128 | |
ds_read_b32 v32, v18 offset:512 | |
v_cmp_lt_i32_e32 vcc, 1, v1 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v3, v3, v32 | |
v_mov_b32_e32 v32, 0 | |
ds_write_b32 v18, v3 offset:512 | |
s_waitcnt lgkmcnt(0) | |
; implicit-def: %VGPR3 | |
s_and_saveexec_b64 s[6:7], vcc | |
s_xor_b64 s[6:7], exec, s[6:7] | |
; mask branch BB5_175 | |
BB5_174: | |
v_cmp_eq_u32_e32 vcc, 2, v1 | |
v_mov_b32_e32 v3, 0 | |
v_cndmask_b32_e64 v32, 0, -1, vcc | |
BB5_175: ; %Flow1141 | |
s_or_saveexec_b64 s[6:7], s[6:7] | |
s_xor_b64 exec, exec, s[6:7] | |
; mask branch BB5_177 | |
s_cbranch_execz BB5_177 | |
BB5_176: ; %.thread85.i | |
s_mov_b32 m0, -1 | |
ds_read_b32 v32, v18 offset:64 | |
ds_read_b32 v33, v18 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v32, v32, v33 | |
ds_write_b32 v18, v32 | |
ds_read_b32 v30, v30 offset:64 | |
s_waitcnt lgkmcnt(0) | |
ds_read_b32 v32, v18 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v30, v30, v32 | |
ds_write_b32 v18, v30 offset:256 | |
s_waitcnt lgkmcnt(0) | |
ds_read_b32 v30, v31 offset:64 | |
ds_read_b32 v31, v18 offset:512 | |
v_mov_b32_e32 v32, -1 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v30, v30, v31 | |
ds_write_b32 v18, v30 offset:512 | |
s_waitcnt lgkmcnt(0) | |
BB5_177: ; %Flow1142 | |
s_or_b64 exec, exec, s[6:7] | |
v_cmp_ne_u32_e32 vcc, 0, v32 | |
s_and_saveexec_b64 s[6:7], vcc | |
s_xor_b64 s[6:7], exec, s[6:7] | |
; mask branch BB5_182 | |
s_cbranch_execz BB5_182 | |
BB5_178: | |
v_mov_b32_e32 v30, 0xe0 | |
v_mad_i32_i24 v30, v30, v1, v18 | |
s_mov_b32 m0, -1 | |
v_add_i32_e32 v3, vcc, v14, v2 | |
ds_read_b32 v31, v30 | |
ds_read_b32 v30, v30 offset:32 | |
v_mul_lo_i32 v3, v3, 3 | |
s_mov_b32 s19, 0xf000 | |
s_mov_b32 s18, 0 | |
s_mov_b64 s[16:17], s[12:13] | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v30, v31, v30 | |
v_add_i32_e32 v31, vcc, v1, v3 | |
v_ashrrev_i32_e32 v32, 31, v31 | |
v_lshl_b64 v[33:34], v[31:32], 2 | |
v_add_i32_e32 v31, vcc, s12, v33 | |
v_mov_b32_e32 v3, s13 | |
v_addc_u32_e32 v32, vcc, v34, v3, vcc | |
buffer_load_dword v34, v[33:34], s[16:19], 0 addr64 | |
s_mov_b64 s[16:17], 0 | |
s_mov_b64 s[14:15], s[16:17] | |
s_waitcnt vmcnt(0) | |
BB5_179: ; =>This Inner Loop Header: Depth=1 | |
v_add_f32_e32 v33, v30, v34 | |
v_mov_b32_e32 v39, v34 | |
v_mov_b32_e32 v38, v33 | |
buffer_atomic_cmpswap v[38:39], v[31:32], s[16:19], 0 addr64 glc | |
v_mov_b32_e32 v3, -1 | |
v_mov_b32_e32 v3, 0xf000 | |
s_waitcnt vmcnt(0) expcnt(0) | |
v_cmp_eq_u32_e32 vcc, v38, v34 | |
s_or_b64 s[14:15], vcc, s[14:15] | |
v_mov_b32_e32 v34, v38 | |
s_andn2_b64 exec, exec, s[14:15] | |
s_cbranch_execnz BB5_179 | |
; BB#180: ; %atomicAdd_g_f.exit.i | |
s_or_b64 exec, exec, s[14:15] | |
s_and_b64 s[14:15], exec, s[2:3] | |
v_cndmask_b32_e64 v31, 0, 1, s[14:15] | |
v_cmp_ne_u32_e32 vcc, 1, v31 | |
v_mov_b32_e32 v3, 0 | |
s_and_b64 vcc, exec, vcc | |
s_cbranch_vccnz BB5_182 | |
; BB#181: | |
v_mov_b32_e32 v3, v30 | |
BB5_182: ; %Flow1143 | |
s_or_b64 exec, exec, s[6:7] | |
BB5_183: ; %reduce_force_i_pow2.exit | |
s_or_b64 exec, exec, s[4:5] | |
s_mov_b32 m0, -1 | |
s_barrier | |
ds_write_b32 v5, v35 | |
ds_write_b32 v6, v36 | |
ds_write_b32 v7, v37 | |
s_waitcnt lgkmcnt(0) | |
s_barrier | |
s_and_saveexec_b64 s[4:5], s[0:1] | |
s_xor_b64 s[4:5], exec, s[4:5] | |
; mask branch BB5_194 | |
s_cbranch_execz BB5_194 | |
BB5_184: | |
s_mov_b32 m0, -1 | |
ds_read_b32 v30, v18 offset:128 | |
ds_read_b32 v31, v18 | |
v_add_i32_e32 v32, vcc, v0, v26 | |
v_lshlrev_b32_e32 v32, 2, v32 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v30, v30, v31 | |
ds_write_b32 v18, v30 | |
s_waitcnt lgkmcnt(0) | |
v_add_i32_e32 v30, vcc, s10, v32 | |
ds_read_b32 v31, v30 offset:128 | |
ds_read_b32 v32, v18 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v31, v31, v32 | |
ds_write_b32 v18, v31 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_i32_e32 v31, vcc, v0, v22 | |
v_lshlrev_b32_e32 v31, 2, v31 | |
v_add_i32_e32 v31, vcc, s10, v31 | |
ds_read_b32 v32, v31 offset:128 | |
ds_read_b32 v33, v18 offset:512 | |
v_cmp_lt_i32_e32 vcc, 1, v1 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v32, v32, v33 | |
ds_write_b32 v18, v32 offset:512 | |
s_waitcnt lgkmcnt(0) | |
v_mov_b32_e32 v32, 0 | |
s_and_saveexec_b64 s[6:7], vcc | |
s_xor_b64 s[6:7], exec, s[6:7] | |
; mask branch BB5_186 | |
BB5_185: | |
v_cmp_eq_u32_e32 vcc, 2, v1 | |
v_cndmask_b32_e64 v32, 0, -1, vcc | |
BB5_186: ; %Flow1138 | |
s_or_saveexec_b64 s[6:7], s[6:7] | |
s_xor_b64 exec, exec, s[6:7] | |
; mask branch BB5_188 | |
s_cbranch_execz BB5_188 | |
BB5_187: ; %.thread85.i491 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v32, v18 offset:64 | |
ds_read_b32 v33, v18 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v32, v32, v33 | |
ds_write_b32 v18, v32 | |
ds_read_b32 v30, v30 offset:64 | |
s_waitcnt lgkmcnt(0) | |
ds_read_b32 v32, v18 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v30, v30, v32 | |
ds_write_b32 v18, v30 offset:256 | |
s_waitcnt lgkmcnt(0) | |
ds_read_b32 v30, v31 offset:64 | |
ds_read_b32 v31, v18 offset:512 | |
v_mov_b32_e32 v32, -1 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v30, v30, v31 | |
ds_write_b32 v18, v30 offset:512 | |
s_waitcnt lgkmcnt(0) | |
BB5_188: ; %Flow1139 | |
s_or_b64 exec, exec, s[6:7] | |
v_cmp_ne_u32_e32 vcc, 0, v32 | |
s_and_saveexec_b64 s[6:7], vcc | |
s_xor_b64 s[6:7], exec, s[6:7] | |
; mask branch BB5_193 | |
s_cbranch_execz BB5_193 | |
BB5_189: | |
v_or_b32_e32 v30, 8, v14 | |
v_add_i32_e32 v30, vcc, v30, v2 | |
v_mul_lo_i32 v31, v30, 3 | |
v_mov_b32_e32 v30, 0xe0 | |
v_mad_i32_i24 v30, v30, v1, v18 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v32, v30 | |
ds_read_b32 v30, v30 offset:32 | |
v_add_i32_e32 v31, vcc, v1, v31 | |
s_mov_b32 s19, 0xf000 | |
s_mov_b32 s18, 0 | |
s_mov_b64 s[16:17], s[12:13] | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v30, v32, v30 | |
v_ashrrev_i32_e32 v32, 31, v31 | |
v_lshl_b64 v[33:34], v[31:32], 2 | |
v_add_i32_e32 v31, vcc, s12, v33 | |
v_mov_b32_e32 v32, s13 | |
v_addc_u32_e32 v32, vcc, v34, v32, vcc | |
buffer_load_dword v34, v[33:34], s[16:19], 0 addr64 | |
s_mov_b64 s[16:17], 0 | |
s_mov_b64 s[14:15], s[16:17] | |
s_waitcnt vmcnt(0) | |
BB5_190: ; =>This Inner Loop Header: Depth=1 | |
v_mov_b32_e32 v33, -1 | |
v_add_f32_e32 v33, v30, v34 | |
v_mov_b32_e32 v36, v34 | |
v_mov_b32_e32 v35, v33 | |
buffer_atomic_cmpswap v[35:36], v[31:32], s[16:19], 0 addr64 glc | |
v_mov_b32_e32 v33, 0xf000 | |
s_waitcnt vmcnt(0) expcnt(0) | |
v_cmp_eq_u32_e32 vcc, v35, v34 | |
s_or_b64 s[14:15], vcc, s[14:15] | |
v_mov_b32_e32 v34, v35 | |
s_andn2_b64 exec, exec, s[14:15] | |
s_cbranch_execnz BB5_190 | |
; BB#191: ; %atomicAdd_g_f.exit.i479 | |
s_or_b64 exec, exec, s[14:15] | |
s_and_b64 s[14:15], exec, s[2:3] | |
v_cndmask_b32_e64 v31, 0, 1, s[14:15] | |
v_cmp_ne_u32_e32 vcc, 1, v31 | |
s_and_b64 vcc, exec, vcc | |
s_cbranch_vccnz BB5_193 | |
; BB#192: | |
v_add_f32_e32 v3, v30, v3 | |
BB5_193: ; %Flow1140 | |
s_or_b64 exec, exec, s[6:7] | |
BB5_194: ; %reduce_force_i_pow2.exit493 | |
s_or_b64 exec, exec, s[4:5] | |
s_mov_b32 m0, -1 | |
s_barrier | |
ds_write_b32 v5, v27 | |
ds_write_b32 v6, v28 | |
ds_write_b32 v7, v29 | |
s_waitcnt lgkmcnt(0) | |
s_barrier | |
s_and_saveexec_b64 s[4:5], s[0:1] | |
s_xor_b64 s[4:5], exec, s[4:5] | |
; mask branch BB5_205 | |
s_cbranch_execz BB5_205 | |
BB5_195: | |
s_mov_b32 m0, -1 | |
ds_read_b32 v27, v18 offset:128 | |
ds_read_b32 v28, v18 | |
v_add_i32_e32 v29, vcc, v0, v26 | |
v_lshlrev_b32_e32 v29, 2, v29 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v27, v27, v28 | |
ds_write_b32 v18, v27 | |
s_waitcnt lgkmcnt(0) | |
v_add_i32_e32 v27, vcc, s10, v29 | |
ds_read_b32 v28, v27 offset:128 | |
ds_read_b32 v29, v18 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v28, v28, v29 | |
ds_write_b32 v18, v28 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_i32_e32 v28, vcc, v0, v22 | |
v_lshlrev_b32_e32 v28, 2, v28 | |
v_add_i32_e32 v28, vcc, s10, v28 | |
ds_read_b32 v29, v28 offset:128 | |
ds_read_b32 v30, v18 offset:512 | |
v_cmp_lt_i32_e32 vcc, 1, v1 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v29, v29, v30 | |
ds_write_b32 v18, v29 offset:512 | |
s_waitcnt lgkmcnt(0) | |
v_mov_b32_e32 v29, 0 | |
s_and_saveexec_b64 s[6:7], vcc | |
s_xor_b64 s[6:7], exec, s[6:7] | |
; mask branch BB5_197 | |
BB5_196: | |
v_cmp_eq_u32_e32 vcc, 2, v1 | |
v_cndmask_b32_e64 v29, 0, -1, vcc | |
BB5_197: ; %Flow1135 | |
s_or_saveexec_b64 s[6:7], s[6:7] | |
s_xor_b64 exec, exec, s[6:7] | |
; mask branch BB5_199 | |
s_cbranch_execz BB5_199 | |
BB5_198: ; %.thread85.i442 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v29, v18 offset:64 | |
ds_read_b32 v30, v18 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v29, v29, v30 | |
ds_write_b32 v18, v29 | |
ds_read_b32 v27, v27 offset:64 | |
s_waitcnt lgkmcnt(0) | |
ds_read_b32 v29, v18 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v27, v27, v29 | |
ds_write_b32 v18, v27 offset:256 | |
s_waitcnt lgkmcnt(0) | |
ds_read_b32 v27, v28 offset:64 | |
ds_read_b32 v28, v18 offset:512 | |
v_mov_b32_e32 v29, -1 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v27, v27, v28 | |
ds_write_b32 v18, v27 offset:512 | |
s_waitcnt lgkmcnt(0) | |
BB5_199: ; %Flow1136 | |
s_or_b64 exec, exec, s[6:7] | |
v_cmp_ne_u32_e32 vcc, 0, v29 | |
s_and_saveexec_b64 s[6:7], vcc | |
s_xor_b64 s[6:7], exec, s[6:7] | |
; mask branch BB5_204 | |
s_cbranch_execz BB5_204 | |
BB5_200: | |
v_or_b32_e32 v27, 16, v14 | |
v_add_i32_e32 v27, vcc, v27, v2 | |
v_mul_lo_i32 v28, v27, 3 | |
v_mov_b32_e32 v27, 0xe0 | |
v_mad_i32_i24 v27, v27, v1, v18 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v29, v27 | |
ds_read_b32 v27, v27 offset:32 | |
v_add_i32_e32 v28, vcc, v1, v28 | |
s_mov_b32 s19, 0xf000 | |
s_mov_b32 s18, 0 | |
s_mov_b64 s[16:17], s[12:13] | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v27, v29, v27 | |
v_ashrrev_i32_e32 v29, 31, v28 | |
v_lshl_b64 v[30:31], v[28:29], 2 | |
v_add_i32_e32 v28, vcc, s12, v30 | |
v_mov_b32_e32 v29, s13 | |
v_addc_u32_e32 v29, vcc, v31, v29, vcc | |
buffer_load_dword v31, v[30:31], s[16:19], 0 addr64 | |
s_mov_b64 s[16:17], 0 | |
s_mov_b64 s[14:15], s[16:17] | |
s_waitcnt vmcnt(0) | |
BB5_201: ; =>This Inner Loop Header: Depth=1 | |
v_mov_b32_e32 v30, -1 | |
v_add_f32_e32 v30, v27, v31 | |
v_mov_b32_e32 v33, v31 | |
v_mov_b32_e32 v32, v30 | |
buffer_atomic_cmpswap v[32:33], v[28:29], s[16:19], 0 addr64 glc | |
v_mov_b32_e32 v30, 0xf000 | |
s_waitcnt vmcnt(0) expcnt(0) | |
v_cmp_eq_u32_e32 vcc, v32, v31 | |
s_or_b64 s[14:15], vcc, s[14:15] | |
v_mov_b32_e32 v31, v32 | |
s_andn2_b64 exec, exec, s[14:15] | |
s_cbranch_execnz BB5_201 | |
; BB#202: ; %atomicAdd_g_f.exit.i430 | |
s_or_b64 exec, exec, s[14:15] | |
s_and_b64 s[14:15], exec, s[2:3] | |
v_cndmask_b32_e64 v28, 0, 1, s[14:15] | |
v_cmp_ne_u32_e32 vcc, 1, v28 | |
s_and_b64 vcc, exec, vcc | |
s_cbranch_vccnz BB5_204 | |
; BB#203: | |
v_add_f32_e32 v3, v27, v3 | |
BB5_204: ; %Flow1137 | |
s_or_b64 exec, exec, s[6:7] | |
BB5_205: ; %reduce_force_i_pow2.exit444 | |
s_or_b64 exec, exec, s[4:5] | |
s_mov_b32 m0, -1 | |
s_barrier | |
ds_write_b32 v5, v23 | |
ds_write_b32 v6, v24 | |
ds_write_b32 v7, v25 | |
s_waitcnt lgkmcnt(0) | |
s_barrier | |
s_and_saveexec_b64 s[4:5], s[0:1] | |
s_xor_b64 s[4:5], exec, s[4:5] | |
; mask branch BB5_216 | |
s_cbranch_execz BB5_216 | |
BB5_206: | |
s_mov_b32 m0, -1 | |
ds_read_b32 v23, v18 offset:128 | |
ds_read_b32 v24, v18 | |
v_add_i32_e32 v25, vcc, v0, v26 | |
v_lshlrev_b32_e32 v25, 2, v25 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v23, v23, v24 | |
ds_write_b32 v18, v23 | |
s_waitcnt lgkmcnt(0) | |
v_add_i32_e32 v23, vcc, s10, v25 | |
ds_read_b32 v24, v23 offset:128 | |
ds_read_b32 v25, v18 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v24, v24, v25 | |
ds_write_b32 v18, v24 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_i32_e32 v24, vcc, v0, v22 | |
v_lshlrev_b32_e32 v24, 2, v24 | |
v_add_i32_e32 v24, vcc, s10, v24 | |
ds_read_b32 v25, v24 offset:128 | |
ds_read_b32 v27, v18 offset:512 | |
v_cmp_lt_i32_e32 vcc, 1, v1 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v25, v25, v27 | |
ds_write_b32 v18, v25 offset:512 | |
s_waitcnt lgkmcnt(0) | |
v_mov_b32_e32 v25, 0 | |
s_and_saveexec_b64 s[6:7], vcc | |
s_xor_b64 s[6:7], exec, s[6:7] | |
; mask branch BB5_208 | |
BB5_207: | |
v_cmp_eq_u32_e32 vcc, 2, v1 | |
v_cndmask_b32_e64 v25, 0, -1, vcc | |
BB5_208: ; %Flow1132 | |
s_or_saveexec_b64 s[6:7], s[6:7] | |
s_xor_b64 exec, exec, s[6:7] | |
; mask branch BB5_210 | |
s_cbranch_execz BB5_210 | |
BB5_209: ; %.thread85.i393 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v25, v18 offset:64 | |
ds_read_b32 v27, v18 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v25, v25, v27 | |
ds_write_b32 v18, v25 | |
ds_read_b32 v23, v23 offset:64 | |
s_waitcnt lgkmcnt(0) | |
ds_read_b32 v25, v18 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v23, v23, v25 | |
ds_write_b32 v18, v23 offset:256 | |
s_waitcnt lgkmcnt(0) | |
ds_read_b32 v23, v24 offset:64 | |
ds_read_b32 v24, v18 offset:512 | |
v_mov_b32_e32 v25, -1 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v23, v23, v24 | |
ds_write_b32 v18, v23 offset:512 | |
s_waitcnt lgkmcnt(0) | |
BB5_210: ; %Flow1133 | |
s_or_b64 exec, exec, s[6:7] | |
v_cmp_ne_u32_e32 vcc, 0, v25 | |
s_and_saveexec_b64 s[6:7], vcc | |
s_xor_b64 s[6:7], exec, s[6:7] | |
; mask branch BB5_215 | |
s_cbranch_execz BB5_215 | |
BB5_211: | |
v_or_b32_e32 v23, 24, v14 | |
v_add_i32_e32 v23, vcc, v23, v2 | |
v_mul_lo_i32 v24, v23, 3 | |
v_mov_b32_e32 v23, 0xe0 | |
v_mad_i32_i24 v23, v23, v1, v18 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v25, v23 | |
ds_read_b32 v23, v23 offset:32 | |
v_add_i32_e32 v24, vcc, v1, v24 | |
v_mov_b32_e32 v28, s13 | |
s_mov_b32 s19, 0xf000 | |
s_mov_b32 s18, 0 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v23, v25, v23 | |
v_ashrrev_i32_e32 v25, 31, v24 | |
v_lshl_b64 v[24:25], v[24:25], 2 | |
v_add_i32_e32 v27, vcc, s12, v24 | |
s_mov_b64 s[16:17], s[12:13] | |
v_addc_u32_e32 v28, vcc, v25, v28, vcc | |
buffer_load_dword v25, v[24:25], s[16:19], 0 addr64 | |
s_mov_b64 s[16:17], 0 | |
s_mov_b64 s[14:15], s[16:17] | |
s_waitcnt vmcnt(0) | |
BB5_212: ; =>This Inner Loop Header: Depth=1 | |
v_mov_b32_e32 v24, -1 | |
v_add_f32_e32 v24, v23, v25 | |
v_mov_b32_e32 v30, v25 | |
v_mov_b32_e32 v29, v24 | |
buffer_atomic_cmpswap v[29:30], v[27:28], s[16:19], 0 addr64 glc | |
v_mov_b32_e32 v24, 0xf000 | |
s_waitcnt vmcnt(0) expcnt(0) | |
v_cmp_eq_u32_e32 vcc, v29, v25 | |
s_or_b64 s[14:15], vcc, s[14:15] | |
v_mov_b32_e32 v25, v29 | |
s_andn2_b64 exec, exec, s[14:15] | |
s_cbranch_execnz BB5_212 | |
; BB#213: ; %atomicAdd_g_f.exit.i381 | |
s_or_b64 exec, exec, s[14:15] | |
s_and_b64 s[14:15], exec, s[2:3] | |
v_cndmask_b32_e64 v24, 0, 1, s[14:15] | |
v_cmp_ne_u32_e32 vcc, 1, v24 | |
s_and_b64 vcc, exec, vcc | |
s_cbranch_vccnz BB5_215 | |
; BB#214: | |
v_add_f32_e32 v3, v23, v3 | |
BB5_215: ; %Flow1134 | |
s_or_b64 exec, exec, s[6:7] | |
BB5_216: ; %reduce_force_i_pow2.exit395 | |
s_or_b64 exec, exec, s[4:5] | |
s_mov_b32 m0, -1 | |
s_barrier | |
ds_write_b32 v5, v19 | |
ds_write_b32 v6, v20 | |
ds_write_b32 v7, v21 | |
s_waitcnt lgkmcnt(0) | |
s_barrier | |
s_and_saveexec_b64 s[4:5], s[0:1] | |
s_xor_b64 s[4:5], exec, s[4:5] | |
; mask branch BB5_227 | |
s_cbranch_execz BB5_227 | |
BB5_217: | |
s_mov_b32 m0, -1 | |
ds_read_b32 v19, v18 offset:128 | |
ds_read_b32 v20, v18 | |
v_add_i32_e32 v21, vcc, v0, v26 | |
v_lshlrev_b32_e32 v21, 2, v21 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v19, v19, v20 | |
ds_write_b32 v18, v19 | |
s_waitcnt lgkmcnt(0) | |
v_add_i32_e32 v19, vcc, s10, v21 | |
ds_read_b32 v20, v19 offset:128 | |
ds_read_b32 v21, v18 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v20, v20, v21 | |
ds_write_b32 v18, v20 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_i32_e32 v20, vcc, v0, v22 | |
v_lshlrev_b32_e32 v20, 2, v20 | |
v_add_i32_e32 v20, vcc, s10, v20 | |
ds_read_b32 v21, v20 offset:128 | |
ds_read_b32 v23, v18 offset:512 | |
v_cmp_lt_i32_e32 vcc, 1, v1 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v21, v21, v23 | |
ds_write_b32 v18, v21 offset:512 | |
s_waitcnt lgkmcnt(0) | |
v_mov_b32_e32 v21, 0 | |
s_and_saveexec_b64 s[6:7], vcc | |
s_xor_b64 s[6:7], exec, s[6:7] | |
; mask branch BB5_219 | |
BB5_218: | |
v_cmp_eq_u32_e32 vcc, 2, v1 | |
v_cndmask_b32_e64 v21, 0, -1, vcc | |
BB5_219: ; %Flow1129 | |
s_or_saveexec_b64 s[6:7], s[6:7] | |
s_xor_b64 exec, exec, s[6:7] | |
; mask branch BB5_221 | |
s_cbranch_execz BB5_221 | |
BB5_220: ; %.thread85.i344 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v21, v18 offset:64 | |
ds_read_b32 v23, v18 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v21, v21, v23 | |
ds_write_b32 v18, v21 | |
ds_read_b32 v19, v19 offset:64 | |
s_waitcnt lgkmcnt(0) | |
ds_read_b32 v21, v18 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v19, v19, v21 | |
ds_write_b32 v18, v19 offset:256 | |
s_waitcnt lgkmcnt(0) | |
ds_read_b32 v19, v20 offset:64 | |
ds_read_b32 v20, v18 offset:512 | |
v_mov_b32_e32 v21, -1 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v19, v19, v20 | |
ds_write_b32 v18, v19 offset:512 | |
s_waitcnt lgkmcnt(0) | |
BB5_221: ; %Flow1130 | |
s_or_b64 exec, exec, s[6:7] | |
v_cmp_ne_u32_e32 vcc, 0, v21 | |
s_and_saveexec_b64 s[6:7], vcc | |
s_xor_b64 s[6:7], exec, s[6:7] | |
; mask branch BB5_226 | |
s_cbranch_execz BB5_226 | |
BB5_222: | |
v_or_b32_e32 v19, 32, v14 | |
v_add_i32_e32 v19, vcc, v19, v2 | |
v_mul_lo_i32 v20, v19, 3 | |
v_mov_b32_e32 v19, 0xe0 | |
v_mad_i32_i24 v19, v19, v1, v18 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v21, v19 | |
ds_read_b32 v19, v19 offset:32 | |
v_add_i32_e32 v20, vcc, v1, v20 | |
v_mov_b32_e32 v23, s13 | |
s_mov_b32 s19, 0xf000 | |
s_mov_b32 s18, 0 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v19, v21, v19 | |
v_ashrrev_i32_e32 v21, 31, v20 | |
v_lshl_b64 v[20:21], v[20:21], 2 | |
v_add_i32_e32 v27, vcc, s12, v20 | |
s_mov_b64 s[16:17], s[12:13] | |
v_addc_u32_e32 v28, vcc, v21, v23, vcc | |
buffer_load_dword v21, v[20:21], s[16:19], 0 addr64 | |
s_mov_b64 s[16:17], 0 | |
s_mov_b64 s[14:15], s[16:17] | |
s_waitcnt vmcnt(0) | |
BB5_223: ; =>This Inner Loop Header: Depth=1 | |
v_mov_b32_e32 v20, -1 | |
v_add_f32_e32 v20, v19, v21 | |
v_mov_b32_e32 v24, v21 | |
v_mov_b32_e32 v23, v20 | |
buffer_atomic_cmpswap v[23:24], v[27:28], s[16:19], 0 addr64 glc | |
v_mov_b32_e32 v20, 0xf000 | |
s_waitcnt vmcnt(0) expcnt(0) | |
v_cmp_eq_u32_e32 vcc, v23, v21 | |
s_or_b64 s[14:15], vcc, s[14:15] | |
v_mov_b32_e32 v21, v23 | |
s_andn2_b64 exec, exec, s[14:15] | |
s_cbranch_execnz BB5_223 | |
; BB#224: ; %atomicAdd_g_f.exit.i332 | |
s_or_b64 exec, exec, s[14:15] | |
s_and_b64 s[14:15], exec, s[2:3] | |
v_cndmask_b32_e64 v20, 0, 1, s[14:15] | |
v_cmp_ne_u32_e32 vcc, 1, v20 | |
s_and_b64 vcc, exec, vcc | |
s_cbranch_vccnz BB5_226 | |
; BB#225: | |
v_add_f32_e32 v3, v19, v3 | |
BB5_226: ; %Flow1131 | |
s_or_b64 exec, exec, s[6:7] | |
BB5_227: ; %reduce_force_i_pow2.exit346 | |
s_or_b64 exec, exec, s[4:5] | |
s_mov_b32 m0, -1 | |
s_barrier | |
ds_write_b32 v5, v15 | |
ds_write_b32 v6, v16 | |
ds_write_b32 v7, v17 | |
s_waitcnt lgkmcnt(0) | |
s_barrier | |
s_and_saveexec_b64 s[4:5], s[0:1] | |
s_xor_b64 s[4:5], exec, s[4:5] | |
; mask branch BB5_238 | |
s_cbranch_execz BB5_238 | |
BB5_228: | |
s_mov_b32 m0, -1 | |
ds_read_b32 v15, v18 offset:128 | |
ds_read_b32 v16, v18 | |
v_add_i32_e32 v17, vcc, v0, v26 | |
v_lshlrev_b32_e32 v17, 2, v17 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v15, v15, v16 | |
ds_write_b32 v18, v15 | |
s_waitcnt lgkmcnt(0) | |
v_add_i32_e32 v15, vcc, s10, v17 | |
ds_read_b32 v16, v15 offset:128 | |
ds_read_b32 v17, v18 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v16, v16, v17 | |
ds_write_b32 v18, v16 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_i32_e32 v16, vcc, v0, v22 | |
v_lshlrev_b32_e32 v16, 2, v16 | |
v_add_i32_e32 v16, vcc, s10, v16 | |
ds_read_b32 v17, v16 offset:128 | |
ds_read_b32 v19, v18 offset:512 | |
v_cmp_lt_i32_e32 vcc, 1, v1 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v17, v17, v19 | |
ds_write_b32 v18, v17 offset:512 | |
s_waitcnt lgkmcnt(0) | |
v_mov_b32_e32 v17, 0 | |
s_and_saveexec_b64 s[6:7], vcc | |
s_xor_b64 s[6:7], exec, s[6:7] | |
; mask branch BB5_230 | |
BB5_229: | |
v_cmp_eq_u32_e32 vcc, 2, v1 | |
v_cndmask_b32_e64 v17, 0, -1, vcc | |
BB5_230: ; %Flow1126 | |
s_or_saveexec_b64 s[6:7], s[6:7] | |
s_xor_b64 exec, exec, s[6:7] | |
; mask branch BB5_232 | |
s_cbranch_execz BB5_232 | |
BB5_231: ; %.thread85.i295 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v17, v18 offset:64 | |
ds_read_b32 v19, v18 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v17, v17, v19 | |
ds_write_b32 v18, v17 | |
ds_read_b32 v15, v15 offset:64 | |
s_waitcnt lgkmcnt(0) | |
ds_read_b32 v17, v18 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v15, v15, v17 | |
ds_write_b32 v18, v15 offset:256 | |
s_waitcnt lgkmcnt(0) | |
ds_read_b32 v15, v16 offset:64 | |
ds_read_b32 v16, v18 offset:512 | |
v_mov_b32_e32 v17, -1 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v15, v15, v16 | |
ds_write_b32 v18, v15 offset:512 | |
s_waitcnt lgkmcnt(0) | |
BB5_232: ; %Flow1127 | |
s_or_b64 exec, exec, s[6:7] | |
v_cmp_ne_u32_e32 vcc, 0, v17 | |
s_and_saveexec_b64 s[6:7], vcc | |
s_xor_b64 s[6:7], exec, s[6:7] | |
; mask branch BB5_237 | |
s_cbranch_execz BB5_237 | |
BB5_233: | |
v_or_b32_e32 v15, 40, v14 | |
v_add_i32_e32 v15, vcc, v15, v2 | |
v_mul_lo_i32 v16, v15, 3 | |
v_mov_b32_e32 v15, 0xe0 | |
v_mad_i32_i24 v15, v15, v1, v18 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v17, v15 | |
ds_read_b32 v15, v15 offset:32 | |
v_add_i32_e32 v16, vcc, v1, v16 | |
v_mov_b32_e32 v19, s13 | |
s_mov_b32 s19, 0xf000 | |
s_mov_b32 s18, 0 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v15, v17, v15 | |
v_ashrrev_i32_e32 v17, 31, v16 | |
v_lshl_b64 v[16:17], v[16:17], 2 | |
v_add_i32_e32 v27, vcc, s12, v16 | |
s_mov_b64 s[16:17], s[12:13] | |
v_addc_u32_e32 v28, vcc, v17, v19, vcc | |
buffer_load_dword v17, v[16:17], s[16:19], 0 addr64 | |
s_mov_b64 s[16:17], 0 | |
s_mov_b64 s[14:15], s[16:17] | |
s_waitcnt vmcnt(0) | |
BB5_234: ; =>This Inner Loop Header: Depth=1 | |
v_mov_b32_e32 v16, -1 | |
v_add_f32_e32 v16, v15, v17 | |
v_mov_b32_e32 v20, v17 | |
v_mov_b32_e32 v19, v16 | |
buffer_atomic_cmpswap v[19:20], v[27:28], s[16:19], 0 addr64 glc | |
v_mov_b32_e32 v16, 0xf000 | |
s_waitcnt vmcnt(0) expcnt(0) | |
v_cmp_eq_u32_e32 vcc, v19, v17 | |
s_or_b64 s[14:15], vcc, s[14:15] | |
v_mov_b32_e32 v17, v19 | |
s_andn2_b64 exec, exec, s[14:15] | |
s_cbranch_execnz BB5_234 | |
; BB#235: ; %atomicAdd_g_f.exit.i283 | |
s_or_b64 exec, exec, s[14:15] | |
s_and_b64 s[14:15], exec, s[2:3] | |
v_cndmask_b32_e64 v16, 0, 1, s[14:15] | |
v_cmp_ne_u32_e32 vcc, 1, v16 | |
s_and_b64 vcc, exec, vcc | |
s_cbranch_vccnz BB5_237 | |
; BB#236: | |
v_add_f32_e32 v3, v15, v3 | |
BB5_237: ; %Flow1128 | |
s_or_b64 exec, exec, s[6:7] | |
BB5_238: ; %reduce_force_i_pow2.exit297 | |
s_or_b64 exec, exec, s[4:5] | |
s_mov_b32 m0, -1 | |
s_barrier | |
ds_write_b32 v5, v11 | |
ds_write_b32 v6, v12 | |
ds_write_b32 v7, v13 | |
s_waitcnt lgkmcnt(0) | |
s_barrier | |
s_and_saveexec_b64 s[4:5], s[0:1] | |
s_xor_b64 s[4:5], exec, s[4:5] | |
; mask branch BB5_249 | |
s_cbranch_execz BB5_249 | |
BB5_239: | |
s_mov_b32 m0, -1 | |
ds_read_b32 v11, v18 offset:128 | |
ds_read_b32 v12, v18 | |
v_add_i32_e32 v13, vcc, v0, v26 | |
v_lshlrev_b32_e32 v13, 2, v13 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v11, v11, v12 | |
ds_write_b32 v18, v11 | |
s_waitcnt lgkmcnt(0) | |
v_add_i32_e32 v11, vcc, s10, v13 | |
ds_read_b32 v12, v11 offset:128 | |
ds_read_b32 v13, v18 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v12, v12, v13 | |
ds_write_b32 v18, v12 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_i32_e32 v12, vcc, v0, v22 | |
v_lshlrev_b32_e32 v12, 2, v12 | |
v_add_i32_e32 v12, vcc, s10, v12 | |
ds_read_b32 v13, v12 offset:128 | |
ds_read_b32 v15, v18 offset:512 | |
v_cmp_lt_i32_e32 vcc, 1, v1 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v13, v13, v15 | |
ds_write_b32 v18, v13 offset:512 | |
s_waitcnt lgkmcnt(0) | |
v_mov_b32_e32 v13, 0 | |
s_and_saveexec_b64 s[6:7], vcc | |
s_xor_b64 s[6:7], exec, s[6:7] | |
; mask branch BB5_241 | |
BB5_240: | |
v_cmp_eq_u32_e32 vcc, 2, v1 | |
v_cndmask_b32_e64 v13, 0, -1, vcc | |
BB5_241: ; %Flow1123 | |
s_or_saveexec_b64 s[6:7], s[6:7] | |
s_xor_b64 exec, exec, s[6:7] | |
; mask branch BB5_243 | |
s_cbranch_execz BB5_243 | |
BB5_242: ; %.thread85.i246 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v13, v18 offset:64 | |
ds_read_b32 v15, v18 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v13, v13, v15 | |
ds_write_b32 v18, v13 | |
ds_read_b32 v11, v11 offset:64 | |
s_waitcnt lgkmcnt(0) | |
ds_read_b32 v13, v18 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v11, v11, v13 | |
ds_write_b32 v18, v11 offset:256 | |
s_waitcnt lgkmcnt(0) | |
ds_read_b32 v11, v12 offset:64 | |
ds_read_b32 v12, v18 offset:512 | |
v_mov_b32_e32 v13, -1 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v11, v11, v12 | |
ds_write_b32 v18, v11 offset:512 | |
s_waitcnt lgkmcnt(0) | |
BB5_243: ; %Flow1124 | |
s_or_b64 exec, exec, s[6:7] | |
v_cmp_ne_u32_e32 vcc, 0, v13 | |
s_and_saveexec_b64 s[6:7], vcc | |
s_xor_b64 s[6:7], exec, s[6:7] | |
; mask branch BB5_248 | |
s_cbranch_execz BB5_248 | |
BB5_244: | |
v_or_b32_e32 v11, 48, v14 | |
v_add_i32_e32 v11, vcc, v11, v2 | |
v_mul_lo_i32 v12, v11, 3 | |
v_mov_b32_e32 v11, 0xe0 | |
v_mad_i32_i24 v11, v11, v1, v18 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v13, v11 | |
ds_read_b32 v11, v11 offset:32 | |
v_add_i32_e32 v12, vcc, v1, v12 | |
v_mov_b32_e32 v15, s13 | |
s_mov_b32 s19, 0xf000 | |
s_mov_b32 s18, 0 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v11, v13, v11 | |
v_ashrrev_i32_e32 v13, 31, v12 | |
v_lshl_b64 v[12:13], v[12:13], 2 | |
v_add_i32_e32 v27, vcc, s12, v12 | |
s_mov_b64 s[16:17], s[12:13] | |
v_addc_u32_e32 v28, vcc, v13, v15, vcc | |
buffer_load_dword v13, v[12:13], s[16:19], 0 addr64 | |
s_mov_b64 s[16:17], 0 | |
s_mov_b64 s[14:15], s[16:17] | |
s_waitcnt vmcnt(0) | |
BB5_245: ; =>This Inner Loop Header: Depth=1 | |
v_mov_b32_e32 v12, -1 | |
v_add_f32_e32 v12, v11, v13 | |
v_mov_b32_e32 v16, v13 | |
v_mov_b32_e32 v15, v12 | |
buffer_atomic_cmpswap v[15:16], v[27:28], s[16:19], 0 addr64 glc | |
v_mov_b32_e32 v12, 0xf000 | |
s_waitcnt vmcnt(0) expcnt(0) | |
v_cmp_eq_u32_e32 vcc, v15, v13 | |
s_or_b64 s[14:15], vcc, s[14:15] | |
v_mov_b32_e32 v13, v15 | |
s_andn2_b64 exec, exec, s[14:15] | |
s_cbranch_execnz BB5_245 | |
; BB#246: ; %atomicAdd_g_f.exit.i234 | |
s_or_b64 exec, exec, s[14:15] | |
s_and_b64 s[14:15], exec, s[2:3] | |
v_cndmask_b32_e64 v12, 0, 1, s[14:15] | |
v_cmp_ne_u32_e32 vcc, 1, v12 | |
s_and_b64 vcc, exec, vcc | |
s_cbranch_vccnz BB5_248 | |
; BB#247: | |
v_add_f32_e32 v3, v11, v3 | |
BB5_248: ; %Flow1125 | |
s_or_b64 exec, exec, s[6:7] | |
BB5_249: ; %reduce_force_i_pow2.exit248 | |
s_or_b64 exec, exec, s[4:5] | |
s_mov_b32 m0, -1 | |
s_barrier | |
ds_write_b32 v5, v8 | |
ds_write_b32 v6, v9 | |
ds_write_b32 v7, v10 | |
s_waitcnt lgkmcnt(0) | |
s_barrier | |
s_and_saveexec_b64 s[4:5], s[0:1] | |
s_xor_b64 s[0:1], exec, s[4:5] | |
; mask branch BB5_260 | |
s_cbranch_execz BB5_260 | |
BB5_250: | |
s_mov_b32 m0, -1 | |
ds_read_b32 v5, v18 offset:128 | |
ds_read_b32 v6, v18 | |
v_add_i32_e32 v7, vcc, v0, v26 | |
v_lshlrev_b32_e32 v7, 2, v7 | |
v_add_i32_e32 v0, vcc, v0, v22 | |
v_lshlrev_b32_e32 v0, 2, v0 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v5, v5, v6 | |
ds_write_b32 v18, v5 | |
s_waitcnt lgkmcnt(0) | |
v_add_i32_e32 v5, vcc, s10, v7 | |
ds_read_b32 v6, v5 offset:128 | |
ds_read_b32 v7, v18 offset:256 | |
v_add_i32_e32 v0, vcc, s10, v0 | |
v_cmp_lt_i32_e32 vcc, 1, v1 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v6, v6, v7 | |
ds_write_b32 v18, v6 offset:256 | |
s_waitcnt lgkmcnt(0) | |
ds_read_b32 v6, v0 offset:128 | |
ds_read_b32 v7, v18 offset:512 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v6, v6, v7 | |
ds_write_b32 v18, v6 offset:512 | |
s_waitcnt lgkmcnt(0) | |
v_mov_b32_e32 v6, 0 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[4:5], exec, s[4:5] | |
; mask branch BB5_252 | |
BB5_251: | |
v_cmp_eq_u32_e32 vcc, 2, v1 | |
v_cndmask_b32_e64 v6, 0, -1, vcc | |
BB5_252: ; %Flow1120 | |
s_or_saveexec_b64 s[4:5], s[4:5] | |
s_xor_b64 exec, exec, s[4:5] | |
; mask branch BB5_254 | |
s_cbranch_execz BB5_254 | |
BB5_253: ; %.thread85.i197 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v6, v18 offset:64 | |
ds_read_b32 v7, v18 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v6, v6, v7 | |
ds_write_b32 v18, v6 | |
ds_read_b32 v5, v5 offset:64 | |
s_waitcnt lgkmcnt(0) | |
ds_read_b32 v6, v18 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v5, v5, v6 | |
ds_write_b32 v18, v5 offset:256 | |
ds_read_b32 v0, v0 offset:64 | |
s_waitcnt lgkmcnt(0) | |
ds_read_b32 v5, v18 offset:512 | |
v_mov_b32_e32 v6, -1 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v0, v0, v5 | |
ds_write_b32 v18, v0 offset:512 | |
s_waitcnt lgkmcnt(0) | |
BB5_254: ; %Flow1121 | |
s_or_b64 exec, exec, s[4:5] | |
v_cmp_ne_u32_e32 vcc, 0, v6 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[4:5], exec, s[4:5] | |
; mask branch BB5_259 | |
s_cbranch_execz BB5_259 | |
BB5_255: | |
v_or_b32_e32 v0, 56, v14 | |
v_add_i32_e32 v0, vcc, v0, v2 | |
v_mul_lo_i32 v2, v0, 3 | |
v_mov_b32_e32 v0, 0xe0 | |
v_mad_i32_i24 v0, v0, v1, v18 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v5, v0 | |
ds_read_b32 v0, v0 offset:32 | |
s_mov_b32 s15, 0xf000 | |
s_mov_b32 s14, 0 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v0, v5, v0 | |
v_add_i32_e32 v5, vcc, v1, v2 | |
v_ashrrev_i32_e32 v6, 31, v5 | |
v_lshl_b64 v[7:8], v[5:6], 2 | |
v_add_i32_e32 v5, vcc, s12, v7 | |
v_mov_b32_e32 v2, s13 | |
v_addc_u32_e32 v6, vcc, v8, v2, vcc | |
buffer_load_dword v8, v[7:8], s[12:15], 0 addr64 | |
s_mov_b64 s[12:13], 0 | |
s_mov_b64 s[6:7], s[12:13] | |
s_waitcnt vmcnt(0) | |
BB5_256: ; =>This Inner Loop Header: Depth=1 | |
v_add_f32_e32 v7, v0, v8 | |
v_mov_b32_e32 v10, v8 | |
v_mov_b32_e32 v9, v7 | |
buffer_atomic_cmpswap v[9:10], v[5:6], s[12:15], 0 addr64 glc | |
v_mov_b32_e32 v2, -1 | |
v_mov_b32_e32 v2, 0xf000 | |
s_waitcnt vmcnt(0) expcnt(0) | |
v_cmp_eq_u32_e32 vcc, v9, v8 | |
s_or_b64 s[6:7], vcc, s[6:7] | |
v_mov_b32_e32 v8, v9 | |
s_andn2_b64 exec, exec, s[6:7] | |
s_cbranch_execnz BB5_256 | |
; BB#257: ; %atomicAdd_g_f.exit.i185 | |
s_or_b64 exec, exec, s[6:7] | |
s_and_b64 s[6:7], exec, s[2:3] | |
v_cndmask_b32_e64 v2, 0, 1, s[6:7] | |
v_cmp_ne_u32_e32 vcc, 1, v2 | |
s_and_b64 vcc, exec, vcc | |
s_cbranch_vccnz BB5_259 | |
; BB#258: | |
v_add_f32_e32 v3, v0, v3 | |
BB5_259: ; %Flow1122 | |
s_or_b64 exec, exec, s[4:5] | |
BB5_260: ; %reduce_force_i_pow2.exit199 | |
s_or_b64 exec, exec, s[0:1] | |
s_barrier | |
v_cmp_gt_u32_e32 vcc, 3, v1 | |
s_and_b64 s[0:1], exec, s[2:3] | |
s_and_b64 s[0:1], vcc, s[0:1] | |
s_and_saveexec_b64 s[2:3], s[0:1] | |
s_xor_b64 s[0:1], exec, s[2:3] | |
; mask branch BB5_264 | |
s_cbranch_execz BB5_264 | |
BB5_261: | |
v_add_i32_e32 v0, vcc, v4, v1 | |
v_mov_b32_e32 v1, 0 | |
v_lshl_b64 v[0:1], v[0:1], 2 | |
v_add_i32_e32 v4, vcc, s8, v0 | |
v_mov_b32_e32 v2, s9 | |
s_mov_b32 s11, 0xf000 | |
s_mov_b32 s10, 0 | |
v_addc_u32_e32 v5, vcc, v1, v2, vcc | |
buffer_load_dword v1, v[0:1], s[8:11], 0 addr64 | |
s_mov_b64 s[8:9], 0 | |
s_mov_b64 s[2:3], s[8:9] | |
s_waitcnt vmcnt(0) | |
BB5_262: ; =>This Inner Loop Header: Depth=1 | |
v_mov_b32_e32 v0, -1 | |
v_add_f32_e32 v0, v3, v1 | |
v_mov_b32_e32 v7, v1 | |
v_mov_b32_e32 v6, v0 | |
buffer_atomic_cmpswap v[6:7], v[4:5], s[8:11], 0 addr64 glc | |
v_mov_b32_e32 v0, 0xf000 | |
s_waitcnt vmcnt(0) expcnt(0) | |
v_cmp_eq_u32_e32 vcc, v6, v1 | |
s_or_b64 s[2:3], vcc, s[2:3] | |
v_mov_b32_e32 v1, v6 | |
s_andn2_b64 exec, exec, s[2:3] | |
s_cbranch_execnz BB5_262 | |
; BB#263: ; %Flow | |
s_or_b64 exec, exec, s[2:3] | |
BB5_264: ; %Flow1119 | |
s_or_b64 exec, exec, s[0:1] | |
s_endpgm | |
.Lfunc_end5: | |
.size nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_F_opencl, .Lfunc_end5-nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_F_opencl | |
.section .AMDGPU.csdata | |
; Kernel info: | |
; codeLenInByte = 21480 | |
; NumSgprs: 46 | |
; NumVgprs: 92 | |
; FloatMode: 192 | |
; IeeeMode: 1 | |
; ScratchSize: 0 | |
; LDSByteSize: 0 bytes/workgroup (compile time only) | |
; SGPRBlocks: 5 | |
; VGPRBlocks: 22 | |
; NumSGPRsForWavesPerEU: 46 | |
; NumVGPRsForWavesPerEU: 92 | |
; ReservedVGPRFirst: 0 | |
; ReservedVGPRCount: 0 | |
; COMPUTE_PGM_RSRC2:USER_SGPR: 8 | |
; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 | |
; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 | |
; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 | |
; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 1 | |
.section .AMDGPU.config | |
.long 47176 | |
.long 11272668 | |
.long 47180 | |
.long 2192 | |
.long 47200 | |
.long 0 | |
.long 4 | |
.long 0 | |
.long 8 | |
.long 0 | |
.text | |
.globl nbnxn_kernel_ElecEw_VdwLJCombGeom_VF_opencl | |
.p2align 8 | |
.type nbnxn_kernel_ElecEw_VdwLJCombGeom_VF_opencl,@function | |
.amdgpu_hsa_kernel nbnxn_kernel_ElecEw_VdwLJCombGeom_VF_opencl | |
nbnxn_kernel_ElecEw_VdwLJCombGeom_VF_opencl: ; @nbnxn_kernel_ElecEw_VdwLJCombGeom_VF_opencl | |
.amd_kernel_code_t | |
amd_code_version_major = 1 | |
amd_code_version_minor = 0 | |
amd_machine_kind = 1 | |
amd_machine_version_major = 7 | |
amd_machine_version_minor = 0 | |
amd_machine_version_stepping = 1 | |
kernel_code_entry_byte_offset = 256 | |
kernel_code_prefetch_byte_size = 0 | |
max_scratch_backing_memory_byte_size = 0 | |
granulated_workitem_vgpr_count = 28 | |
granulated_wavefront_sgpr_count = 7 | |
priority = 0 | |
float_mode = 192 | |
priv = 0 | |
enable_dx10_clamp = 1 | |
debug_mode = 0 | |
enable_ieee_mode = 1 | |
enable_sgpr_private_segment_wave_byte_offset = 0 | |
user_sgpr_count = 8 | |
enable_sgpr_workgroup_id_x = 1 | |
enable_sgpr_workgroup_id_y = 0 | |
enable_sgpr_workgroup_id_z = 0 | |
enable_sgpr_workgroup_info = 0 | |
enable_vgpr_workitem_id = 1 | |
enable_exception_msb = 0 | |
granulated_lds_size = 0 | |
enable_exception = 0 | |
enable_sgpr_private_segment_buffer = 1 | |
enable_sgpr_dispatch_ptr = 1 | |
enable_sgpr_queue_ptr = 0 | |
enable_sgpr_kernarg_segment_ptr = 1 | |
enable_sgpr_dispatch_id = 0 | |
enable_sgpr_flat_scratch_init = 0 | |
enable_sgpr_private_segment_size = 0 | |
enable_sgpr_grid_workgroup_count_x = 0 | |
enable_sgpr_grid_workgroup_count_y = 0 | |
enable_sgpr_grid_workgroup_count_z = 0 | |
enable_ordered_append_gds = 0 | |
private_element_size = 1 | |
is_ptr64 = 1 | |
is_dynamic_callstack = 0 | |
is_debug_enabled = 0 | |
is_xnack_enabled = 0 | |
workitem_private_segment_byte_size = 0 | |
workgroup_group_segment_byte_size = 0 | |
gds_segment_byte_size = 0 | |
kernarg_segment_byte_size = 232 | |
workgroup_fbarrier_count = 0 | |
wavefront_sgpr_count = 58 | |
workitem_vgpr_count = 116 | |
reserved_vgpr_first = 0 | |
reserved_vgpr_count = 0 | |
reserved_sgpr_first = 0 | |
reserved_sgpr_count = 0 | |
debug_wavefront_private_segment_offset_sgpr = 0 | |
debug_private_segment_buffer_sgpr = 0 | |
kernarg_segment_alignment = 4 | |
group_segment_alignment = 4 | |
private_segment_alignment = 4 | |
wavefront_size = 6 | |
call_convention = 0 | |
runtime_loader_kernel_symbol = 0 | |
.end_amd_kernel_code_t | |
; BB#0: | |
s_load_dwordx2 s[20:21], s[6:7], 0x2c | |
s_mov_b32 s9, 0 | |
s_lshl_b64 s[0:1], s[8:9], 4 | |
v_mov_b32_e32 v4, s1 | |
v_mov_b32_e32 v3, s0 | |
s_mov_b32 s23, 0xf000 | |
s_mov_b32 s22, s9 | |
s_waitcnt lgkmcnt(0) | |
buffer_load_dwordx4 v[36:39], v[3:4], s[20:23], 0 addr64 | |
v_mov_b32_e32 v2, v0 | |
s_load_dwordx2 s[0:1], s[6:7], 0x24 | |
s_load_dwordx2 s[32:33], s[6:7], 0x18 | |
s_mov_b64 s[34:35], s[22:23] | |
s_mov_b64 s[2:3], s[22:23] | |
s_load_dword s14, s[6:7], 0x33 | |
s_load_dwordx2 s[36:37], s[6:7], 0x22 | |
s_mov_b32 m0, -1 | |
s_mov_b64 s[38:39], s[22:23] | |
s_load_dword s18, s[6:7], 0x5 | |
s_waitcnt vmcnt(0) | |
v_lshlrev_b32_e32 v41, 3, v36 | |
v_mul_lo_i32 v4, v37, 3 | |
v_add_i32_e32 v0, vcc, v1, v41 | |
v_lshlrev_b32_e32 v0, 3, v0 | |
v_add_i32_e32 v9, vcc, v2, v0 | |
v_ashrrev_i32_e32 v10, 31, v9 | |
v_ashrrev_i32_e32 v5, 31, v4 | |
v_lshl_b64 v[11:12], v[4:5], 2 | |
v_lshl_b64 v[6:7], v[9:10], 4 | |
s_waitcnt lgkmcnt(0) | |
buffer_load_dwordx4 v[5:8], v[6:7], s[32:35], 0 addr64 | |
buffer_load_dwordx2 v[13:14], v[11:12], s[0:3], 0 addr64 | |
buffer_load_dword v0, v[11:12], s[0:3], 0 addr64 offset:8 | |
s_load_dword s2, s[6:7], 0x2 | |
v_lshlrev_b32_e32 v11, 3, v1 | |
v_add_i32_e32 v40, vcc, v2, v11 | |
s_load_dword s0, s[4:5], 0x1 | |
s_add_i32 s4, s14, 0x420 | |
s_waitcnt lgkmcnt(0) | |
s_and_b32 s0, s0, 0xffff | |
s_waitcnt vmcnt(1) | |
v_add_f32_e32 v15, v6, v14 | |
v_add_f32_e32 v14, v5, v13 | |
s_waitcnt vmcnt(0) | |
v_add_f32_e32 v5, v7, v0 | |
v_lshlrev_b32_e32 v0, 4, v40 | |
v_add_i32_e32 v3, vcc, s14, v0 | |
v_mul_f32_e32 v6, s2, v8 | |
ds_write2_b64 v3, v[14:15], v[5:6] offset1:1 | |
s_waitcnt lgkmcnt(0) | |
v_lshl_b64 v[5:6], v[9:10], 3 | |
buffer_load_dwordx2 v[5:6], v[5:6], s[36:39], 0 addr64 | |
v_mad_u32_u24 v0, s0, v1, v2 | |
v_lshlrev_b32_e32 v7, 3, v40 | |
v_add_i32_e32 v7, vcc, s4, v7 | |
v_or_b32_e32 v3, 32, v0 | |
v_lshrrev_b32_e32 v46, 5, v0 | |
v_cmp_eq_u32_e32 vcc, 32, v3 | |
s_waitcnt vmcnt(0) | |
ds_write_b64 v7, v[5:6] | |
s_and_saveexec_b64 s[0:1], vcc | |
s_xor_b64 s[0:1], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
; mask branch BB6_2 | |
BB6_1: | |
v_lshlrev_b32_e32 v3, 2, v46 | |
v_add_i32_e32 v3, vcc, s14, v3 | |
v_mov_b32_e32 v5, 0 | |
s_mov_b32 m0, -1 | |
ds_write_b32 v3, v5 offset:2336 | |
s_waitcnt lgkmcnt(0) | |
BB6_2: | |
s_or_b64 exec, exec, s[0:1] | |
s_barrier | |
s_load_dwordx2 s[40:41], s[6:7], 0x2e | |
v_cmp_ne_u32_e32 vcc, 22, v37 | |
s_and_b64 vcc, exec, vcc | |
v_mov_b32_e32 v3, 0 | |
v_cmp_eq_u32_e64 s[0:1], 22, v37 | |
s_waitcnt lgkmcnt(0) | |
s_mov_b64 vcc, vcc | |
s_cbranch_vccnz BB6_5 | |
; BB#3: | |
v_ashrrev_i32_e32 v6, 31, v38 | |
v_mov_b32_e32 v5, v38 | |
s_mov_b32 s43, 0xf000 | |
s_mov_b32 s42, 0 | |
v_lshl_b64 v[5:6], v[5:6], 5 | |
buffer_load_dword v5, v[5:6], s[40:43], 0 addr64 | |
s_waitcnt vmcnt(0) | |
v_cmp_ne_u32_e32 vcc, v5, v41 | |
s_and_b64 vcc, exec, vcc | |
v_mov_b32_e32 v5, v3 | |
s_cbranch_vccnz BB6_6 | |
; BB#4: ; %.preheader549.preheader | |
v_lshlrev_b32_e32 v5, 4, v2 | |
v_add_i32_e32 v9, vcc, s14, v5 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[5:8], v9 offset0:1 offset1:17 | |
s_waitcnt lgkmcnt(0) | |
v_mov_b32_e32 v5, 0x41000000 | |
ds_read2_b64 v[12:15], v9 offset0:33 offset1:49 | |
v_mul_f32_e32 v5, s2, v5 | |
ds_read2_b64 v[16:19], v9 offset0:65 offset1:81 | |
v_mad_f32 v10, v6, v6, 0 | |
v_mov_b32_e32 v6, 0x6f800000 | |
v_cmp_lt_f32_e64 vcc, v6, |v5| | |
v_mov_b32_e32 v6, 0x2f800000 | |
s_waitcnt lgkmcnt(0) | |
v_cndmask_b32_e32 v12, 1.0, v6, vcc | |
v_mac_f32_e32 v10, v8, v8 | |
v_mul_f32_e32 v5, v12, v5 | |
v_mac_f32_e32 v10, v13, v13 | |
v_rcp_f32_e32 v13, v5 | |
ds_read2_b64 v[5:8], v9 offset0:97 offset1:113 | |
v_mac_f32_e32 v10, v15, v15 | |
v_mac_f32_e32 v10, v17, v17 | |
v_mac_f32_e32 v10, v19, v19 | |
s_waitcnt lgkmcnt(0) | |
v_mac_f32_e32 v10, v6, v6 | |
v_mac_f32_e32 v10, v8, v8 | |
v_mul_f32_e32 v5, v13, v10 | |
v_mov_b32_e32 v6, 0xbf106ebb | |
v_mul_f32_e32 v5, v5, v12 | |
v_mul_f32_e32 v6, s18, v6 | |
v_mul_f32_e32 v5, v5, v6 | |
s_branch BB6_6 | |
BB6_5: | |
v_mov_b32_e32 v5, v3 | |
BB6_6: ; %.preheader548 | |
s_load_dwordx2 s[28:29], s[6:7], 0x1a | |
v_cmp_lt_i32_e32 vcc, v38, v39 | |
v_mov_b32_e32 v17, -1 | |
s_and_b64 vcc, exec, vcc | |
s_waitcnt lgkmcnt(0) | |
s_mov_b64 vcc, vcc | |
s_cbranch_vccnz BB6_8 | |
; BB#7: ; %.preheader548.._crit_edge_crit_edge | |
v_mov_b32_e32 v8, 0 | |
v_lshlrev_b32_e32 v6, 2, v0 | |
v_mov_b32_e32 v9, v8 | |
v_mov_b32_e32 v10, v8 | |
v_add_i32_e32 v7, vcc, s14, v6 | |
v_mov_b32_e32 v51, v11 | |
v_mov_b32_e32 v16, v11 | |
v_add_i32_e32 v6, vcc, 0x620, v7 | |
v_add_i32_e32 v12, vcc, 0x820, v7 | |
v_add_i32_e32 v7, vcc, 0x720, v7 | |
v_mov_b32_e32 v17, 0 | |
v_mov_b32_e32 v50, v10 | |
v_mov_b32_e32 v49, v9 | |
v_mov_b32_e32 v48, v8 | |
v_mov_b32_e32 v15, v10 | |
v_mov_b32_e32 v14, v9 | |
v_mov_b32_e32 v13, v8 | |
s_branch BB6_9 | |
BB6_8: | |
; implicit-def: %VGPR8 | |
; implicit-def: %VGPR48_VGPR49_VGPR50_VGPR51 | |
; implicit-def: %VGPR6 | |
; implicit-def: %VGPR12 | |
; implicit-def: %VGPR13_VGPR14_VGPR15_VGPR16 | |
; implicit-def: %VGPR7 | |
BB6_9: ; %Flow1256 | |
s_load_dwordx2 s[24:25], s[6:7], 0x20 | |
s_load_dwordx2 s[20:21], s[6:7], 0x1c | |
s_load_dwordx2 s[16:17], s[6:7], 0x1e | |
v_cmp_ne_u32_e32 vcc, 0, v17 | |
v_cndmask_b32_e64 v9, 0, 1, vcc | |
v_mov_b32_e32 v42, v48 | |
v_mov_b32_e32 v32, v48 | |
v_mov_b32_e32 v28, v48 | |
v_mov_b32_e32 v24, v48 | |
v_mov_b32_e32 v20, v48 | |
v_mov_b32_e32 v16, v48 | |
v_cmp_ne_u32_e32 vcc, 1, v9 | |
s_movk_i32 s5, 0x620 | |
s_add_i32 s15, s14, s5 | |
s_and_b64 vcc, exec, vcc | |
v_mov_b32_e32 v43, v49 | |
v_mov_b32_e32 v44, v50 | |
v_mov_b32_e32 v45, v51 | |
v_mov_b32_e32 v33, v49 | |
v_mov_b32_e32 v34, v50 | |
v_mov_b32_e32 v35, v51 | |
v_mov_b32_e32 v29, v49 | |
v_mov_b32_e32 v30, v50 | |
v_mov_b32_e32 v31, v51 | |
v_mov_b32_e32 v25, v49 | |
v_mov_b32_e32 v26, v50 | |
v_mov_b32_e32 v27, v51 | |
v_mov_b32_e32 v21, v49 | |
v_mov_b32_e32 v22, v50 | |
v_mov_b32_e32 v23, v51 | |
v_mov_b32_e32 v17, v49 | |
v_mov_b32_e32 v18, v50 | |
v_mov_b32_e32 v19, v51 | |
s_waitcnt lgkmcnt(0) | |
s_mov_b64 vcc, vcc | |
s_cbranch_vccnz BB6_176 | |
; BB#10: ; %.lr.ph | |
v_or_b32_e32 v6, 4, v1 | |
v_mov_b32_e32 v13, 0 | |
v_cmp_eq_u32_e32 vcc, 4, v6 | |
v_cmp_gt_u32_e64 s[2:3], 4, v2 | |
s_and_b64 s[48:49], s[2:3], vcc | |
v_add_i32_e32 v6, vcc, v1, v2 | |
v_and_b32_e32 v8, 4, v1 | |
s_load_dword s19, s[6:7], 0x6 | |
s_load_dword s22, s[6:7], 0x9 | |
s_load_dword s23, s[6:7], 0xf | |
s_load_dword s26, s[6:7], 0x12 | |
s_load_dwordx2 s[42:43], s[6:7], 0x30 | |
v_mov_b32_e32 v14, v13 | |
v_mov_b32_e32 v15, v13 | |
v_mov_b32_e32 v19, v16 | |
s_add_i32 s8, s14, 0x400 | |
v_lshlrev_b32_e32 v6, 2, v6 | |
v_lshlrev_b32_e32 v8, 2, v8 | |
v_add_i32_e32 v10, vcc, s8, v6 | |
v_lshlrev_b32_e32 v6, 2, v0 | |
v_add_i32_e32 v54, vcc, s8, v8 | |
v_lshlrev_b32_e32 v8, 4, v2 | |
v_mov_b32_e32 v18, v15 | |
v_mov_b32_e32 v17, v14 | |
v_mov_b32_e32 v16, v13 | |
v_add_i32_e32 v12, vcc, s14, v6 | |
v_mov_b32_e32 v23, v16 | |
v_mov_b32_e32 v27, v16 | |
v_mov_b32_e32 v31, v16 | |
v_mov_b32_e32 v35, v16 | |
v_mov_b32_e32 v45, v16 | |
v_mov_b32_e32 v51, v16 | |
v_mul_f32_e64 v9, s18, s18 | |
v_mov_b32_e32 v47, 0 | |
v_add_i32_e32 v55, vcc, s14, v8 | |
v_lshlrev_b32_e32 v8, 3, v2 | |
v_add_i32_e32 v56, vcc, s4, v8 | |
s_mov_b32 s46, 0 | |
v_and_b32_e32 v52, 31, v0 | |
v_mov_b32_e32 v53, v47 | |
v_cmp_gt_u32_e64 s[2:3], v1, v2 | |
v_mul_f32_e32 v37, s18, v9 | |
v_add_i32_e32 v6, vcc, s5, v12 | |
v_add_i32_e32 v7, vcc, 0x720, v12 | |
v_add_i32_e32 v12, vcc, 0x820, v12 | |
s_mov_b32 s47, 0xf000 | |
s_mov_b64 s[44:45], 0 | |
s_brev_b32 s27, -2 | |
s_mov_b32 s50, 0x7ffff000 | |
s_brev_b32 s51, 1 | |
v_ashrrev_i32_e32 v58, 31, v38 | |
v_mov_b32_e32 v57, v38 | |
v_or_b32_e32 v38, 7, v41 | |
v_or_b32_e32 v59, 6, v41 | |
v_or_b32_e32 v60, 5, v41 | |
v_or_b32_e32 v61, 4, v41 | |
v_or_b32_e32 v62, 3, v41 | |
v_or_b32_e32 v63, 2, v41 | |
v_or_b32_e32 v64, 1, v41 | |
v_mov_b32_e32 v22, v15 | |
v_mov_b32_e32 v21, v14 | |
v_mov_b32_e32 v20, v13 | |
v_mov_b32_e32 v26, v15 | |
v_mov_b32_e32 v25, v14 | |
v_mov_b32_e32 v24, v13 | |
v_mov_b32_e32 v30, v15 | |
v_mov_b32_e32 v29, v14 | |
v_mov_b32_e32 v28, v13 | |
v_mov_b32_e32 v34, v15 | |
v_mov_b32_e32 v33, v14 | |
v_mov_b32_e32 v32, v13 | |
v_mov_b32_e32 v44, v15 | |
v_mov_b32_e32 v43, v14 | |
v_mov_b32_e32 v42, v13 | |
v_mov_b32_e32 v50, v15 | |
v_mov_b32_e32 v49, v14 | |
v_mov_b32_e32 v48, v13 | |
v_mov_b32_e32 v8, v13 | |
; implicit-def: %VGPR65_VGPR66_VGPR67_VGPR68 | |
s_waitcnt lgkmcnt(0) | |
BB6_11: ; =>This Loop Header: Depth=1 | |
; Child Loop BB6_51 Depth 2 | |
; Child Loop BB6_91 Depth 2 | |
; Child Loop BB6_131 Depth 2 | |
; Child Loop BB6_171 Depth 2 | |
v_lshl_b64 v[65:66], v[57:58], 5 | |
v_add_i32_e32 v69, vcc, s40, v65 | |
v_mov_b32_e32 v65, s41 | |
v_addc_u32_e32 v66, vcc, v66, v65, vcc | |
v_lshl_b64 v[70:71], v[46:47], 3 | |
v_add_i32_e32 v69, vcc, v69, v70 | |
v_addc_u32_e32 v70, vcc, v66, v71, vcc | |
buffer_load_dwordx2 v[69:70], v[69:70], s[44:47], 0 addr64 offset:16 | |
s_waitcnt vmcnt(0) | |
v_cmp_ne_u32_e32 vcc, 0, v69 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[52:53], exec, s[4:5] | |
; mask branch BB6_175 | |
s_cbranch_execz BB6_175 | |
BB6_12: ; in Loop: Header=BB6_11 Depth=1 | |
v_ashrrev_i32_e32 v71, 31, v70 | |
v_lshl_b64 v[65:66], v[70:71], 7 | |
v_add_i32_e32 v70, vcc, s42, v65 | |
v_mov_b32_e32 v65, s43 | |
v_addc_u32_e32 v66, vcc, v66, v65, vcc | |
v_lshl_b64 v[71:72], v[52:53], 2 | |
v_add_i32_e32 v70, vcc, v70, v71 | |
v_addc_u32_e32 v71, vcc, v66, v72, vcc | |
buffer_load_dword v65, v[70:71], s[44:47], 0 addr64 | |
s_and_saveexec_b64 s[4:5], s[48:49] | |
s_xor_b64 s[4:5], exec, s[4:5] | |
s_waitcnt vmcnt(0) | |
; mask branch BB6_14 | |
s_cbranch_execz BB6_14 | |
BB6_13: ; in Loop: Header=BB6_11 Depth=1 | |
v_lshl_b64 v[66:67], v[57:58], 5 | |
v_add_i32_e32 v70, vcc, s40, v66 | |
v_mov_b32_e32 v66, s41 | |
v_addc_u32_e32 v67, vcc, v67, v66, vcc | |
v_lshl_b64 v[71:72], v[2:3], 2 | |
v_add_i32_e32 v70, vcc, v70, v71 | |
v_addc_u32_e32 v71, vcc, v67, v72, vcc | |
buffer_load_dword v66, v[70:71], s[44:47], 0 addr64 | |
s_mov_b32 m0, -1 | |
s_waitcnt vmcnt(0) | |
ds_write_b32 v10, v66 | |
s_waitcnt lgkmcnt(0) | |
BB6_14: ; %.preheader.preheader | |
; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[4:5] | |
v_and_b32_e32 v66, 0xff, v69 | |
v_cmp_ne_u32_e32 vcc, 0, v66 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[54:55], exec, s[4:5] | |
; mask branch BB6_54 | |
s_cbranch_execz BB6_54 | |
BB6_15: ; in Loop: Header=BB6_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v77, v54 | |
s_mov_b64 s[34:35], s[46:47] | |
s_mov_b64 s[38:39], s[46:47] | |
v_mov_b32_e32 v76, 0 | |
v_mov_b32_e32 v81, v76 | |
s_waitcnt lgkmcnt(0) | |
v_lshlrev_b32_e32 v66, 3, v77 | |
v_add_i32_e32 v66, vcc, v66, v1 | |
v_ashrrev_i32_e32 v67, 31, v66 | |
v_lshl_b64 v[70:71], v[66:67], 4 | |
v_lshl_b64 v[78:79], v[66:67], 3 | |
buffer_load_dwordx4 v[72:75], v[70:71], s[32:35], 0 addr64 | |
buffer_load_dwordx2 v[70:71], v[78:79], s[36:39], 0 addr64 | |
v_and_b32_e32 v67, 1, v69 | |
v_cmp_eq_u32_e32 vcc, 1, v67 | |
v_mov_b32_e32 v78, v76 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[30:31], exec, s[4:5] | |
; implicit-def: %VGPR82_VGPR83_VGPR84_VGPR85 | |
s_waitcnt vmcnt(0) | |
; mask branch BB6_19 | |
s_cbranch_execz BB6_19 | |
BB6_16: ; in Loop: Header=BB6_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[86:89], v55 offset1:1 | |
s_and_b64 s[4:5], exec, s[0:1] | |
v_cmp_ne_u32_e32 vcc, v41, v77 | |
s_xor_b64 s[4:5], s[4:5], -1 | |
s_or_b64 s[4:5], s[4:5], vcc | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v80, v73, v87 | |
s_and_b64 s[8:9], exec, s[2:3] | |
s_or_b64 s[4:5], s[8:9], s[4:5] | |
v_subrev_f32_e32 v82, v72, v86 | |
v_mul_f32_e32 v83, v80, v80 | |
v_cndmask_b32_e64 v76, 0, 1.0, s[4:5] | |
v_subrev_f32_e32 v79, v74, v88 | |
v_mac_f32_e32 v83, v82, v82 | |
v_mac_f32_e32 v83, v79, v79 | |
v_mul_f32_e32 v76, s22, v76 | |
v_cmp_lt_f32_e32 vcc, v83, v76 | |
v_mov_b32_e32 v76, 0 | |
v_subrev_f32_e32 v67, v75, v89 | |
v_mov_b32_e32 v78, v76 | |
v_mov_b32_e32 v81, v76 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[34:35], exec, s[4:5] | |
; implicit-def: %VGPR82_VGPR83_VGPR84_VGPR85 | |
; mask branch BB6_18 | |
s_cbranch_execz BB6_18 | |
BB6_17: ; in Loop: Header=BB6_11 Depth=1 | |
v_max_f32_e32 v78, 0x34cd15ae, v83 | |
v_mul_f32_e32 v81, v9, v78 | |
v_mul_f32_e32 v83, v81, v81 | |
v_mov_b32_e32 v84, 0x3a92b707 | |
v_mov_b32_e32 v85, 0x3c739487 | |
v_madak_f32_e32 v84, v84, v83, 0x3ded3cb2 | |
v_mad_f32 v86, v84, v83, 1.0 | |
v_mov_b32_e32 v84, 0xb2951928 | |
v_madak_f32_e32 v85, v85, v83, 0x3f01e2bc | |
v_mac_f32_e32 v86, v81, v85 | |
v_madak_f32_e32 v84, v84, v83, 0xb85ffb93 | |
v_mov_b32_e32 v85, 0x35c55945 | |
v_madak_f32_e32 v85, v85, v83, 0x3a83ca0c | |
v_madak_f32_e32 v84, v84, v83, 0xbc9ded90 | |
s_mov_b32 m0, -1 | |
v_madak_f32_e32 v85, v85, v83, 0x3d8eaf3b | |
v_madak_f32_e32 v87, v84, v83, 0xbf409397 | |
ds_read_b64 v[83:84], v56 | |
v_mac_f32_e32 v87, v81, v85 | |
v_and_b32_e32 v81, 1, v65 | |
v_cmp_eq_u32_e32 vcc, 1, v81 | |
v_mov_b32_e32 v88, 0x3fa00000 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v81, v70, v83 | |
v_mul_f32_e32 v83, v71, v84 | |
v_rsq_f32_e32 v84, v78 | |
v_mul_f32_e32 v78, s18, v78 | |
v_mul_f32_e32 v76, v75, v89 | |
v_mov_b32_e32 v91, 0xbd777f97 | |
v_mul_f32_e32 v78, v84, v78 | |
v_and_b32_e32 v85, s27, v78 | |
v_cmp_gt_f32_e64 s[4:5], v88, v85 | |
v_mul_f32_e32 v88, v85, v85 | |
v_rcp_f32_e32 v89, v88 | |
v_add_f32_e32 v90, -1.0, v85 | |
v_and_b32_e32 v97, s50, v78 | |
v_mov_b32_e32 v98, 0xbf100000 | |
v_cndmask_b32_e64 v89, v89, v90, s[4:5] | |
v_mov_b32_e32 v90, 0x3f580000 | |
v_cmp_gt_f32_e64 s[8:9], v90, v85 | |
v_cndmask_b32_e64 v88, v89, v88, s[8:9] | |
v_mov_b32_e32 v90, 0xc11d077e | |
v_mov_b32_e32 v89, 0x4036db6e | |
v_madak_f32_e32 v90, v90, v88, 0xc2a2932b | |
v_cmp_gt_f32_e64 s[10:11], v89, v85 | |
v_mov_b32_e32 v89, 0xc3f1c275 | |
v_madak_f32_e32 v89, v89, v88, 0xc480230b | |
v_madak_f32_e32 v90, v88, v90, 0xc3389ae7 | |
v_madak_f32_e32 v89, v88, v89, 0xc41f6441 | |
v_madak_f32_e32 v90, v88, v90, 0xc322658c | |
v_madak_f32_e32 v89, v88, v89, 0xc320a2ea | |
v_madak_f32_e32 v90, v88, v90, 0xc2798057 | |
v_madak_f32_e32 v89, v88, v89, 0xc18e104b | |
v_madak_f32_e32 v90, v88, v90, 0xc128f022 | |
v_madak_f32_e32 v89, v88, v89, 0xbf4c9dd4 | |
v_madak_f32_e32 v90, v88, v90, 0xbf31a0b7 | |
v_madak_f32_e32 v89, v88, v89, 0xbc21a092 | |
v_madak_f32_e32 v90, v88, v90, 0xbc21a093 | |
v_madak_f32_e32 v91, v91, v88, 0x40d23f7c | |
v_cndmask_b32_e64 v89, v89, v90, s[10:11] | |
v_mov_b32_e32 v90, 0xc1b38712 | |
v_madak_f32_e32 v90, v90, v88, 0x43ed43a7 | |
v_madak_f32_e32 v91, v88, v91, 0x42d9451f | |
v_madak_f32_e32 v90, v88, v90, 0x451f90ce | |
v_madak_f32_e32 v91, v88, v91, 0x43d6810b | |
v_madak_f32_e32 v90, v88, v90, 0x4547fdbb | |
v_madak_f32_e32 v91, v88, v91, 0x442158c9 | |
v_madak_f32_e32 v90, v88, v90, 0x44c01759 | |
v_madak_f32_e32 v91, v88, v91, 0x43d9486f | |
v_madak_f32_e32 v90, v88, v90, 0x43a2e571 | |
v_madak_f32_e32 v91, v88, v91, 0x4309a863 | |
v_madak_f32_e32 v90, v88, v90, 0x41f2b459 | |
v_madak_f32_e32 v91, v88, v91, 0x419d35ce | |
v_cndmask_b32_e64 v90, v90, v91, s[10:11] | |
v_mov_b32_e32 v91, 0xbb0df9c0 | |
v_madak_f32_e32 v91, v91, v88, 0x3d1151b3 | |
v_madak_f32_e32 v91, v88, v91, 0xbde31cc2 | |
v_madak_f32_e32 v91, v88, v91, 0x3ea2fe54 | |
v_madak_f32_e32 v91, v88, v91, 0xbebe9208 | |
v_madak_f32_e32 v91, v88, v91, 0x3ed46805 | |
v_madak_f32_e32 v91, v88, v91, 0xbb1acdc6 | |
v_cndmask_b32_e64 v89, v89, v91, s[4:5] | |
v_mov_b32_e32 v91, 0x3c445aa3 | |
v_madak_f32_e32 v91, v91, v88, 0x3c5f6e13 | |
v_madak_f32_e32 v91, v88, v91, 0x3e013307 | |
v_madak_f32_e32 v91, v88, v91, 0x3d931ae7 | |
v_madak_f32_e32 v91, v88, v91, 0x3f0a5785 | |
v_madak_f32_e32 v91, v88, v91, 0x3dd9f331 | |
v_cndmask_b32_e64 v90, v90, v91, s[4:5] | |
v_mov_b32_e32 v91, 0xb7c756b1 | |
v_madak_f32_e32 v91, v91, v88, 0xbbbd1489 | |
v_madak_f32_e32 v91, v88, v91, 0xbce9528f | |
v_madak_f32_e32 v91, v88, v91, 0xbea66beb | |
v_madak_f32_e32 v91, v88, v91, 0x3e0375d4 | |
v_cndmask_b32_e64 v89, v89, v91, s[8:9] | |
v_mov_b32_e32 v91, 0xb684e21a | |
v_madak_f32_e32 v91, v91, v88, 0x390aee49 | |
v_madak_f32_e32 v91, v88, v91, 0x3ba68116 | |
v_madak_f32_e32 v91, v88, v91, 0x3d852a63 | |
v_madak_f32_e32 v91, v88, v91, 0x3ecbbbce | |
v_cndmask_b32_e64 v90, v90, v91, s[8:9] | |
v_mad_f32 v88, v88, v90, 1.0 | |
v_mov_b32_e32 v90, 0x6f800000 | |
v_cmp_gt_f32_e64 s[10:11], |v88|, v90 | |
v_mov_b32_e32 v91, 0x2f800000 | |
v_cndmask_b32_e64 v92, 1.0, v91, s[10:11] | |
v_mul_f32_e32 v88, v92, v88 | |
v_rcp_f32_e32 v88, v88 | |
v_mad_f32 v98, v97, -v97, v98 | |
v_mov_b32_e32 v100, 0x3fb8aa3b | |
v_mov_b32_e32 v103, 0xbf317180 | |
v_mul_f32_e32 v88, v88, v89 | |
v_cndmask_b32_e64 v89, 0, 1.0, vcc | |
v_cmp_gt_f32_e32 vcc, 0, v98 | |
v_cndmask_b32_e64 v99, 0.5, -0.5, vcc | |
v_mac_f32_e32 v99, v100, v98 | |
v_cvt_i32_f32_e32 v99, v99 | |
v_mov_b32_e32 v105, 0xb717f7d1 | |
v_subrev_f32_e32 v114, v85, v97 | |
v_mul_f32_e32 v102, v88, v92 | |
v_cvt_f32_i32_e32 v101, v99 | |
v_add_f32_e32 v97, v85, v97 | |
v_mad_f32 v97, v97, v114, v102 | |
v_mov_b32_e32 v108, 0xb5ddea0e | |
v_mad_f32 v104, v103, v101, v98 | |
v_mad_f32 v106, v105, v101, v104 | |
v_mul_f32_e32 v107, v106, v106 | |
v_mov_b32_e32 v109, 0x3331bb4c | |
v_cmp_gt_f32_e64 s[10:11], 0, v97 | |
v_mad_f32 v110, v109, v107, v108 | |
v_mov_b32_e32 v111, 0x388ab355 | |
v_cndmask_b32_e64 v114, 0.5, -0.5, s[10:11] | |
v_mad_f32 v110, v110, v107, v111 | |
v_mov_b32_e32 v112, 0xbb360b61 | |
v_mac_f32_e32 v114, v100, v97 | |
v_mad_f32 v110, v110, v107, v112 | |
v_mov_b32_e32 v113, 0x3e2aaaab | |
v_mad_f32 v110, v110, v107, v113 | |
v_cvt_i32_f32_e32 v100, v114 | |
v_mad_f32 v107, -v107, v110, v106 | |
v_sub_f32_e32 v110, 2.0, v107 | |
v_cmp_gt_f32_e64 vcc, |v110|, v90 | |
v_cvt_f32_i32_e32 v115, v100 | |
v_cndmask_b32_e32 v114, 1.0, v91, vcc | |
v_mul_f32_e64 v110, v110, -v114 | |
v_rcp_f32_e32 v110, v110 | |
v_mad_f32 v103, v103, v115, v97 | |
v_mul_f32_e32 v106, v107, v106 | |
v_mad_f32 v107, v105, v115, v103 | |
v_mul_f32_e32 v106, v110, v106 | |
v_mul_f32_e32 v110, v107, v107 | |
v_mac_f32_e32 v108, v109, v110 | |
v_mac_f32_e32 v111, v108, v110 | |
v_mac_f32_e32 v112, v111, v110 | |
v_mac_f32_e32 v113, v112, v110 | |
v_rcp_f32_e32 v86, v86 | |
v_mul_f32_e32 v93, v84, v84 | |
v_mad_f32 v108, -v110, v113, v107 | |
v_mul_f32_e32 v94, v93, v93 | |
v_sub_f32_e32 v109, 2.0, v108 | |
v_mul_f32_e32 v94, v89, v94 | |
v_cmp_gt_f32_e64 vcc, |v109|, v90 | |
v_mul_f32_e32 v95, v93, v94 | |
v_mad_f32 v94, v94, v93, s23 | |
v_cndmask_b32_e32 v110, 1.0, v91, vcc | |
v_mad_f32 v96, v95, v95, s26 | |
v_mul_f32_e32 v94, 0xbe2aaaab, v94 | |
v_mul_f32_e64 v109, v109, -v110 | |
v_mul_f32_e32 v86, v37, v86 | |
v_mul_f32_e32 v106, v106, v114 | |
v_mad_f32 v101, -v101, v105, v106 | |
v_mul_f32_e32 v86, v87, v86 | |
v_mul_f32_e32 v87, v89, v93 | |
v_mul_f32_e32 v96, v96, v83 | |
v_mul_f32_e32 v94, v81, v94 | |
v_rcp_f32_e32 v109, v109 | |
v_mac_f32_e32 v94, 0x3daaaaaa, v96 | |
v_mac_f32_e32 v86, v84, v87 | |
v_subrev_f32_e32 v87, v104, v101 | |
v_mac_f32_e32 v8, v89, v94 | |
v_lshlrev_b32_e32 v94, 23, v99 | |
v_sub_f32_e32 v87, 1.0, v87 | |
v_mul_f32_e32 v106, v108, v107 | |
v_add_i32_e32 v87, vcc, v87, v94 | |
v_mov_b32_e32 v94, 0xc2aeac4f | |
v_mul_f32_e32 v106, v109, v106 | |
v_cmp_nlt_f32_e32 vcc, v98, v94 | |
v_mov_b32_e32 v96, 0x42b17218 | |
v_mul_f32_e32 v106, v106, v110 | |
v_cndmask_b32_e32 v87, 0, v87, vcc | |
v_cmp_lt_f32_e32 vcc, v98, v96 | |
v_mov_b32_e32 v99, 0x7f800000 | |
v_cndmask_b32_e32 v87, v99, v87, vcc | |
v_cmp_u_f32_e32 vcc, v98, v98 | |
v_mad_f32 v105, -v115, v105, v106 | |
v_cndmask_b32_e32 v87, v87, v98, vcc | |
v_subrev_f32_e32 v98, v103, v105 | |
v_sub_f32_e32 v98, 1.0, v98 | |
v_lshlrev_b32_e32 v100, 23, v100 | |
v_add_i32_e32 v98, vcc, v98, v100 | |
v_cmp_nlt_f32_e32 vcc, v97, v94 | |
v_cndmask_b32_e32 v94, 0, v98, vcc | |
v_cmp_lt_f32_e32 vcc, v97, v96 | |
v_cndmask_b32_e32 v94, v99, v94, vcc | |
v_cmp_u_f32_e32 vcc, v97, v97 | |
v_cndmask_b32_e32 v94, v94, v97, vcc | |
v_cmp_gt_f32_e64 vcc, |v85|, v90 | |
v_mov_b32_e32 v90, 0x40c00000 | |
v_cmp_gt_f32_e64 s[10:11], v90, v85 | |
v_mov_b32_e32 v90, 0x31800000 | |
v_cmp_gt_f32_e64 s[12:13], v90, v85 | |
v_cndmask_b32_e32 v90, 1.0, v91, vcc | |
v_mul_f32_e32 v85, v90, v85 | |
v_rcp_f32_e32 v85, v85 | |
v_mul_f32_e32 v87, v94, v87 | |
v_cmp_u_f32_e32 vcc, v78, v78 | |
v_mac_f32_e32 v51, v0, v67 | |
v_mul_f32_e32 v85, v85, v87 | |
v_mad_f32 v85, -v90, v85, 1.0 | |
v_madak_f32_e32 v87, v92, v88, 0x3f58560b | |
v_cndmask_b32_e64 v85, 1.0, v85, s[10:11] | |
v_cndmask_b32_e64 v85, v85, v87, s[4:5] | |
v_and_b32_e32 v87, s51, v78 | |
v_or_b32_e32 v85, v87, v85 | |
v_mad_f32 v87, v102, v78, v78 | |
v_cndmask_b32_e64 v85, v85, v87, s[8:9] | |
v_mul_f32_e32 v87, 0x3f8375d4, v78 | |
v_mac_f32_e32 v87, 0x41000000, v78 | |
v_mul_f32_e32 v87, 0x3e000000, v87 | |
v_cndmask_b32_e64 v85, v85, v87, s[12:13] | |
v_cndmask_b32_e32 v78, v85, v78, vcc | |
v_subrev_f32_e32 v78, v78, v89 | |
v_mul_f32_e64 v85, s19, -v89 | |
v_mac_f32_e32 v85, v78, v84 | |
v_mad_f32 v78, v83, v95, -v81 | |
v_mul_f32_e32 v81, v93, v95 | |
v_mul_f32_e32 v83, v78, v81 | |
v_mac_f32_e32 v83, v86, v76 | |
v_mac_f32_e32 v5, v85, v76 | |
v_mad_f32 v50, v79, v83, v50 | |
v_mad_f32 v49, v80, v83, v49 | |
v_mad_f32 v48, v82, v83, v48 | |
v_mul_f32_e64 v81, v83, -v79 | |
v_mul_f32_e64 v78, v83, -v80 | |
v_mul_f32_e64 v76, v83, -v82 | |
v_mul_f32_e64 v85, v67, -v0 | |
BB6_18: ; %Flow1253 | |
; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[34:35] | |
BB6_19: ; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[30:31] | |
v_lshrrev_b32_e32 v67, 1, v69 | |
v_and_b32_e32 v67, 1, v67 | |
v_cmp_eq_u32_e32 vcc, 1, v67 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[12:13], exec, s[4:5] | |
; mask branch BB6_23 | |
s_cbranch_execz BB6_23 | |
BB6_20: ; in Loop: Header=BB6_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[86:89], v55 offset0:16 offset1:17 | |
s_and_b64 s[4:5], exec, s[0:1] | |
v_cmp_ne_u32_e32 vcc, v64, v77 | |
s_xor_b64 s[4:5], s[4:5], -1 | |
s_or_b64 s[4:5], s[4:5], vcc | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v80, v73, v87 | |
s_and_b64 s[8:9], exec, s[2:3] | |
s_or_b64 s[4:5], s[8:9], s[4:5] | |
v_subrev_f32_e32 v79, v72, v86 | |
v_mul_f32_e32 v83, v80, v80 | |
v_cndmask_b32_e64 v84, 0, 1.0, s[4:5] | |
v_subrev_f32_e32 v82, v74, v88 | |
v_mac_f32_e32 v83, v79, v79 | |
v_mac_f32_e32 v83, v82, v82 | |
v_mul_f32_e32 v84, s22, v84 | |
v_subrev_f32_e32 v67, v75, v89 | |
v_cmp_lt_f32_e32 vcc, v83, v84 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[30:31], exec, s[4:5] | |
; mask branch BB6_22 | |
s_cbranch_execz BB6_22 | |
BB6_21: ; in Loop: Header=BB6_11 Depth=1 | |
v_max_f32_e32 v86, 0x34cd15ae, v83 | |
v_mul_f32_e32 v87, v75, v89 | |
v_rsq_f32_e32 v89, v86 | |
v_lshrrev_b32_e32 v83, 1, v65 | |
v_and_b32_e32 v83, 1, v83 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e32 vcc, 1, v83 | |
ds_read_b64 v[83:84], v56 offset:64 | |
v_mul_f32_e32 v93, v89, v89 | |
v_cndmask_b32_e64 v88, 0, 1.0, vcc | |
v_mul_f32_e32 v94, v93, v93 | |
v_mul_f32_e32 v94, v88, v94 | |
v_mul_f32_e32 v95, v93, v94 | |
v_mad_f32 v94, v94, v93, s23 | |
v_mul_f32_e32 v90, v9, v86 | |
v_mul_f32_e32 v91, v90, v90 | |
v_mov_b32_e32 v92, 0x3a92b707 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v83, v70, v83 | |
v_mul_f32_e32 v84, v71, v84 | |
v_mul_f32_e32 v94, 0xbe2aaaab, v94 | |
v_madak_f32_e32 v92, v92, v91, 0x3ded3cb2 | |
v_mad_f32 v96, v84, v95, -v83 | |
v_mul_f32_e32 v97, v93, v95 | |
v_mad_f32 v95, v95, v95, s26 | |
v_mul_f32_e32 v83, v83, v94 | |
v_mov_b32_e32 v94, 0x3c739487 | |
v_mul_f32_e32 v84, v95, v84 | |
v_madak_f32_e32 v94, v94, v91, 0x3f01e2bc | |
v_mad_f32 v92, v92, v91, 1.0 | |
v_mac_f32_e32 v83, 0x3daaaaaa, v84 | |
v_mac_f32_e32 v92, v90, v94 | |
v_mov_b32_e32 v94, 0xb2951928 | |
v_mac_f32_e32 v8, v88, v83 | |
v_rcp_f32_e32 v83, v92 | |
v_madak_f32_e32 v94, v94, v91, 0xb85ffb93 | |
v_mov_b32_e32 v95, 0x35c55945 | |
v_madak_f32_e32 v95, v95, v91, 0x3a83ca0c | |
v_madak_f32_e32 v94, v94, v91, 0xbc9ded90 | |
v_madak_f32_e32 v95, v95, v91, 0x3d8eaf3b | |
v_madak_f32_e32 v91, v94, v91, 0xbf409397 | |
v_mac_f32_e32 v91, v90, v95 | |
v_mul_f32_e32 v83, v37, v83 | |
v_mul_f32_e32 v84, v88, v93 | |
v_mul_f32_e32 v83, v91, v83 | |
v_mul_f32_e32 v96, v96, v97 | |
v_mac_f32_e32 v83, v89, v84 | |
v_mac_f32_e32 v96, v83, v87 | |
v_mul_f32_e32 v83, s18, v86 | |
v_mul_f32_e32 v83, v89, v83 | |
v_and_b32_e32 v84, s27, v83 | |
v_mov_b32_e32 v86, 0x3fa00000 | |
v_cmp_gt_f32_e64 s[4:5], v86, v84 | |
v_mul_f32_e32 v86, v84, v84 | |
v_rcp_f32_e32 v90, v86 | |
v_add_f32_e32 v91, -1.0, v84 | |
v_mov_b32_e32 v93, 0xbd777f97 | |
v_mov_b32_e32 v94, 0x4036db6e | |
v_cndmask_b32_e64 v90, v90, v91, s[4:5] | |
v_mov_b32_e32 v91, 0x3f580000 | |
v_cmp_gt_f32_e64 s[8:9], v91, v84 | |
v_cndmask_b32_e64 v86, v90, v86, s[8:9] | |
v_mov_b32_e32 v91, 0xc1b38712 | |
v_madak_f32_e32 v93, v93, v86, 0x40d23f7c | |
v_madak_f32_e32 v91, v91, v86, 0x43ed43a7 | |
v_madak_f32_e32 v93, v86, v93, 0x42d9451f | |
v_madak_f32_e32 v91, v86, v91, 0x451f90ce | |
v_madak_f32_e32 v93, v86, v93, 0x43d6810b | |
v_madak_f32_e32 v91, v86, v91, 0x4547fdbb | |
v_madak_f32_e32 v93, v86, v93, 0x442158c9 | |
v_madak_f32_e32 v91, v86, v91, 0x44c01759 | |
v_madak_f32_e32 v93, v86, v93, 0x43d9486f | |
v_madak_f32_e32 v91, v86, v91, 0x43a2e571 | |
v_madak_f32_e32 v93, v86, v93, 0x4309a863 | |
v_cmp_gt_f32_e32 vcc, v94, v84 | |
v_madak_f32_e32 v91, v86, v91, 0x41f2b459 | |
v_madak_f32_e32 v93, v86, v93, 0x419d35ce | |
v_cndmask_b32_e32 v91, v91, v93, vcc | |
v_mov_b32_e32 v93, 0x3c445aa3 | |
v_madak_f32_e32 v93, v93, v86, 0x3c5f6e13 | |
v_madak_f32_e32 v93, v86, v93, 0x3e013307 | |
v_madak_f32_e32 v93, v86, v93, 0x3d931ae7 | |
v_madak_f32_e32 v93, v86, v93, 0x3f0a5785 | |
v_mov_b32_e32 v92, 0xc11d077e | |
v_madak_f32_e32 v93, v86, v93, 0x3dd9f331 | |
v_mov_b32_e32 v90, 0xc3f1c275 | |
v_madak_f32_e32 v92, v92, v86, 0xc2a2932b | |
v_cndmask_b32_e64 v91, v91, v93, s[4:5] | |
v_mov_b32_e32 v93, 0xb684e21a | |
v_madak_f32_e32 v90, v90, v86, 0xc480230b | |
v_madak_f32_e32 v92, v86, v92, 0xc3389ae7 | |
v_madak_f32_e32 v93, v93, v86, 0x390aee49 | |
v_madak_f32_e32 v90, v86, v90, 0xc41f6441 | |
v_madak_f32_e32 v92, v86, v92, 0xc322658c | |
v_madak_f32_e32 v93, v86, v93, 0x3ba68116 | |
v_madak_f32_e32 v90, v86, v90, 0xc320a2ea | |
v_madak_f32_e32 v92, v86, v92, 0xc2798057 | |
v_madak_f32_e32 v93, v86, v93, 0x3d852a63 | |
v_madak_f32_e32 v90, v86, v90, 0xc18e104b | |
v_madak_f32_e32 v92, v86, v92, 0xc128f022 | |
v_madak_f32_e32 v93, v86, v93, 0x3ecbbbce | |
v_madak_f32_e32 v90, v86, v90, 0xbf4c9dd4 | |
v_madak_f32_e32 v92, v86, v92, 0xbf31a0b7 | |
v_cndmask_b32_e64 v91, v91, v93, s[8:9] | |
v_and_b32_e32 v93, s50, v83 | |
v_mov_b32_e32 v94, 0xbf100000 | |
v_madak_f32_e32 v90, v86, v90, 0xbc21a092 | |
v_madak_f32_e32 v92, v86, v92, 0xbc21a093 | |
v_mad_f32 v94, v93, -v93, v94 | |
v_cndmask_b32_e32 v90, v90, v92, vcc | |
v_cmp_gt_f32_e32 vcc, 0, v94 | |
v_cndmask_b32_e64 v95, 0.5, -0.5, vcc | |
v_mov_b32_e32 v97, 0x3fb8aa3b | |
v_mac_f32_e32 v95, v97, v94 | |
v_cvt_i32_f32_e32 v95, v95 | |
v_mov_b32_e32 v92, 0xbb0df9c0 | |
v_madak_f32_e32 v92, v92, v86, 0x3d1151b3 | |
v_madak_f32_e32 v92, v86, v92, 0xbde31cc2 | |
v_cvt_f32_i32_e32 v100, v95 | |
v_madak_f32_e32 v92, v86, v92, 0x3ea2fe54 | |
v_mov_b32_e32 v101, 0xbf317180 | |
v_madak_f32_e32 v92, v86, v92, 0xbebe9208 | |
v_mad_f32 v102, v101, v100, v94 | |
v_mov_b32_e32 v103, 0xb717f7d1 | |
v_madak_f32_e32 v92, v86, v92, 0x3ed46805 | |
v_mad_f32 v104, v103, v100, v102 | |
v_madak_f32_e32 v92, v86, v92, 0xbb1acdc6 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mov_b32_e32 v106, 0xb5ddea0e | |
v_mov_b32_e32 v107, 0x3331bb4c | |
v_cndmask_b32_e64 v90, v90, v92, s[4:5] | |
v_mov_b32_e32 v92, 0xb7c756b1 | |
v_mad_f32 v108, v107, v105, v106 | |
v_mov_b32_e32 v109, 0x388ab355 | |
v_madak_f32_e32 v92, v92, v86, 0xbbbd1489 | |
v_mad_f32 v108, v108, v105, v109 | |
v_mov_b32_e32 v110, 0xbb360b61 | |
v_madak_f32_e32 v92, v86, v92, 0xbce9528f | |
v_mad_f32 v108, v108, v105, v110 | |
v_mov_b32_e32 v111, 0x3e2aaaab | |
v_madak_f32_e32 v92, v86, v92, 0xbea66beb | |
v_mad_f32 v108, v108, v105, v111 | |
v_madak_f32_e32 v92, v86, v92, 0x3e0375d4 | |
v_mad_f32 v105, -v105, v108, v104 | |
v_mad_f32 v86, v86, v91, 1.0 | |
v_mov_b32_e32 v91, 0x6f800000 | |
v_cmp_gt_f32_e64 vcc, |v86|, v91 | |
v_mov_b32_e32 v98, 0x2f800000 | |
v_sub_f32_e32 v108, 2.0, v105 | |
v_cndmask_b32_e32 v99, 1.0, v98, vcc | |
v_cmp_gt_f32_e64 vcc, |v108|, v91 | |
v_mul_f32_e32 v86, v99, v86 | |
v_cndmask_b32_e32 v112, 1.0, v98, vcc | |
v_mul_f32_e64 v108, v108, -v112 | |
v_rcp_f32_e32 v86, v86 | |
v_rcp_f32_e32 v108, v108 | |
v_cndmask_b32_e64 v90, v90, v92, s[8:9] | |
v_lshlrev_b32_e32 v92, 23, v95 | |
v_mul_f32_e32 v86, v86, v90 | |
v_mul_f32_e32 v90, v105, v104 | |
v_mul_f32_e32 v90, v108, v90 | |
v_mul_f32_e32 v90, v90, v112 | |
v_mad_f32 v90, -v100, v103, v90 | |
v_subrev_f32_e32 v90, v102, v90 | |
v_sub_f32_e32 v90, 1.0, v90 | |
v_add_i32_e32 v90, vcc, v90, v92 | |
v_mov_b32_e32 v92, 0xc2aeac4f | |
v_cmp_nlt_f32_e32 vcc, v94, v92 | |
v_mov_b32_e32 v95, 0x42b17218 | |
v_cndmask_b32_e32 v90, 0, v90, vcc | |
v_cmp_lt_f32_e32 vcc, v94, v95 | |
v_mov_b32_e32 v100, 0x7f800000 | |
v_cndmask_b32_e32 v90, v100, v90, vcc | |
v_cmp_u_f32_e32 vcc, v94, v94 | |
v_cndmask_b32_e32 v90, v90, v94, vcc | |
v_subrev_f32_e32 v94, v84, v93 | |
v_mul_f32_e32 v102, v86, v99 | |
v_add_f32_e32 v93, v84, v93 | |
v_mad_f32 v93, v93, v94, v102 | |
v_cmp_gt_f32_e32 vcc, 0, v93 | |
v_cndmask_b32_e64 v94, 0.5, -0.5, vcc | |
v_mac_f32_e32 v94, v97, v93 | |
v_cvt_i32_f32_e32 v94, v94 | |
v_madak_f32_e32 v86, v99, v86, 0x3f58560b | |
v_mad_f32 v81, -v96, v82, v81 | |
v_mad_f32 v44, v82, v96, v44 | |
v_cvt_f32_i32_e32 v97, v94 | |
v_lshlrev_b32_e32 v94, 23, v94 | |
v_mad_f32 v78, -v96, v80, v78 | |
v_mad_f32 v43, v80, v96, v43 | |
v_mad_f32 v101, v101, v97, v93 | |
v_mad_f32 v104, v103, v97, v101 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mac_f32_e32 v106, v107, v105 | |
v_mac_f32_e32 v109, v106, v105 | |
v_mac_f32_e32 v110, v109, v105 | |
v_mac_f32_e32 v111, v110, v105 | |
v_mad_f32 v105, -v105, v111, v104 | |
v_mul_f32_e32 v104, v105, v104 | |
v_sub_f32_e32 v105, 2.0, v105 | |
v_cmp_gt_f32_e64 vcc, |v105|, v91 | |
v_cndmask_b32_e32 v106, 1.0, v98, vcc | |
v_mul_f32_e64 v105, v105, -v106 | |
v_rcp_f32_e32 v105, v105 | |
v_mad_f32 v76, -v96, v79, v76 | |
v_mad_f32 v42, v79, v96, v42 | |
v_mad_f32 v85, -v67, v0, v85 | |
v_mul_f32_e32 v104, v105, v104 | |
v_mul_f32_e32 v104, v104, v106 | |
v_mad_f32 v97, -v97, v103, v104 | |
v_subrev_f32_e32 v97, v101, v97 | |
v_sub_f32_e32 v97, 1.0, v97 | |
v_add_i32_e32 v94, vcc, v97, v94 | |
v_cmp_nlt_f32_e32 vcc, v93, v92 | |
v_cndmask_b32_e32 v92, 0, v94, vcc | |
v_cmp_lt_f32_e32 vcc, v93, v95 | |
v_cndmask_b32_e32 v92, v100, v92, vcc | |
v_cmp_gt_f32_e64 vcc, |v84|, v91 | |
v_cndmask_b32_e32 v91, 1.0, v98, vcc | |
v_mul_f32_e32 v94, v91, v84 | |
v_rcp_f32_e32 v94, v94 | |
v_cmp_u_f32_e32 vcc, v93, v93 | |
v_cndmask_b32_e32 v92, v92, v93, vcc | |
v_mul_f32_e32 v90, v92, v90 | |
v_mov_b32_e32 v92, 0x40c00000 | |
v_cmp_gt_f32_e32 vcc, v92, v84 | |
v_mov_b32_e32 v92, 0x31800000 | |
v_mul_f32_e32 v90, v94, v90 | |
v_cmp_gt_f32_e64 s[10:11], v92, v84 | |
v_mad_f32 v84, -v91, v90, 1.0 | |
v_cndmask_b32_e32 v84, 1.0, v84, vcc | |
v_cndmask_b32_e64 v84, v84, v86, s[4:5] | |
v_and_b32_e32 v86, s51, v83 | |
v_or_b32_e32 v84, v86, v84 | |
v_mad_f32 v86, v102, v83, v83 | |
v_cndmask_b32_e64 v84, v84, v86, s[8:9] | |
v_mul_f32_e32 v86, 0x3f8375d4, v83 | |
v_mac_f32_e32 v86, 0x41000000, v83 | |
v_mul_f32_e32 v86, 0x3e000000, v86 | |
v_cndmask_b32_e64 v84, v84, v86, s[10:11] | |
v_cmp_u_f32_e32 vcc, v83, v83 | |
v_cndmask_b32_e32 v83, v84, v83, vcc | |
v_subrev_f32_e32 v83, v83, v88 | |
v_mul_f32_e64 v84, s19, -v88 | |
v_mac_f32_e32 v84, v83, v89 | |
v_mac_f32_e32 v5, v84, v87 | |
v_mac_f32_e32 v45, v0, v67 | |
BB6_22: ; %Flow1252 | |
; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[30:31] | |
BB6_23: ; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[12:13] | |
v_lshrrev_b32_e32 v67, 2, v69 | |
v_and_b32_e32 v67, 1, v67 | |
v_cmp_eq_u32_e32 vcc, 1, v67 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[12:13], exec, s[4:5] | |
; mask branch BB6_27 | |
s_cbranch_execz BB6_27 | |
BB6_24: ; in Loop: Header=BB6_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[86:89], v55 offset0:32 offset1:33 | |
s_and_b64 s[4:5], exec, s[0:1] | |
v_cmp_ne_u32_e32 vcc, v63, v77 | |
s_xor_b64 s[4:5], s[4:5], -1 | |
s_or_b64 s[4:5], s[4:5], vcc | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v80, v73, v87 | |
s_and_b64 s[8:9], exec, s[2:3] | |
s_or_b64 s[4:5], s[8:9], s[4:5] | |
v_subrev_f32_e32 v79, v72, v86 | |
v_mul_f32_e32 v83, v80, v80 | |
v_cndmask_b32_e64 v84, 0, 1.0, s[4:5] | |
v_subrev_f32_e32 v82, v74, v88 | |
v_mac_f32_e32 v83, v79, v79 | |
v_mac_f32_e32 v83, v82, v82 | |
v_mul_f32_e32 v84, s22, v84 | |
v_subrev_f32_e32 v67, v75, v89 | |
v_cmp_lt_f32_e32 vcc, v83, v84 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[30:31], exec, s[4:5] | |
; mask branch BB6_26 | |
s_cbranch_execz BB6_26 | |
BB6_25: ; in Loop: Header=BB6_11 Depth=1 | |
v_max_f32_e32 v86, 0x34cd15ae, v83 | |
v_mul_f32_e32 v87, v75, v89 | |
v_rsq_f32_e32 v89, v86 | |
v_lshrrev_b32_e32 v83, 2, v65 | |
v_and_b32_e32 v83, 1, v83 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e32 vcc, 1, v83 | |
ds_read_b64 v[83:84], v56 offset:128 | |
v_mul_f32_e32 v93, v89, v89 | |
v_cndmask_b32_e64 v88, 0, 1.0, vcc | |
v_mul_f32_e32 v94, v93, v93 | |
v_mul_f32_e32 v94, v88, v94 | |
v_mul_f32_e32 v95, v93, v94 | |
v_mad_f32 v94, v94, v93, s23 | |
v_mul_f32_e32 v90, v9, v86 | |
v_mul_f32_e32 v91, v90, v90 | |
v_mov_b32_e32 v92, 0x3a92b707 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v83, v70, v83 | |
v_mul_f32_e32 v84, v71, v84 | |
v_mul_f32_e32 v94, 0xbe2aaaab, v94 | |
v_madak_f32_e32 v92, v92, v91, 0x3ded3cb2 | |
v_mad_f32 v96, v84, v95, -v83 | |
v_mul_f32_e32 v97, v93, v95 | |
v_mad_f32 v95, v95, v95, s26 | |
v_mul_f32_e32 v83, v83, v94 | |
v_mov_b32_e32 v94, 0x3c739487 | |
v_mul_f32_e32 v84, v95, v84 | |
v_madak_f32_e32 v94, v94, v91, 0x3f01e2bc | |
v_mad_f32 v92, v92, v91, 1.0 | |
v_mac_f32_e32 v83, 0x3daaaaaa, v84 | |
v_mac_f32_e32 v92, v90, v94 | |
v_mov_b32_e32 v94, 0xb2951928 | |
v_mac_f32_e32 v8, v88, v83 | |
v_rcp_f32_e32 v83, v92 | |
v_madak_f32_e32 v94, v94, v91, 0xb85ffb93 | |
v_mov_b32_e32 v95, 0x35c55945 | |
v_madak_f32_e32 v95, v95, v91, 0x3a83ca0c | |
v_madak_f32_e32 v94, v94, v91, 0xbc9ded90 | |
v_madak_f32_e32 v95, v95, v91, 0x3d8eaf3b | |
v_madak_f32_e32 v91, v94, v91, 0xbf409397 | |
v_mac_f32_e32 v91, v90, v95 | |
v_mul_f32_e32 v83, v37, v83 | |
v_mul_f32_e32 v84, v88, v93 | |
v_mul_f32_e32 v83, v91, v83 | |
v_mul_f32_e32 v96, v96, v97 | |
v_mac_f32_e32 v83, v89, v84 | |
v_mac_f32_e32 v96, v83, v87 | |
v_mul_f32_e32 v83, s18, v86 | |
v_mul_f32_e32 v83, v89, v83 | |
v_and_b32_e32 v84, s27, v83 | |
v_mov_b32_e32 v86, 0x3fa00000 | |
v_cmp_gt_f32_e64 s[4:5], v86, v84 | |
v_mul_f32_e32 v86, v84, v84 | |
v_rcp_f32_e32 v90, v86 | |
v_add_f32_e32 v91, -1.0, v84 | |
v_mov_b32_e32 v93, 0xbd777f97 | |
v_mov_b32_e32 v94, 0x4036db6e | |
v_cndmask_b32_e64 v90, v90, v91, s[4:5] | |
v_mov_b32_e32 v91, 0x3f580000 | |
v_cmp_gt_f32_e64 s[8:9], v91, v84 | |
v_cndmask_b32_e64 v86, v90, v86, s[8:9] | |
v_mov_b32_e32 v91, 0xc1b38712 | |
v_madak_f32_e32 v93, v93, v86, 0x40d23f7c | |
v_madak_f32_e32 v91, v91, v86, 0x43ed43a7 | |
v_madak_f32_e32 v93, v86, v93, 0x42d9451f | |
v_madak_f32_e32 v91, v86, v91, 0x451f90ce | |
v_madak_f32_e32 v93, v86, v93, 0x43d6810b | |
v_madak_f32_e32 v91, v86, v91, 0x4547fdbb | |
v_madak_f32_e32 v93, v86, v93, 0x442158c9 | |
v_madak_f32_e32 v91, v86, v91, 0x44c01759 | |
v_madak_f32_e32 v93, v86, v93, 0x43d9486f | |
v_madak_f32_e32 v91, v86, v91, 0x43a2e571 | |
v_madak_f32_e32 v93, v86, v93, 0x4309a863 | |
v_cmp_gt_f32_e32 vcc, v94, v84 | |
v_madak_f32_e32 v91, v86, v91, 0x41f2b459 | |
v_madak_f32_e32 v93, v86, v93, 0x419d35ce | |
v_cndmask_b32_e32 v91, v91, v93, vcc | |
v_mov_b32_e32 v93, 0x3c445aa3 | |
v_madak_f32_e32 v93, v93, v86, 0x3c5f6e13 | |
v_madak_f32_e32 v93, v86, v93, 0x3e013307 | |
v_madak_f32_e32 v93, v86, v93, 0x3d931ae7 | |
v_madak_f32_e32 v93, v86, v93, 0x3f0a5785 | |
v_mov_b32_e32 v92, 0xc11d077e | |
v_madak_f32_e32 v93, v86, v93, 0x3dd9f331 | |
v_mov_b32_e32 v90, 0xc3f1c275 | |
v_madak_f32_e32 v92, v92, v86, 0xc2a2932b | |
v_cndmask_b32_e64 v91, v91, v93, s[4:5] | |
v_mov_b32_e32 v93, 0xb684e21a | |
v_madak_f32_e32 v90, v90, v86, 0xc480230b | |
v_madak_f32_e32 v92, v86, v92, 0xc3389ae7 | |
v_madak_f32_e32 v93, v93, v86, 0x390aee49 | |
v_madak_f32_e32 v90, v86, v90, 0xc41f6441 | |
v_madak_f32_e32 v92, v86, v92, 0xc322658c | |
v_madak_f32_e32 v93, v86, v93, 0x3ba68116 | |
v_madak_f32_e32 v90, v86, v90, 0xc320a2ea | |
v_madak_f32_e32 v92, v86, v92, 0xc2798057 | |
v_madak_f32_e32 v93, v86, v93, 0x3d852a63 | |
v_madak_f32_e32 v90, v86, v90, 0xc18e104b | |
v_madak_f32_e32 v92, v86, v92, 0xc128f022 | |
v_madak_f32_e32 v93, v86, v93, 0x3ecbbbce | |
v_madak_f32_e32 v90, v86, v90, 0xbf4c9dd4 | |
v_madak_f32_e32 v92, v86, v92, 0xbf31a0b7 | |
v_cndmask_b32_e64 v91, v91, v93, s[8:9] | |
v_and_b32_e32 v93, s50, v83 | |
v_mov_b32_e32 v94, 0xbf100000 | |
v_madak_f32_e32 v90, v86, v90, 0xbc21a092 | |
v_madak_f32_e32 v92, v86, v92, 0xbc21a093 | |
v_mad_f32 v94, v93, -v93, v94 | |
v_cndmask_b32_e32 v90, v90, v92, vcc | |
v_cmp_gt_f32_e32 vcc, 0, v94 | |
v_cndmask_b32_e64 v95, 0.5, -0.5, vcc | |
v_mov_b32_e32 v97, 0x3fb8aa3b | |
v_mac_f32_e32 v95, v97, v94 | |
v_cvt_i32_f32_e32 v95, v95 | |
v_mov_b32_e32 v92, 0xbb0df9c0 | |
v_madak_f32_e32 v92, v92, v86, 0x3d1151b3 | |
v_madak_f32_e32 v92, v86, v92, 0xbde31cc2 | |
v_cvt_f32_i32_e32 v100, v95 | |
v_madak_f32_e32 v92, v86, v92, 0x3ea2fe54 | |
v_mov_b32_e32 v101, 0xbf317180 | |
v_madak_f32_e32 v92, v86, v92, 0xbebe9208 | |
v_mad_f32 v102, v101, v100, v94 | |
v_mov_b32_e32 v103, 0xb717f7d1 | |
v_madak_f32_e32 v92, v86, v92, 0x3ed46805 | |
v_mad_f32 v104, v103, v100, v102 | |
v_madak_f32_e32 v92, v86, v92, 0xbb1acdc6 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mov_b32_e32 v106, 0xb5ddea0e | |
v_mov_b32_e32 v107, 0x3331bb4c | |
v_cndmask_b32_e64 v90, v90, v92, s[4:5] | |
v_mov_b32_e32 v92, 0xb7c756b1 | |
v_mad_f32 v108, v107, v105, v106 | |
v_mov_b32_e32 v109, 0x388ab355 | |
v_madak_f32_e32 v92, v92, v86, 0xbbbd1489 | |
v_mad_f32 v108, v108, v105, v109 | |
v_mov_b32_e32 v110, 0xbb360b61 | |
v_madak_f32_e32 v92, v86, v92, 0xbce9528f | |
v_mad_f32 v108, v108, v105, v110 | |
v_mov_b32_e32 v111, 0x3e2aaaab | |
v_madak_f32_e32 v92, v86, v92, 0xbea66beb | |
v_mad_f32 v108, v108, v105, v111 | |
v_madak_f32_e32 v92, v86, v92, 0x3e0375d4 | |
v_mad_f32 v105, -v105, v108, v104 | |
v_mad_f32 v86, v86, v91, 1.0 | |
v_mov_b32_e32 v91, 0x6f800000 | |
v_cmp_gt_f32_e64 vcc, |v86|, v91 | |
v_mov_b32_e32 v98, 0x2f800000 | |
v_sub_f32_e32 v108, 2.0, v105 | |
v_cndmask_b32_e32 v99, 1.0, v98, vcc | |
v_cmp_gt_f32_e64 vcc, |v108|, v91 | |
v_mul_f32_e32 v86, v99, v86 | |
v_cndmask_b32_e32 v112, 1.0, v98, vcc | |
v_mul_f32_e64 v108, v108, -v112 | |
v_rcp_f32_e32 v86, v86 | |
v_rcp_f32_e32 v108, v108 | |
v_cndmask_b32_e64 v90, v90, v92, s[8:9] | |
v_lshlrev_b32_e32 v92, 23, v95 | |
v_mul_f32_e32 v86, v86, v90 | |
v_mul_f32_e32 v90, v105, v104 | |
v_mul_f32_e32 v90, v108, v90 | |
v_mul_f32_e32 v90, v90, v112 | |
v_mad_f32 v90, -v100, v103, v90 | |
v_subrev_f32_e32 v90, v102, v90 | |
v_sub_f32_e32 v90, 1.0, v90 | |
v_add_i32_e32 v90, vcc, v90, v92 | |
v_mov_b32_e32 v92, 0xc2aeac4f | |
v_cmp_nlt_f32_e32 vcc, v94, v92 | |
v_mov_b32_e32 v95, 0x42b17218 | |
v_cndmask_b32_e32 v90, 0, v90, vcc | |
v_cmp_lt_f32_e32 vcc, v94, v95 | |
v_mov_b32_e32 v100, 0x7f800000 | |
v_cndmask_b32_e32 v90, v100, v90, vcc | |
v_cmp_u_f32_e32 vcc, v94, v94 | |
v_cndmask_b32_e32 v90, v90, v94, vcc | |
v_subrev_f32_e32 v94, v84, v93 | |
v_mul_f32_e32 v102, v86, v99 | |
v_add_f32_e32 v93, v84, v93 | |
v_mad_f32 v93, v93, v94, v102 | |
v_cmp_gt_f32_e32 vcc, 0, v93 | |
v_cndmask_b32_e64 v94, 0.5, -0.5, vcc | |
v_mac_f32_e32 v94, v97, v93 | |
v_cvt_i32_f32_e32 v94, v94 | |
v_madak_f32_e32 v86, v99, v86, 0x3f58560b | |
v_mad_f32 v81, -v96, v82, v81 | |
v_mad_f32 v34, v82, v96, v34 | |
v_cvt_f32_i32_e32 v97, v94 | |
v_lshlrev_b32_e32 v94, 23, v94 | |
v_mad_f32 v78, -v96, v80, v78 | |
v_mad_f32 v33, v80, v96, v33 | |
v_mad_f32 v101, v101, v97, v93 | |
v_mad_f32 v104, v103, v97, v101 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mac_f32_e32 v106, v107, v105 | |
v_mac_f32_e32 v109, v106, v105 | |
v_mac_f32_e32 v110, v109, v105 | |
v_mac_f32_e32 v111, v110, v105 | |
v_mad_f32 v105, -v105, v111, v104 | |
v_mul_f32_e32 v104, v105, v104 | |
v_sub_f32_e32 v105, 2.0, v105 | |
v_cmp_gt_f32_e64 vcc, |v105|, v91 | |
v_cndmask_b32_e32 v106, 1.0, v98, vcc | |
v_mul_f32_e64 v105, v105, -v106 | |
v_rcp_f32_e32 v105, v105 | |
v_mad_f32 v76, -v96, v79, v76 | |
v_mad_f32 v32, v79, v96, v32 | |
v_mad_f32 v85, -v67, v0, v85 | |
v_mul_f32_e32 v104, v105, v104 | |
v_mul_f32_e32 v104, v104, v106 | |
v_mad_f32 v97, -v97, v103, v104 | |
v_subrev_f32_e32 v97, v101, v97 | |
v_sub_f32_e32 v97, 1.0, v97 | |
v_add_i32_e32 v94, vcc, v97, v94 | |
v_cmp_nlt_f32_e32 vcc, v93, v92 | |
v_cndmask_b32_e32 v92, 0, v94, vcc | |
v_cmp_lt_f32_e32 vcc, v93, v95 | |
v_cndmask_b32_e32 v92, v100, v92, vcc | |
v_cmp_gt_f32_e64 vcc, |v84|, v91 | |
v_cndmask_b32_e32 v91, 1.0, v98, vcc | |
v_mul_f32_e32 v94, v91, v84 | |
v_rcp_f32_e32 v94, v94 | |
v_cmp_u_f32_e32 vcc, v93, v93 | |
v_cndmask_b32_e32 v92, v92, v93, vcc | |
v_mul_f32_e32 v90, v92, v90 | |
v_mov_b32_e32 v92, 0x40c00000 | |
v_cmp_gt_f32_e32 vcc, v92, v84 | |
v_mov_b32_e32 v92, 0x31800000 | |
v_mul_f32_e32 v90, v94, v90 | |
v_cmp_gt_f32_e64 s[10:11], v92, v84 | |
v_mad_f32 v84, -v91, v90, 1.0 | |
v_cndmask_b32_e32 v84, 1.0, v84, vcc | |
v_cndmask_b32_e64 v84, v84, v86, s[4:5] | |
v_and_b32_e32 v86, s51, v83 | |
v_or_b32_e32 v84, v86, v84 | |
v_mad_f32 v86, v102, v83, v83 | |
v_cndmask_b32_e64 v84, v84, v86, s[8:9] | |
v_mul_f32_e32 v86, 0x3f8375d4, v83 | |
v_mac_f32_e32 v86, 0x41000000, v83 | |
v_mul_f32_e32 v86, 0x3e000000, v86 | |
v_cndmask_b32_e64 v84, v84, v86, s[10:11] | |
v_cmp_u_f32_e32 vcc, v83, v83 | |
v_cndmask_b32_e32 v83, v84, v83, vcc | |
v_subrev_f32_e32 v83, v83, v88 | |
v_mul_f32_e64 v84, s19, -v88 | |
v_mac_f32_e32 v84, v83, v89 | |
v_mac_f32_e32 v5, v84, v87 | |
v_mac_f32_e32 v35, v0, v67 | |
BB6_26: ; %Flow1251 | |
; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[30:31] | |
BB6_27: ; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[12:13] | |
v_lshrrev_b32_e32 v67, 3, v69 | |
v_and_b32_e32 v67, 1, v67 | |
v_cmp_eq_u32_e32 vcc, 1, v67 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[12:13], exec, s[4:5] | |
; mask branch BB6_31 | |
s_cbranch_execz BB6_31 | |
BB6_28: ; in Loop: Header=BB6_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[86:89], v55 offset0:48 offset1:49 | |
s_and_b64 s[4:5], exec, s[0:1] | |
v_cmp_ne_u32_e32 vcc, v62, v77 | |
s_xor_b64 s[4:5], s[4:5], -1 | |
s_or_b64 s[4:5], s[4:5], vcc | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v80, v73, v87 | |
s_and_b64 s[8:9], exec, s[2:3] | |
s_or_b64 s[4:5], s[8:9], s[4:5] | |
v_subrev_f32_e32 v79, v72, v86 | |
v_mul_f32_e32 v83, v80, v80 | |
v_cndmask_b32_e64 v84, 0, 1.0, s[4:5] | |
v_subrev_f32_e32 v82, v74, v88 | |
v_mac_f32_e32 v83, v79, v79 | |
v_mac_f32_e32 v83, v82, v82 | |
v_mul_f32_e32 v84, s22, v84 | |
v_subrev_f32_e32 v67, v75, v89 | |
v_cmp_lt_f32_e32 vcc, v83, v84 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[30:31], exec, s[4:5] | |
; mask branch BB6_30 | |
s_cbranch_execz BB6_30 | |
BB6_29: ; in Loop: Header=BB6_11 Depth=1 | |
v_max_f32_e32 v86, 0x34cd15ae, v83 | |
v_mul_f32_e32 v87, v75, v89 | |
v_rsq_f32_e32 v89, v86 | |
v_lshrrev_b32_e32 v83, 3, v65 | |
v_and_b32_e32 v83, 1, v83 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e32 vcc, 1, v83 | |
ds_read_b64 v[83:84], v56 offset:192 | |
v_mul_f32_e32 v93, v89, v89 | |
v_cndmask_b32_e64 v88, 0, 1.0, vcc | |
v_mul_f32_e32 v94, v93, v93 | |
v_mul_f32_e32 v94, v88, v94 | |
v_mul_f32_e32 v95, v93, v94 | |
v_mad_f32 v94, v94, v93, s23 | |
v_mul_f32_e32 v90, v9, v86 | |
v_mul_f32_e32 v91, v90, v90 | |
v_mov_b32_e32 v92, 0x3a92b707 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v83, v70, v83 | |
v_mul_f32_e32 v84, v71, v84 | |
v_mul_f32_e32 v94, 0xbe2aaaab, v94 | |
v_madak_f32_e32 v92, v92, v91, 0x3ded3cb2 | |
v_mad_f32 v96, v84, v95, -v83 | |
v_mul_f32_e32 v97, v93, v95 | |
v_mad_f32 v95, v95, v95, s26 | |
v_mul_f32_e32 v83, v83, v94 | |
v_mov_b32_e32 v94, 0x3c739487 | |
v_mul_f32_e32 v84, v95, v84 | |
v_madak_f32_e32 v94, v94, v91, 0x3f01e2bc | |
v_mad_f32 v92, v92, v91, 1.0 | |
v_mac_f32_e32 v83, 0x3daaaaaa, v84 | |
v_mac_f32_e32 v92, v90, v94 | |
v_mov_b32_e32 v94, 0xb2951928 | |
v_mac_f32_e32 v8, v88, v83 | |
v_rcp_f32_e32 v83, v92 | |
v_madak_f32_e32 v94, v94, v91, 0xb85ffb93 | |
v_mov_b32_e32 v95, 0x35c55945 | |
v_madak_f32_e32 v95, v95, v91, 0x3a83ca0c | |
v_madak_f32_e32 v94, v94, v91, 0xbc9ded90 | |
v_madak_f32_e32 v95, v95, v91, 0x3d8eaf3b | |
v_madak_f32_e32 v91, v94, v91, 0xbf409397 | |
v_mac_f32_e32 v91, v90, v95 | |
v_mul_f32_e32 v83, v37, v83 | |
v_mul_f32_e32 v84, v88, v93 | |
v_mul_f32_e32 v83, v91, v83 | |
v_mul_f32_e32 v96, v96, v97 | |
v_mac_f32_e32 v83, v89, v84 | |
v_mac_f32_e32 v96, v83, v87 | |
v_mul_f32_e32 v83, s18, v86 | |
v_mul_f32_e32 v83, v89, v83 | |
v_and_b32_e32 v84, s27, v83 | |
v_mov_b32_e32 v86, 0x3fa00000 | |
v_cmp_gt_f32_e64 s[4:5], v86, v84 | |
v_mul_f32_e32 v86, v84, v84 | |
v_rcp_f32_e32 v90, v86 | |
v_add_f32_e32 v91, -1.0, v84 | |
v_mov_b32_e32 v93, 0xbd777f97 | |
v_mov_b32_e32 v94, 0x4036db6e | |
v_cndmask_b32_e64 v90, v90, v91, s[4:5] | |
v_mov_b32_e32 v91, 0x3f580000 | |
v_cmp_gt_f32_e64 s[8:9], v91, v84 | |
v_cndmask_b32_e64 v86, v90, v86, s[8:9] | |
v_mov_b32_e32 v91, 0xc1b38712 | |
v_madak_f32_e32 v93, v93, v86, 0x40d23f7c | |
v_madak_f32_e32 v91, v91, v86, 0x43ed43a7 | |
v_madak_f32_e32 v93, v86, v93, 0x42d9451f | |
v_madak_f32_e32 v91, v86, v91, 0x451f90ce | |
v_madak_f32_e32 v93, v86, v93, 0x43d6810b | |
v_madak_f32_e32 v91, v86, v91, 0x4547fdbb | |
v_madak_f32_e32 v93, v86, v93, 0x442158c9 | |
v_madak_f32_e32 v91, v86, v91, 0x44c01759 | |
v_madak_f32_e32 v93, v86, v93, 0x43d9486f | |
v_madak_f32_e32 v91, v86, v91, 0x43a2e571 | |
v_madak_f32_e32 v93, v86, v93, 0x4309a863 | |
v_cmp_gt_f32_e32 vcc, v94, v84 | |
v_madak_f32_e32 v91, v86, v91, 0x41f2b459 | |
v_madak_f32_e32 v93, v86, v93, 0x419d35ce | |
v_cndmask_b32_e32 v91, v91, v93, vcc | |
v_mov_b32_e32 v93, 0x3c445aa3 | |
v_madak_f32_e32 v93, v93, v86, 0x3c5f6e13 | |
v_madak_f32_e32 v93, v86, v93, 0x3e013307 | |
v_madak_f32_e32 v93, v86, v93, 0x3d931ae7 | |
v_madak_f32_e32 v93, v86, v93, 0x3f0a5785 | |
v_mov_b32_e32 v92, 0xc11d077e | |
v_madak_f32_e32 v93, v86, v93, 0x3dd9f331 | |
v_mov_b32_e32 v90, 0xc3f1c275 | |
v_madak_f32_e32 v92, v92, v86, 0xc2a2932b | |
v_cndmask_b32_e64 v91, v91, v93, s[4:5] | |
v_mov_b32_e32 v93, 0xb684e21a | |
v_madak_f32_e32 v90, v90, v86, 0xc480230b | |
v_madak_f32_e32 v92, v86, v92, 0xc3389ae7 | |
v_madak_f32_e32 v93, v93, v86, 0x390aee49 | |
v_madak_f32_e32 v90, v86, v90, 0xc41f6441 | |
v_madak_f32_e32 v92, v86, v92, 0xc322658c | |
v_madak_f32_e32 v93, v86, v93, 0x3ba68116 | |
v_madak_f32_e32 v90, v86, v90, 0xc320a2ea | |
v_madak_f32_e32 v92, v86, v92, 0xc2798057 | |
v_madak_f32_e32 v93, v86, v93, 0x3d852a63 | |
v_madak_f32_e32 v90, v86, v90, 0xc18e104b | |
v_madak_f32_e32 v92, v86, v92, 0xc128f022 | |
v_madak_f32_e32 v93, v86, v93, 0x3ecbbbce | |
v_madak_f32_e32 v90, v86, v90, 0xbf4c9dd4 | |
v_madak_f32_e32 v92, v86, v92, 0xbf31a0b7 | |
v_cndmask_b32_e64 v91, v91, v93, s[8:9] | |
v_and_b32_e32 v93, s50, v83 | |
v_mov_b32_e32 v94, 0xbf100000 | |
v_madak_f32_e32 v90, v86, v90, 0xbc21a092 | |
v_madak_f32_e32 v92, v86, v92, 0xbc21a093 | |
v_mad_f32 v94, v93, -v93, v94 | |
v_cndmask_b32_e32 v90, v90, v92, vcc | |
v_cmp_gt_f32_e32 vcc, 0, v94 | |
v_cndmask_b32_e64 v95, 0.5, -0.5, vcc | |
v_mov_b32_e32 v97, 0x3fb8aa3b | |
v_mac_f32_e32 v95, v97, v94 | |
v_cvt_i32_f32_e32 v95, v95 | |
v_mov_b32_e32 v92, 0xbb0df9c0 | |
v_madak_f32_e32 v92, v92, v86, 0x3d1151b3 | |
v_madak_f32_e32 v92, v86, v92, 0xbde31cc2 | |
v_cvt_f32_i32_e32 v100, v95 | |
v_madak_f32_e32 v92, v86, v92, 0x3ea2fe54 | |
v_mov_b32_e32 v101, 0xbf317180 | |
v_madak_f32_e32 v92, v86, v92, 0xbebe9208 | |
v_mad_f32 v102, v101, v100, v94 | |
v_mov_b32_e32 v103, 0xb717f7d1 | |
v_madak_f32_e32 v92, v86, v92, 0x3ed46805 | |
v_mad_f32 v104, v103, v100, v102 | |
v_madak_f32_e32 v92, v86, v92, 0xbb1acdc6 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mov_b32_e32 v106, 0xb5ddea0e | |
v_mov_b32_e32 v107, 0x3331bb4c | |
v_cndmask_b32_e64 v90, v90, v92, s[4:5] | |
v_mov_b32_e32 v92, 0xb7c756b1 | |
v_mad_f32 v108, v107, v105, v106 | |
v_mov_b32_e32 v109, 0x388ab355 | |
v_madak_f32_e32 v92, v92, v86, 0xbbbd1489 | |
v_mad_f32 v108, v108, v105, v109 | |
v_mov_b32_e32 v110, 0xbb360b61 | |
v_madak_f32_e32 v92, v86, v92, 0xbce9528f | |
v_mad_f32 v108, v108, v105, v110 | |
v_mov_b32_e32 v111, 0x3e2aaaab | |
v_madak_f32_e32 v92, v86, v92, 0xbea66beb | |
v_mad_f32 v108, v108, v105, v111 | |
v_madak_f32_e32 v92, v86, v92, 0x3e0375d4 | |
v_mad_f32 v105, -v105, v108, v104 | |
v_mad_f32 v86, v86, v91, 1.0 | |
v_mov_b32_e32 v91, 0x6f800000 | |
v_cmp_gt_f32_e64 vcc, |v86|, v91 | |
v_mov_b32_e32 v98, 0x2f800000 | |
v_sub_f32_e32 v108, 2.0, v105 | |
v_cndmask_b32_e32 v99, 1.0, v98, vcc | |
v_cmp_gt_f32_e64 vcc, |v108|, v91 | |
v_mul_f32_e32 v86, v99, v86 | |
v_cndmask_b32_e32 v112, 1.0, v98, vcc | |
v_mul_f32_e64 v108, v108, -v112 | |
v_rcp_f32_e32 v86, v86 | |
v_rcp_f32_e32 v108, v108 | |
v_cndmask_b32_e64 v90, v90, v92, s[8:9] | |
v_lshlrev_b32_e32 v92, 23, v95 | |
v_mul_f32_e32 v86, v86, v90 | |
v_mul_f32_e32 v90, v105, v104 | |
v_mul_f32_e32 v90, v108, v90 | |
v_mul_f32_e32 v90, v90, v112 | |
v_mad_f32 v90, -v100, v103, v90 | |
v_subrev_f32_e32 v90, v102, v90 | |
v_sub_f32_e32 v90, 1.0, v90 | |
v_add_i32_e32 v90, vcc, v90, v92 | |
v_mov_b32_e32 v92, 0xc2aeac4f | |
v_cmp_nlt_f32_e32 vcc, v94, v92 | |
v_mov_b32_e32 v95, 0x42b17218 | |
v_cndmask_b32_e32 v90, 0, v90, vcc | |
v_cmp_lt_f32_e32 vcc, v94, v95 | |
v_mov_b32_e32 v100, 0x7f800000 | |
v_cndmask_b32_e32 v90, v100, v90, vcc | |
v_cmp_u_f32_e32 vcc, v94, v94 | |
v_cndmask_b32_e32 v90, v90, v94, vcc | |
v_subrev_f32_e32 v94, v84, v93 | |
v_mul_f32_e32 v102, v86, v99 | |
v_add_f32_e32 v93, v84, v93 | |
v_mad_f32 v93, v93, v94, v102 | |
v_cmp_gt_f32_e32 vcc, 0, v93 | |
v_cndmask_b32_e64 v94, 0.5, -0.5, vcc | |
v_mac_f32_e32 v94, v97, v93 | |
v_cvt_i32_f32_e32 v94, v94 | |
v_madak_f32_e32 v86, v99, v86, 0x3f58560b | |
v_mad_f32 v81, -v96, v82, v81 | |
v_mad_f32 v30, v82, v96, v30 | |
v_cvt_f32_i32_e32 v97, v94 | |
v_lshlrev_b32_e32 v94, 23, v94 | |
v_mad_f32 v78, -v96, v80, v78 | |
v_mad_f32 v29, v80, v96, v29 | |
v_mad_f32 v101, v101, v97, v93 | |
v_mad_f32 v104, v103, v97, v101 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mac_f32_e32 v106, v107, v105 | |
v_mac_f32_e32 v109, v106, v105 | |
v_mac_f32_e32 v110, v109, v105 | |
v_mac_f32_e32 v111, v110, v105 | |
v_mad_f32 v105, -v105, v111, v104 | |
v_mul_f32_e32 v104, v105, v104 | |
v_sub_f32_e32 v105, 2.0, v105 | |
v_cmp_gt_f32_e64 vcc, |v105|, v91 | |
v_cndmask_b32_e32 v106, 1.0, v98, vcc | |
v_mul_f32_e64 v105, v105, -v106 | |
v_rcp_f32_e32 v105, v105 | |
v_mad_f32 v76, -v96, v79, v76 | |
v_mad_f32 v28, v79, v96, v28 | |
v_mad_f32 v85, -v67, v0, v85 | |
v_mul_f32_e32 v104, v105, v104 | |
v_mul_f32_e32 v104, v104, v106 | |
v_mad_f32 v97, -v97, v103, v104 | |
v_subrev_f32_e32 v97, v101, v97 | |
v_sub_f32_e32 v97, 1.0, v97 | |
v_add_i32_e32 v94, vcc, v97, v94 | |
v_cmp_nlt_f32_e32 vcc, v93, v92 | |
v_cndmask_b32_e32 v92, 0, v94, vcc | |
v_cmp_lt_f32_e32 vcc, v93, v95 | |
v_cndmask_b32_e32 v92, v100, v92, vcc | |
v_cmp_gt_f32_e64 vcc, |v84|, v91 | |
v_cndmask_b32_e32 v91, 1.0, v98, vcc | |
v_mul_f32_e32 v94, v91, v84 | |
v_rcp_f32_e32 v94, v94 | |
v_cmp_u_f32_e32 vcc, v93, v93 | |
v_cndmask_b32_e32 v92, v92, v93, vcc | |
v_mul_f32_e32 v90, v92, v90 | |
v_mov_b32_e32 v92, 0x40c00000 | |
v_cmp_gt_f32_e32 vcc, v92, v84 | |
v_mov_b32_e32 v92, 0x31800000 | |
v_mul_f32_e32 v90, v94, v90 | |
v_cmp_gt_f32_e64 s[10:11], v92, v84 | |
v_mad_f32 v84, -v91, v90, 1.0 | |
v_cndmask_b32_e32 v84, 1.0, v84, vcc | |
v_cndmask_b32_e64 v84, v84, v86, s[4:5] | |
v_and_b32_e32 v86, s51, v83 | |
v_or_b32_e32 v84, v86, v84 | |
v_mad_f32 v86, v102, v83, v83 | |
v_cndmask_b32_e64 v84, v84, v86, s[8:9] | |
v_mul_f32_e32 v86, 0x3f8375d4, v83 | |
v_mac_f32_e32 v86, 0x41000000, v83 | |
v_mul_f32_e32 v86, 0x3e000000, v86 | |
v_cndmask_b32_e64 v84, v84, v86, s[10:11] | |
v_cmp_u_f32_e32 vcc, v83, v83 | |
v_cndmask_b32_e32 v83, v84, v83, vcc | |
v_subrev_f32_e32 v83, v83, v88 | |
v_mul_f32_e64 v84, s19, -v88 | |
v_mac_f32_e32 v84, v83, v89 | |
v_mac_f32_e32 v5, v84, v87 | |
v_mac_f32_e32 v31, v0, v67 | |
BB6_30: ; %Flow1250 | |
; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[30:31] | |
BB6_31: ; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[12:13] | |
v_lshrrev_b32_e32 v67, 4, v69 | |
v_and_b32_e32 v67, 1, v67 | |
v_cmp_eq_u32_e32 vcc, 1, v67 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[12:13], exec, s[4:5] | |
; mask branch BB6_35 | |
s_cbranch_execz BB6_35 | |
BB6_32: ; in Loop: Header=BB6_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[86:89], v55 offset0:64 offset1:65 | |
s_and_b64 s[4:5], exec, s[0:1] | |
v_cmp_ne_u32_e32 vcc, v61, v77 | |
s_xor_b64 s[4:5], s[4:5], -1 | |
s_or_b64 s[4:5], s[4:5], vcc | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v80, v73, v87 | |
s_and_b64 s[8:9], exec, s[2:3] | |
s_or_b64 s[4:5], s[8:9], s[4:5] | |
v_subrev_f32_e32 v79, v72, v86 | |
v_mul_f32_e32 v83, v80, v80 | |
v_cndmask_b32_e64 v84, 0, 1.0, s[4:5] | |
v_subrev_f32_e32 v82, v74, v88 | |
v_mac_f32_e32 v83, v79, v79 | |
v_mac_f32_e32 v83, v82, v82 | |
v_mul_f32_e32 v84, s22, v84 | |
v_subrev_f32_e32 v67, v75, v89 | |
v_cmp_lt_f32_e32 vcc, v83, v84 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[30:31], exec, s[4:5] | |
; mask branch BB6_34 | |
s_cbranch_execz BB6_34 | |
BB6_33: ; in Loop: Header=BB6_11 Depth=1 | |
v_max_f32_e32 v86, 0x34cd15ae, v83 | |
v_mul_f32_e32 v87, v75, v89 | |
v_rsq_f32_e32 v89, v86 | |
v_lshrrev_b32_e32 v83, 4, v65 | |
v_and_b32_e32 v83, 1, v83 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e32 vcc, 1, v83 | |
ds_read_b64 v[83:84], v56 offset:256 | |
v_mul_f32_e32 v93, v89, v89 | |
v_cndmask_b32_e64 v88, 0, 1.0, vcc | |
v_mul_f32_e32 v94, v93, v93 | |
v_mul_f32_e32 v94, v88, v94 | |
v_mul_f32_e32 v95, v93, v94 | |
v_mad_f32 v94, v94, v93, s23 | |
v_mul_f32_e32 v90, v9, v86 | |
v_mul_f32_e32 v91, v90, v90 | |
v_mov_b32_e32 v92, 0x3a92b707 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v83, v70, v83 | |
v_mul_f32_e32 v84, v71, v84 | |
v_mul_f32_e32 v94, 0xbe2aaaab, v94 | |
v_madak_f32_e32 v92, v92, v91, 0x3ded3cb2 | |
v_mad_f32 v96, v84, v95, -v83 | |
v_mul_f32_e32 v97, v93, v95 | |
v_mad_f32 v95, v95, v95, s26 | |
v_mul_f32_e32 v83, v83, v94 | |
v_mov_b32_e32 v94, 0x3c739487 | |
v_mul_f32_e32 v84, v95, v84 | |
v_madak_f32_e32 v94, v94, v91, 0x3f01e2bc | |
v_mad_f32 v92, v92, v91, 1.0 | |
v_mac_f32_e32 v83, 0x3daaaaaa, v84 | |
v_mac_f32_e32 v92, v90, v94 | |
v_mov_b32_e32 v94, 0xb2951928 | |
v_mac_f32_e32 v8, v88, v83 | |
v_rcp_f32_e32 v83, v92 | |
v_madak_f32_e32 v94, v94, v91, 0xb85ffb93 | |
v_mov_b32_e32 v95, 0x35c55945 | |
v_madak_f32_e32 v95, v95, v91, 0x3a83ca0c | |
v_madak_f32_e32 v94, v94, v91, 0xbc9ded90 | |
v_madak_f32_e32 v95, v95, v91, 0x3d8eaf3b | |
v_madak_f32_e32 v91, v94, v91, 0xbf409397 | |
v_mac_f32_e32 v91, v90, v95 | |
v_mul_f32_e32 v83, v37, v83 | |
v_mul_f32_e32 v84, v88, v93 | |
v_mul_f32_e32 v83, v91, v83 | |
v_mul_f32_e32 v96, v96, v97 | |
v_mac_f32_e32 v83, v89, v84 | |
v_mac_f32_e32 v96, v83, v87 | |
v_mul_f32_e32 v83, s18, v86 | |
v_mul_f32_e32 v83, v89, v83 | |
v_and_b32_e32 v84, s27, v83 | |
v_mov_b32_e32 v86, 0x3fa00000 | |
v_cmp_gt_f32_e64 s[4:5], v86, v84 | |
v_mul_f32_e32 v86, v84, v84 | |
v_rcp_f32_e32 v90, v86 | |
v_add_f32_e32 v91, -1.0, v84 | |
v_mov_b32_e32 v93, 0xbd777f97 | |
v_mov_b32_e32 v94, 0x4036db6e | |
v_cndmask_b32_e64 v90, v90, v91, s[4:5] | |
v_mov_b32_e32 v91, 0x3f580000 | |
v_cmp_gt_f32_e64 s[8:9], v91, v84 | |
v_cndmask_b32_e64 v86, v90, v86, s[8:9] | |
v_mov_b32_e32 v91, 0xc1b38712 | |
v_madak_f32_e32 v93, v93, v86, 0x40d23f7c | |
v_madak_f32_e32 v91, v91, v86, 0x43ed43a7 | |
v_madak_f32_e32 v93, v86, v93, 0x42d9451f | |
v_madak_f32_e32 v91, v86, v91, 0x451f90ce | |
v_madak_f32_e32 v93, v86, v93, 0x43d6810b | |
v_madak_f32_e32 v91, v86, v91, 0x4547fdbb | |
v_madak_f32_e32 v93, v86, v93, 0x442158c9 | |
v_madak_f32_e32 v91, v86, v91, 0x44c01759 | |
v_madak_f32_e32 v93, v86, v93, 0x43d9486f | |
v_madak_f32_e32 v91, v86, v91, 0x43a2e571 | |
v_madak_f32_e32 v93, v86, v93, 0x4309a863 | |
v_cmp_gt_f32_e32 vcc, v94, v84 | |
v_madak_f32_e32 v91, v86, v91, 0x41f2b459 | |
v_madak_f32_e32 v93, v86, v93, 0x419d35ce | |
v_cndmask_b32_e32 v91, v91, v93, vcc | |
v_mov_b32_e32 v93, 0x3c445aa3 | |
v_madak_f32_e32 v93, v93, v86, 0x3c5f6e13 | |
v_madak_f32_e32 v93, v86, v93, 0x3e013307 | |
v_madak_f32_e32 v93, v86, v93, 0x3d931ae7 | |
v_madak_f32_e32 v93, v86, v93, 0x3f0a5785 | |
v_mov_b32_e32 v92, 0xc11d077e | |
v_madak_f32_e32 v93, v86, v93, 0x3dd9f331 | |
v_mov_b32_e32 v90, 0xc3f1c275 | |
v_madak_f32_e32 v92, v92, v86, 0xc2a2932b | |
v_cndmask_b32_e64 v91, v91, v93, s[4:5] | |
v_mov_b32_e32 v93, 0xb684e21a | |
v_madak_f32_e32 v90, v90, v86, 0xc480230b | |
v_madak_f32_e32 v92, v86, v92, 0xc3389ae7 | |
v_madak_f32_e32 v93, v93, v86, 0x390aee49 | |
v_madak_f32_e32 v90, v86, v90, 0xc41f6441 | |
v_madak_f32_e32 v92, v86, v92, 0xc322658c | |
v_madak_f32_e32 v93, v86, v93, 0x3ba68116 | |
v_madak_f32_e32 v90, v86, v90, 0xc320a2ea | |
v_madak_f32_e32 v92, v86, v92, 0xc2798057 | |
v_madak_f32_e32 v93, v86, v93, 0x3d852a63 | |
v_madak_f32_e32 v90, v86, v90, 0xc18e104b | |
v_madak_f32_e32 v92, v86, v92, 0xc128f022 | |
v_madak_f32_e32 v93, v86, v93, 0x3ecbbbce | |
v_madak_f32_e32 v90, v86, v90, 0xbf4c9dd4 | |
v_madak_f32_e32 v92, v86, v92, 0xbf31a0b7 | |
v_cndmask_b32_e64 v91, v91, v93, s[8:9] | |
v_and_b32_e32 v93, s50, v83 | |
v_mov_b32_e32 v94, 0xbf100000 | |
v_madak_f32_e32 v90, v86, v90, 0xbc21a092 | |
v_madak_f32_e32 v92, v86, v92, 0xbc21a093 | |
v_mad_f32 v94, v93, -v93, v94 | |
v_cndmask_b32_e32 v90, v90, v92, vcc | |
v_cmp_gt_f32_e32 vcc, 0, v94 | |
v_cndmask_b32_e64 v95, 0.5, -0.5, vcc | |
v_mov_b32_e32 v97, 0x3fb8aa3b | |
v_mac_f32_e32 v95, v97, v94 | |
v_cvt_i32_f32_e32 v95, v95 | |
v_mov_b32_e32 v92, 0xbb0df9c0 | |
v_madak_f32_e32 v92, v92, v86, 0x3d1151b3 | |
v_madak_f32_e32 v92, v86, v92, 0xbde31cc2 | |
v_cvt_f32_i32_e32 v100, v95 | |
v_madak_f32_e32 v92, v86, v92, 0x3ea2fe54 | |
v_mov_b32_e32 v101, 0xbf317180 | |
v_madak_f32_e32 v92, v86, v92, 0xbebe9208 | |
v_mad_f32 v102, v101, v100, v94 | |
v_mov_b32_e32 v103, 0xb717f7d1 | |
v_madak_f32_e32 v92, v86, v92, 0x3ed46805 | |
v_mad_f32 v104, v103, v100, v102 | |
v_madak_f32_e32 v92, v86, v92, 0xbb1acdc6 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mov_b32_e32 v106, 0xb5ddea0e | |
v_mov_b32_e32 v107, 0x3331bb4c | |
v_cndmask_b32_e64 v90, v90, v92, s[4:5] | |
v_mov_b32_e32 v92, 0xb7c756b1 | |
v_mad_f32 v108, v107, v105, v106 | |
v_mov_b32_e32 v109, 0x388ab355 | |
v_madak_f32_e32 v92, v92, v86, 0xbbbd1489 | |
v_mad_f32 v108, v108, v105, v109 | |
v_mov_b32_e32 v110, 0xbb360b61 | |
v_madak_f32_e32 v92, v86, v92, 0xbce9528f | |
v_mad_f32 v108, v108, v105, v110 | |
v_mov_b32_e32 v111, 0x3e2aaaab | |
v_madak_f32_e32 v92, v86, v92, 0xbea66beb | |
v_mad_f32 v108, v108, v105, v111 | |
v_madak_f32_e32 v92, v86, v92, 0x3e0375d4 | |
v_mad_f32 v105, -v105, v108, v104 | |
v_mad_f32 v86, v86, v91, 1.0 | |
v_mov_b32_e32 v91, 0x6f800000 | |
v_cmp_gt_f32_e64 vcc, |v86|, v91 | |
v_mov_b32_e32 v98, 0x2f800000 | |
v_sub_f32_e32 v108, 2.0, v105 | |
v_cndmask_b32_e32 v99, 1.0, v98, vcc | |
v_cmp_gt_f32_e64 vcc, |v108|, v91 | |
v_mul_f32_e32 v86, v99, v86 | |
v_cndmask_b32_e32 v112, 1.0, v98, vcc | |
v_mul_f32_e64 v108, v108, -v112 | |
v_rcp_f32_e32 v86, v86 | |
v_rcp_f32_e32 v108, v108 | |
v_cndmask_b32_e64 v90, v90, v92, s[8:9] | |
v_lshlrev_b32_e32 v92, 23, v95 | |
v_mul_f32_e32 v86, v86, v90 | |
v_mul_f32_e32 v90, v105, v104 | |
v_mul_f32_e32 v90, v108, v90 | |
v_mul_f32_e32 v90, v90, v112 | |
v_mad_f32 v90, -v100, v103, v90 | |
v_subrev_f32_e32 v90, v102, v90 | |
v_sub_f32_e32 v90, 1.0, v90 | |
v_add_i32_e32 v90, vcc, v90, v92 | |
v_mov_b32_e32 v92, 0xc2aeac4f | |
v_cmp_nlt_f32_e32 vcc, v94, v92 | |
v_mov_b32_e32 v95, 0x42b17218 | |
v_cndmask_b32_e32 v90, 0, v90, vcc | |
v_cmp_lt_f32_e32 vcc, v94, v95 | |
v_mov_b32_e32 v100, 0x7f800000 | |
v_cndmask_b32_e32 v90, v100, v90, vcc | |
v_cmp_u_f32_e32 vcc, v94, v94 | |
v_cndmask_b32_e32 v90, v90, v94, vcc | |
v_subrev_f32_e32 v94, v84, v93 | |
v_mul_f32_e32 v102, v86, v99 | |
v_add_f32_e32 v93, v84, v93 | |
v_mad_f32 v93, v93, v94, v102 | |
v_cmp_gt_f32_e32 vcc, 0, v93 | |
v_cndmask_b32_e64 v94, 0.5, -0.5, vcc | |
v_mac_f32_e32 v94, v97, v93 | |
v_cvt_i32_f32_e32 v94, v94 | |
v_madak_f32_e32 v86, v99, v86, 0x3f58560b | |
v_mad_f32 v81, -v96, v82, v81 | |
v_mad_f32 v26, v82, v96, v26 | |
v_cvt_f32_i32_e32 v97, v94 | |
v_lshlrev_b32_e32 v94, 23, v94 | |
v_mad_f32 v78, -v96, v80, v78 | |
v_mad_f32 v25, v80, v96, v25 | |
v_mad_f32 v101, v101, v97, v93 | |
v_mad_f32 v104, v103, v97, v101 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mac_f32_e32 v106, v107, v105 | |
v_mac_f32_e32 v109, v106, v105 | |
v_mac_f32_e32 v110, v109, v105 | |
v_mac_f32_e32 v111, v110, v105 | |
v_mad_f32 v105, -v105, v111, v104 | |
v_mul_f32_e32 v104, v105, v104 | |
v_sub_f32_e32 v105, 2.0, v105 | |
v_cmp_gt_f32_e64 vcc, |v105|, v91 | |
v_cndmask_b32_e32 v106, 1.0, v98, vcc | |
v_mul_f32_e64 v105, v105, -v106 | |
v_rcp_f32_e32 v105, v105 | |
v_mad_f32 v76, -v96, v79, v76 | |
v_mad_f32 v24, v79, v96, v24 | |
v_mad_f32 v85, -v67, v0, v85 | |
v_mul_f32_e32 v104, v105, v104 | |
v_mul_f32_e32 v104, v104, v106 | |
v_mad_f32 v97, -v97, v103, v104 | |
v_subrev_f32_e32 v97, v101, v97 | |
v_sub_f32_e32 v97, 1.0, v97 | |
v_add_i32_e32 v94, vcc, v97, v94 | |
v_cmp_nlt_f32_e32 vcc, v93, v92 | |
v_cndmask_b32_e32 v92, 0, v94, vcc | |
v_cmp_lt_f32_e32 vcc, v93, v95 | |
v_cndmask_b32_e32 v92, v100, v92, vcc | |
v_cmp_gt_f32_e64 vcc, |v84|, v91 | |
v_cndmask_b32_e32 v91, 1.0, v98, vcc | |
v_mul_f32_e32 v94, v91, v84 | |
v_rcp_f32_e32 v94, v94 | |
v_cmp_u_f32_e32 vcc, v93, v93 | |
v_cndmask_b32_e32 v92, v92, v93, vcc | |
v_mul_f32_e32 v90, v92, v90 | |
v_mov_b32_e32 v92, 0x40c00000 | |
v_cmp_gt_f32_e32 vcc, v92, v84 | |
v_mov_b32_e32 v92, 0x31800000 | |
v_mul_f32_e32 v90, v94, v90 | |
v_cmp_gt_f32_e64 s[10:11], v92, v84 | |
v_mad_f32 v84, -v91, v90, 1.0 | |
v_cndmask_b32_e32 v84, 1.0, v84, vcc | |
v_cndmask_b32_e64 v84, v84, v86, s[4:5] | |
v_and_b32_e32 v86, s51, v83 | |
v_or_b32_e32 v84, v86, v84 | |
v_mad_f32 v86, v102, v83, v83 | |
v_cndmask_b32_e64 v84, v84, v86, s[8:9] | |
v_mul_f32_e32 v86, 0x3f8375d4, v83 | |
v_mac_f32_e32 v86, 0x41000000, v83 | |
v_mul_f32_e32 v86, 0x3e000000, v86 | |
v_cndmask_b32_e64 v84, v84, v86, s[10:11] | |
v_cmp_u_f32_e32 vcc, v83, v83 | |
v_cndmask_b32_e32 v83, v84, v83, vcc | |
v_subrev_f32_e32 v83, v83, v88 | |
v_mul_f32_e64 v84, s19, -v88 | |
v_mac_f32_e32 v84, v83, v89 | |
v_mac_f32_e32 v5, v84, v87 | |
v_mac_f32_e32 v27, v0, v67 | |
BB6_34: ; %Flow1249 | |
; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[30:31] | |
BB6_35: ; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[12:13] | |
v_lshrrev_b32_e32 v67, 5, v69 | |
v_and_b32_e32 v67, 1, v67 | |
v_cmp_eq_u32_e32 vcc, 1, v67 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[12:13], exec, s[4:5] | |
; mask branch BB6_39 | |
s_cbranch_execz BB6_39 | |
BB6_36: ; in Loop: Header=BB6_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[86:89], v55 offset0:80 offset1:81 | |
s_and_b64 s[4:5], exec, s[0:1] | |
v_cmp_ne_u32_e32 vcc, v60, v77 | |
s_xor_b64 s[4:5], s[4:5], -1 | |
s_or_b64 s[4:5], s[4:5], vcc | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v80, v73, v87 | |
s_and_b64 s[8:9], exec, s[2:3] | |
s_or_b64 s[4:5], s[8:9], s[4:5] | |
v_subrev_f32_e32 v79, v72, v86 | |
v_mul_f32_e32 v83, v80, v80 | |
v_cndmask_b32_e64 v84, 0, 1.0, s[4:5] | |
v_subrev_f32_e32 v82, v74, v88 | |
v_mac_f32_e32 v83, v79, v79 | |
v_mac_f32_e32 v83, v82, v82 | |
v_mul_f32_e32 v84, s22, v84 | |
v_subrev_f32_e32 v67, v75, v89 | |
v_cmp_lt_f32_e32 vcc, v83, v84 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[30:31], exec, s[4:5] | |
; mask branch BB6_38 | |
s_cbranch_execz BB6_38 | |
BB6_37: ; in Loop: Header=BB6_11 Depth=1 | |
v_max_f32_e32 v86, 0x34cd15ae, v83 | |
v_mul_f32_e32 v87, v75, v89 | |
v_rsq_f32_e32 v89, v86 | |
v_lshrrev_b32_e32 v83, 5, v65 | |
v_and_b32_e32 v83, 1, v83 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e32 vcc, 1, v83 | |
ds_read_b64 v[83:84], v56 offset:320 | |
v_mul_f32_e32 v93, v89, v89 | |
v_cndmask_b32_e64 v88, 0, 1.0, vcc | |
v_mul_f32_e32 v94, v93, v93 | |
v_mul_f32_e32 v94, v88, v94 | |
v_mul_f32_e32 v95, v93, v94 | |
v_mad_f32 v94, v94, v93, s23 | |
v_mul_f32_e32 v90, v9, v86 | |
v_mul_f32_e32 v91, v90, v90 | |
v_mov_b32_e32 v92, 0x3a92b707 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v83, v70, v83 | |
v_mul_f32_e32 v84, v71, v84 | |
v_mul_f32_e32 v94, 0xbe2aaaab, v94 | |
v_madak_f32_e32 v92, v92, v91, 0x3ded3cb2 | |
v_mad_f32 v96, v84, v95, -v83 | |
v_mul_f32_e32 v97, v93, v95 | |
v_mad_f32 v95, v95, v95, s26 | |
v_mul_f32_e32 v83, v83, v94 | |
v_mov_b32_e32 v94, 0x3c739487 | |
v_mul_f32_e32 v84, v95, v84 | |
v_madak_f32_e32 v94, v94, v91, 0x3f01e2bc | |
v_mad_f32 v92, v92, v91, 1.0 | |
v_mac_f32_e32 v83, 0x3daaaaaa, v84 | |
v_mac_f32_e32 v92, v90, v94 | |
v_mov_b32_e32 v94, 0xb2951928 | |
v_mac_f32_e32 v8, v88, v83 | |
v_rcp_f32_e32 v83, v92 | |
v_madak_f32_e32 v94, v94, v91, 0xb85ffb93 | |
v_mov_b32_e32 v95, 0x35c55945 | |
v_madak_f32_e32 v95, v95, v91, 0x3a83ca0c | |
v_madak_f32_e32 v94, v94, v91, 0xbc9ded90 | |
v_madak_f32_e32 v95, v95, v91, 0x3d8eaf3b | |
v_madak_f32_e32 v91, v94, v91, 0xbf409397 | |
v_mac_f32_e32 v91, v90, v95 | |
v_mul_f32_e32 v83, v37, v83 | |
v_mul_f32_e32 v84, v88, v93 | |
v_mul_f32_e32 v83, v91, v83 | |
v_mul_f32_e32 v96, v96, v97 | |
v_mac_f32_e32 v83, v89, v84 | |
v_mac_f32_e32 v96, v83, v87 | |
v_mul_f32_e32 v83, s18, v86 | |
v_mul_f32_e32 v83, v89, v83 | |
v_and_b32_e32 v84, s27, v83 | |
v_mov_b32_e32 v86, 0x3fa00000 | |
v_cmp_gt_f32_e64 s[4:5], v86, v84 | |
v_mul_f32_e32 v86, v84, v84 | |
v_rcp_f32_e32 v90, v86 | |
v_add_f32_e32 v91, -1.0, v84 | |
v_mov_b32_e32 v93, 0xbd777f97 | |
v_mov_b32_e32 v94, 0x4036db6e | |
v_cndmask_b32_e64 v90, v90, v91, s[4:5] | |
v_mov_b32_e32 v91, 0x3f580000 | |
v_cmp_gt_f32_e64 s[8:9], v91, v84 | |
v_cndmask_b32_e64 v86, v90, v86, s[8:9] | |
v_mov_b32_e32 v91, 0xc1b38712 | |
v_madak_f32_e32 v93, v93, v86, 0x40d23f7c | |
v_madak_f32_e32 v91, v91, v86, 0x43ed43a7 | |
v_madak_f32_e32 v93, v86, v93, 0x42d9451f | |
v_madak_f32_e32 v91, v86, v91, 0x451f90ce | |
v_madak_f32_e32 v93, v86, v93, 0x43d6810b | |
v_madak_f32_e32 v91, v86, v91, 0x4547fdbb | |
v_madak_f32_e32 v93, v86, v93, 0x442158c9 | |
v_madak_f32_e32 v91, v86, v91, 0x44c01759 | |
v_madak_f32_e32 v93, v86, v93, 0x43d9486f | |
v_madak_f32_e32 v91, v86, v91, 0x43a2e571 | |
v_madak_f32_e32 v93, v86, v93, 0x4309a863 | |
v_cmp_gt_f32_e32 vcc, v94, v84 | |
v_madak_f32_e32 v91, v86, v91, 0x41f2b459 | |
v_madak_f32_e32 v93, v86, v93, 0x419d35ce | |
v_cndmask_b32_e32 v91, v91, v93, vcc | |
v_mov_b32_e32 v93, 0x3c445aa3 | |
v_madak_f32_e32 v93, v93, v86, 0x3c5f6e13 | |
v_madak_f32_e32 v93, v86, v93, 0x3e013307 | |
v_madak_f32_e32 v93, v86, v93, 0x3d931ae7 | |
v_madak_f32_e32 v93, v86, v93, 0x3f0a5785 | |
v_mov_b32_e32 v92, 0xc11d077e | |
v_madak_f32_e32 v93, v86, v93, 0x3dd9f331 | |
v_mov_b32_e32 v90, 0xc3f1c275 | |
v_madak_f32_e32 v92, v92, v86, 0xc2a2932b | |
v_cndmask_b32_e64 v91, v91, v93, s[4:5] | |
v_mov_b32_e32 v93, 0xb684e21a | |
v_madak_f32_e32 v90, v90, v86, 0xc480230b | |
v_madak_f32_e32 v92, v86, v92, 0xc3389ae7 | |
v_madak_f32_e32 v93, v93, v86, 0x390aee49 | |
v_madak_f32_e32 v90, v86, v90, 0xc41f6441 | |
v_madak_f32_e32 v92, v86, v92, 0xc322658c | |
v_madak_f32_e32 v93, v86, v93, 0x3ba68116 | |
v_madak_f32_e32 v90, v86, v90, 0xc320a2ea | |
v_madak_f32_e32 v92, v86, v92, 0xc2798057 | |
v_madak_f32_e32 v93, v86, v93, 0x3d852a63 | |
v_madak_f32_e32 v90, v86, v90, 0xc18e104b | |
v_madak_f32_e32 v92, v86, v92, 0xc128f022 | |
v_madak_f32_e32 v93, v86, v93, 0x3ecbbbce | |
v_madak_f32_e32 v90, v86, v90, 0xbf4c9dd4 | |
v_madak_f32_e32 v92, v86, v92, 0xbf31a0b7 | |
v_cndmask_b32_e64 v91, v91, v93, s[8:9] | |
v_and_b32_e32 v93, s50, v83 | |
v_mov_b32_e32 v94, 0xbf100000 | |
v_madak_f32_e32 v90, v86, v90, 0xbc21a092 | |
v_madak_f32_e32 v92, v86, v92, 0xbc21a093 | |
v_mad_f32 v94, v93, -v93, v94 | |
v_cndmask_b32_e32 v90, v90, v92, vcc | |
v_cmp_gt_f32_e32 vcc, 0, v94 | |
v_cndmask_b32_e64 v95, 0.5, -0.5, vcc | |
v_mov_b32_e32 v97, 0x3fb8aa3b | |
v_mac_f32_e32 v95, v97, v94 | |
v_cvt_i32_f32_e32 v95, v95 | |
v_mov_b32_e32 v92, 0xbb0df9c0 | |
v_madak_f32_e32 v92, v92, v86, 0x3d1151b3 | |
v_madak_f32_e32 v92, v86, v92, 0xbde31cc2 | |
v_cvt_f32_i32_e32 v100, v95 | |
v_madak_f32_e32 v92, v86, v92, 0x3ea2fe54 | |
v_mov_b32_e32 v101, 0xbf317180 | |
v_madak_f32_e32 v92, v86, v92, 0xbebe9208 | |
v_mad_f32 v102, v101, v100, v94 | |
v_mov_b32_e32 v103, 0xb717f7d1 | |
v_madak_f32_e32 v92, v86, v92, 0x3ed46805 | |
v_mad_f32 v104, v103, v100, v102 | |
v_madak_f32_e32 v92, v86, v92, 0xbb1acdc6 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mov_b32_e32 v106, 0xb5ddea0e | |
v_mov_b32_e32 v107, 0x3331bb4c | |
v_cndmask_b32_e64 v90, v90, v92, s[4:5] | |
v_mov_b32_e32 v92, 0xb7c756b1 | |
v_mad_f32 v108, v107, v105, v106 | |
v_mov_b32_e32 v109, 0x388ab355 | |
v_madak_f32_e32 v92, v92, v86, 0xbbbd1489 | |
v_mad_f32 v108, v108, v105, v109 | |
v_mov_b32_e32 v110, 0xbb360b61 | |
v_madak_f32_e32 v92, v86, v92, 0xbce9528f | |
v_mad_f32 v108, v108, v105, v110 | |
v_mov_b32_e32 v111, 0x3e2aaaab | |
v_madak_f32_e32 v92, v86, v92, 0xbea66beb | |
v_mad_f32 v108, v108, v105, v111 | |
v_madak_f32_e32 v92, v86, v92, 0x3e0375d4 | |
v_mad_f32 v105, -v105, v108, v104 | |
v_mad_f32 v86, v86, v91, 1.0 | |
v_mov_b32_e32 v91, 0x6f800000 | |
v_cmp_gt_f32_e64 vcc, |v86|, v91 | |
v_mov_b32_e32 v98, 0x2f800000 | |
v_sub_f32_e32 v108, 2.0, v105 | |
v_cndmask_b32_e32 v99, 1.0, v98, vcc | |
v_cmp_gt_f32_e64 vcc, |v108|, v91 | |
v_mul_f32_e32 v86, v99, v86 | |
v_cndmask_b32_e32 v112, 1.0, v98, vcc | |
v_mul_f32_e64 v108, v108, -v112 | |
v_rcp_f32_e32 v86, v86 | |
v_rcp_f32_e32 v108, v108 | |
v_cndmask_b32_e64 v90, v90, v92, s[8:9] | |
v_lshlrev_b32_e32 v92, 23, v95 | |
v_mul_f32_e32 v86, v86, v90 | |
v_mul_f32_e32 v90, v105, v104 | |
v_mul_f32_e32 v90, v108, v90 | |
v_mul_f32_e32 v90, v90, v112 | |
v_mad_f32 v90, -v100, v103, v90 | |
v_subrev_f32_e32 v90, v102, v90 | |
v_sub_f32_e32 v90, 1.0, v90 | |
v_add_i32_e32 v90, vcc, v90, v92 | |
v_mov_b32_e32 v92, 0xc2aeac4f | |
v_cmp_nlt_f32_e32 vcc, v94, v92 | |
v_mov_b32_e32 v95, 0x42b17218 | |
v_cndmask_b32_e32 v90, 0, v90, vcc | |
v_cmp_lt_f32_e32 vcc, v94, v95 | |
v_mov_b32_e32 v100, 0x7f800000 | |
v_cndmask_b32_e32 v90, v100, v90, vcc | |
v_cmp_u_f32_e32 vcc, v94, v94 | |
v_cndmask_b32_e32 v90, v90, v94, vcc | |
v_subrev_f32_e32 v94, v84, v93 | |
v_mul_f32_e32 v102, v86, v99 | |
v_add_f32_e32 v93, v84, v93 | |
v_mad_f32 v93, v93, v94, v102 | |
v_cmp_gt_f32_e32 vcc, 0, v93 | |
v_cndmask_b32_e64 v94, 0.5, -0.5, vcc | |
v_mac_f32_e32 v94, v97, v93 | |
v_cvt_i32_f32_e32 v94, v94 | |
v_madak_f32_e32 v86, v99, v86, 0x3f58560b | |
v_mad_f32 v81, -v96, v82, v81 | |
v_mad_f32 v22, v82, v96, v22 | |
v_cvt_f32_i32_e32 v97, v94 | |
v_lshlrev_b32_e32 v94, 23, v94 | |
v_mad_f32 v78, -v96, v80, v78 | |
v_mad_f32 v21, v80, v96, v21 | |
v_mad_f32 v101, v101, v97, v93 | |
v_mad_f32 v104, v103, v97, v101 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mac_f32_e32 v106, v107, v105 | |
v_mac_f32_e32 v109, v106, v105 | |
v_mac_f32_e32 v110, v109, v105 | |
v_mac_f32_e32 v111, v110, v105 | |
v_mad_f32 v105, -v105, v111, v104 | |
v_mul_f32_e32 v104, v105, v104 | |
v_sub_f32_e32 v105, 2.0, v105 | |
v_cmp_gt_f32_e64 vcc, |v105|, v91 | |
v_cndmask_b32_e32 v106, 1.0, v98, vcc | |
v_mul_f32_e64 v105, v105, -v106 | |
v_rcp_f32_e32 v105, v105 | |
v_mad_f32 v76, -v96, v79, v76 | |
v_mad_f32 v20, v79, v96, v20 | |
v_mad_f32 v85, -v67, v0, v85 | |
v_mul_f32_e32 v104, v105, v104 | |
v_mul_f32_e32 v104, v104, v106 | |
v_mad_f32 v97, -v97, v103, v104 | |
v_subrev_f32_e32 v97, v101, v97 | |
v_sub_f32_e32 v97, 1.0, v97 | |
v_add_i32_e32 v94, vcc, v97, v94 | |
v_cmp_nlt_f32_e32 vcc, v93, v92 | |
v_cndmask_b32_e32 v92, 0, v94, vcc | |
v_cmp_lt_f32_e32 vcc, v93, v95 | |
v_cndmask_b32_e32 v92, v100, v92, vcc | |
v_cmp_gt_f32_e64 vcc, |v84|, v91 | |
v_cndmask_b32_e32 v91, 1.0, v98, vcc | |
v_mul_f32_e32 v94, v91, v84 | |
v_rcp_f32_e32 v94, v94 | |
v_cmp_u_f32_e32 vcc, v93, v93 | |
v_cndmask_b32_e32 v92, v92, v93, vcc | |
v_mul_f32_e32 v90, v92, v90 | |
v_mov_b32_e32 v92, 0x40c00000 | |
v_cmp_gt_f32_e32 vcc, v92, v84 | |
v_mov_b32_e32 v92, 0x31800000 | |
v_mul_f32_e32 v90, v94, v90 | |
v_cmp_gt_f32_e64 s[10:11], v92, v84 | |
v_mad_f32 v84, -v91, v90, 1.0 | |
v_cndmask_b32_e32 v84, 1.0, v84, vcc | |
v_cndmask_b32_e64 v84, v84, v86, s[4:5] | |
v_and_b32_e32 v86, s51, v83 | |
v_or_b32_e32 v84, v86, v84 | |
v_mad_f32 v86, v102, v83, v83 | |
v_cndmask_b32_e64 v84, v84, v86, s[8:9] | |
v_mul_f32_e32 v86, 0x3f8375d4, v83 | |
v_mac_f32_e32 v86, 0x41000000, v83 | |
v_mul_f32_e32 v86, 0x3e000000, v86 | |
v_cndmask_b32_e64 v84, v84, v86, s[10:11] | |
v_cmp_u_f32_e32 vcc, v83, v83 | |
v_cndmask_b32_e32 v83, v84, v83, vcc | |
v_subrev_f32_e32 v83, v83, v88 | |
v_mul_f32_e64 v84, s19, -v88 | |
v_mac_f32_e32 v84, v83, v89 | |
v_mac_f32_e32 v5, v84, v87 | |
v_mac_f32_e32 v23, v0, v67 | |
BB6_38: ; %Flow1248 | |
; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[30:31] | |
BB6_39: ; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[12:13] | |
v_lshrrev_b32_e32 v67, 6, v69 | |
v_and_b32_e32 v67, 1, v67 | |
v_cmp_eq_u32_e32 vcc, 1, v67 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[12:13], exec, s[4:5] | |
; mask branch BB6_43 | |
s_cbranch_execz BB6_43 | |
BB6_40: ; in Loop: Header=BB6_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[86:89], v55 offset0:96 offset1:97 | |
s_and_b64 s[4:5], exec, s[0:1] | |
v_cmp_ne_u32_e32 vcc, v59, v77 | |
s_xor_b64 s[4:5], s[4:5], -1 | |
s_or_b64 s[4:5], s[4:5], vcc | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v80, v73, v87 | |
s_and_b64 s[8:9], exec, s[2:3] | |
s_or_b64 s[4:5], s[8:9], s[4:5] | |
v_subrev_f32_e32 v79, v72, v86 | |
v_mul_f32_e32 v83, v80, v80 | |
v_cndmask_b32_e64 v84, 0, 1.0, s[4:5] | |
v_subrev_f32_e32 v82, v74, v88 | |
v_mac_f32_e32 v83, v79, v79 | |
v_mac_f32_e32 v83, v82, v82 | |
v_mul_f32_e32 v84, s22, v84 | |
v_subrev_f32_e32 v67, v75, v89 | |
v_cmp_lt_f32_e32 vcc, v83, v84 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[30:31], exec, s[4:5] | |
; mask branch BB6_42 | |
s_cbranch_execz BB6_42 | |
BB6_41: ; in Loop: Header=BB6_11 Depth=1 | |
v_max_f32_e32 v86, 0x34cd15ae, v83 | |
v_mul_f32_e32 v87, v75, v89 | |
v_rsq_f32_e32 v89, v86 | |
v_lshrrev_b32_e32 v83, 6, v65 | |
v_and_b32_e32 v83, 1, v83 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e32 vcc, 1, v83 | |
ds_read_b64 v[83:84], v56 offset:384 | |
v_mul_f32_e32 v93, v89, v89 | |
v_cndmask_b32_e64 v88, 0, 1.0, vcc | |
v_mul_f32_e32 v94, v93, v93 | |
v_mul_f32_e32 v94, v88, v94 | |
v_mul_f32_e32 v95, v93, v94 | |
v_mad_f32 v94, v94, v93, s23 | |
v_mul_f32_e32 v90, v9, v86 | |
v_mul_f32_e32 v91, v90, v90 | |
v_mov_b32_e32 v92, 0x3a92b707 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v83, v70, v83 | |
v_mul_f32_e32 v84, v71, v84 | |
v_mul_f32_e32 v94, 0xbe2aaaab, v94 | |
v_madak_f32_e32 v92, v92, v91, 0x3ded3cb2 | |
v_mad_f32 v96, v84, v95, -v83 | |
v_mul_f32_e32 v97, v93, v95 | |
v_mad_f32 v95, v95, v95, s26 | |
v_mul_f32_e32 v83, v83, v94 | |
v_mov_b32_e32 v94, 0x3c739487 | |
v_mul_f32_e32 v84, v95, v84 | |
v_madak_f32_e32 v94, v94, v91, 0x3f01e2bc | |
v_mad_f32 v92, v92, v91, 1.0 | |
v_mac_f32_e32 v83, 0x3daaaaaa, v84 | |
v_mac_f32_e32 v92, v90, v94 | |
v_mov_b32_e32 v94, 0xb2951928 | |
v_mac_f32_e32 v8, v88, v83 | |
v_rcp_f32_e32 v83, v92 | |
v_madak_f32_e32 v94, v94, v91, 0xb85ffb93 | |
v_mov_b32_e32 v95, 0x35c55945 | |
v_madak_f32_e32 v95, v95, v91, 0x3a83ca0c | |
v_madak_f32_e32 v94, v94, v91, 0xbc9ded90 | |
v_madak_f32_e32 v95, v95, v91, 0x3d8eaf3b | |
v_madak_f32_e32 v91, v94, v91, 0xbf409397 | |
v_mac_f32_e32 v91, v90, v95 | |
v_mul_f32_e32 v83, v37, v83 | |
v_mul_f32_e32 v84, v88, v93 | |
v_mul_f32_e32 v83, v91, v83 | |
v_mul_f32_e32 v96, v96, v97 | |
v_mac_f32_e32 v83, v89, v84 | |
v_mac_f32_e32 v96, v83, v87 | |
v_mul_f32_e32 v83, s18, v86 | |
v_mul_f32_e32 v83, v89, v83 | |
v_and_b32_e32 v84, s27, v83 | |
v_mov_b32_e32 v86, 0x3fa00000 | |
v_cmp_gt_f32_e64 s[4:5], v86, v84 | |
v_mul_f32_e32 v86, v84, v84 | |
v_rcp_f32_e32 v90, v86 | |
v_add_f32_e32 v91, -1.0, v84 | |
v_mov_b32_e32 v93, 0xbd777f97 | |
v_mov_b32_e32 v94, 0x4036db6e | |
v_cndmask_b32_e64 v90, v90, v91, s[4:5] | |
v_mov_b32_e32 v91, 0x3f580000 | |
v_cmp_gt_f32_e64 s[8:9], v91, v84 | |
v_cndmask_b32_e64 v86, v90, v86, s[8:9] | |
v_mov_b32_e32 v91, 0xc1b38712 | |
v_madak_f32_e32 v93, v93, v86, 0x40d23f7c | |
v_madak_f32_e32 v91, v91, v86, 0x43ed43a7 | |
v_madak_f32_e32 v93, v86, v93, 0x42d9451f | |
v_madak_f32_e32 v91, v86, v91, 0x451f90ce | |
v_madak_f32_e32 v93, v86, v93, 0x43d6810b | |
v_madak_f32_e32 v91, v86, v91, 0x4547fdbb | |
v_madak_f32_e32 v93, v86, v93, 0x442158c9 | |
v_madak_f32_e32 v91, v86, v91, 0x44c01759 | |
v_madak_f32_e32 v93, v86, v93, 0x43d9486f | |
v_madak_f32_e32 v91, v86, v91, 0x43a2e571 | |
v_madak_f32_e32 v93, v86, v93, 0x4309a863 | |
v_cmp_gt_f32_e32 vcc, v94, v84 | |
v_madak_f32_e32 v91, v86, v91, 0x41f2b459 | |
v_madak_f32_e32 v93, v86, v93, 0x419d35ce | |
v_cndmask_b32_e32 v91, v91, v93, vcc | |
v_mov_b32_e32 v93, 0x3c445aa3 | |
v_madak_f32_e32 v93, v93, v86, 0x3c5f6e13 | |
v_madak_f32_e32 v93, v86, v93, 0x3e013307 | |
v_madak_f32_e32 v93, v86, v93, 0x3d931ae7 | |
v_madak_f32_e32 v93, v86, v93, 0x3f0a5785 | |
v_mov_b32_e32 v92, 0xc11d077e | |
v_madak_f32_e32 v93, v86, v93, 0x3dd9f331 | |
v_mov_b32_e32 v90, 0xc3f1c275 | |
v_madak_f32_e32 v92, v92, v86, 0xc2a2932b | |
v_cndmask_b32_e64 v91, v91, v93, s[4:5] | |
v_mov_b32_e32 v93, 0xb684e21a | |
v_madak_f32_e32 v90, v90, v86, 0xc480230b | |
v_madak_f32_e32 v92, v86, v92, 0xc3389ae7 | |
v_madak_f32_e32 v93, v93, v86, 0x390aee49 | |
v_madak_f32_e32 v90, v86, v90, 0xc41f6441 | |
v_madak_f32_e32 v92, v86, v92, 0xc322658c | |
v_madak_f32_e32 v93, v86, v93, 0x3ba68116 | |
v_madak_f32_e32 v90, v86, v90, 0xc320a2ea | |
v_madak_f32_e32 v92, v86, v92, 0xc2798057 | |
v_madak_f32_e32 v93, v86, v93, 0x3d852a63 | |
v_madak_f32_e32 v90, v86, v90, 0xc18e104b | |
v_madak_f32_e32 v92, v86, v92, 0xc128f022 | |
v_madak_f32_e32 v93, v86, v93, 0x3ecbbbce | |
v_madak_f32_e32 v90, v86, v90, 0xbf4c9dd4 | |
v_madak_f32_e32 v92, v86, v92, 0xbf31a0b7 | |
v_cndmask_b32_e64 v91, v91, v93, s[8:9] | |
v_and_b32_e32 v93, s50, v83 | |
v_mov_b32_e32 v94, 0xbf100000 | |
v_madak_f32_e32 v90, v86, v90, 0xbc21a092 | |
v_madak_f32_e32 v92, v86, v92, 0xbc21a093 | |
v_mad_f32 v94, v93, -v93, v94 | |
v_cndmask_b32_e32 v90, v90, v92, vcc | |
v_cmp_gt_f32_e32 vcc, 0, v94 | |
v_cndmask_b32_e64 v95, 0.5, -0.5, vcc | |
v_mov_b32_e32 v97, 0x3fb8aa3b | |
v_mac_f32_e32 v95, v97, v94 | |
v_cvt_i32_f32_e32 v95, v95 | |
v_mov_b32_e32 v92, 0xbb0df9c0 | |
v_madak_f32_e32 v92, v92, v86, 0x3d1151b3 | |
v_madak_f32_e32 v92, v86, v92, 0xbde31cc2 | |
v_cvt_f32_i32_e32 v100, v95 | |
v_madak_f32_e32 v92, v86, v92, 0x3ea2fe54 | |
v_mov_b32_e32 v101, 0xbf317180 | |
v_madak_f32_e32 v92, v86, v92, 0xbebe9208 | |
v_mad_f32 v102, v101, v100, v94 | |
v_mov_b32_e32 v103, 0xb717f7d1 | |
v_madak_f32_e32 v92, v86, v92, 0x3ed46805 | |
v_mad_f32 v104, v103, v100, v102 | |
v_madak_f32_e32 v92, v86, v92, 0xbb1acdc6 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mov_b32_e32 v106, 0xb5ddea0e | |
v_mov_b32_e32 v107, 0x3331bb4c | |
v_cndmask_b32_e64 v90, v90, v92, s[4:5] | |
v_mov_b32_e32 v92, 0xb7c756b1 | |
v_mad_f32 v108, v107, v105, v106 | |
v_mov_b32_e32 v109, 0x388ab355 | |
v_madak_f32_e32 v92, v92, v86, 0xbbbd1489 | |
v_mad_f32 v108, v108, v105, v109 | |
v_mov_b32_e32 v110, 0xbb360b61 | |
v_madak_f32_e32 v92, v86, v92, 0xbce9528f | |
v_mad_f32 v108, v108, v105, v110 | |
v_mov_b32_e32 v111, 0x3e2aaaab | |
v_madak_f32_e32 v92, v86, v92, 0xbea66beb | |
v_mad_f32 v108, v108, v105, v111 | |
v_madak_f32_e32 v92, v86, v92, 0x3e0375d4 | |
v_mad_f32 v105, -v105, v108, v104 | |
v_mad_f32 v86, v86, v91, 1.0 | |
v_mov_b32_e32 v91, 0x6f800000 | |
v_cmp_gt_f32_e64 vcc, |v86|, v91 | |
v_mov_b32_e32 v98, 0x2f800000 | |
v_sub_f32_e32 v108, 2.0, v105 | |
v_cndmask_b32_e32 v99, 1.0, v98, vcc | |
v_cmp_gt_f32_e64 vcc, |v108|, v91 | |
v_mul_f32_e32 v86, v99, v86 | |
v_cndmask_b32_e32 v112, 1.0, v98, vcc | |
v_mul_f32_e64 v108, v108, -v112 | |
v_rcp_f32_e32 v86, v86 | |
v_rcp_f32_e32 v108, v108 | |
v_cndmask_b32_e64 v90, v90, v92, s[8:9] | |
v_lshlrev_b32_e32 v92, 23, v95 | |
v_mul_f32_e32 v86, v86, v90 | |
v_mul_f32_e32 v90, v105, v104 | |
v_mul_f32_e32 v90, v108, v90 | |
v_mul_f32_e32 v90, v90, v112 | |
v_mad_f32 v90, -v100, v103, v90 | |
v_subrev_f32_e32 v90, v102, v90 | |
v_sub_f32_e32 v90, 1.0, v90 | |
v_add_i32_e32 v90, vcc, v90, v92 | |
v_mov_b32_e32 v92, 0xc2aeac4f | |
v_cmp_nlt_f32_e32 vcc, v94, v92 | |
v_mov_b32_e32 v95, 0x42b17218 | |
v_cndmask_b32_e32 v90, 0, v90, vcc | |
v_cmp_lt_f32_e32 vcc, v94, v95 | |
v_mov_b32_e32 v100, 0x7f800000 | |
v_cndmask_b32_e32 v90, v100, v90, vcc | |
v_cmp_u_f32_e32 vcc, v94, v94 | |
v_cndmask_b32_e32 v90, v90, v94, vcc | |
v_subrev_f32_e32 v94, v84, v93 | |
v_mul_f32_e32 v102, v86, v99 | |
v_add_f32_e32 v93, v84, v93 | |
v_mad_f32 v93, v93, v94, v102 | |
v_cmp_gt_f32_e32 vcc, 0, v93 | |
v_cndmask_b32_e64 v94, 0.5, -0.5, vcc | |
v_mac_f32_e32 v94, v97, v93 | |
v_cvt_i32_f32_e32 v94, v94 | |
v_madak_f32_e32 v86, v99, v86, 0x3f58560b | |
v_mad_f32 v81, -v96, v82, v81 | |
v_mad_f32 v18, v82, v96, v18 | |
v_cvt_f32_i32_e32 v97, v94 | |
v_lshlrev_b32_e32 v94, 23, v94 | |
v_mad_f32 v78, -v96, v80, v78 | |
v_mad_f32 v17, v80, v96, v17 | |
v_mad_f32 v101, v101, v97, v93 | |
v_mad_f32 v104, v103, v97, v101 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mac_f32_e32 v106, v107, v105 | |
v_mac_f32_e32 v109, v106, v105 | |
v_mac_f32_e32 v110, v109, v105 | |
v_mac_f32_e32 v111, v110, v105 | |
v_mad_f32 v105, -v105, v111, v104 | |
v_mul_f32_e32 v104, v105, v104 | |
v_sub_f32_e32 v105, 2.0, v105 | |
v_cmp_gt_f32_e64 vcc, |v105|, v91 | |
v_cndmask_b32_e32 v106, 1.0, v98, vcc | |
v_mul_f32_e64 v105, v105, -v106 | |
v_rcp_f32_e32 v105, v105 | |
v_mad_f32 v76, -v96, v79, v76 | |
v_mad_f32 v16, v79, v96, v16 | |
v_mad_f32 v85, -v67, v0, v85 | |
v_mul_f32_e32 v104, v105, v104 | |
v_mul_f32_e32 v104, v104, v106 | |
v_mad_f32 v97, -v97, v103, v104 | |
v_subrev_f32_e32 v97, v101, v97 | |
v_sub_f32_e32 v97, 1.0, v97 | |
v_add_i32_e32 v94, vcc, v97, v94 | |
v_cmp_nlt_f32_e32 vcc, v93, v92 | |
v_cndmask_b32_e32 v92, 0, v94, vcc | |
v_cmp_lt_f32_e32 vcc, v93, v95 | |
v_cndmask_b32_e32 v92, v100, v92, vcc | |
v_cmp_gt_f32_e64 vcc, |v84|, v91 | |
v_cndmask_b32_e32 v91, 1.0, v98, vcc | |
v_mul_f32_e32 v94, v91, v84 | |
v_rcp_f32_e32 v94, v94 | |
v_cmp_u_f32_e32 vcc, v93, v93 | |
v_cndmask_b32_e32 v92, v92, v93, vcc | |
v_mul_f32_e32 v90, v92, v90 | |
v_mov_b32_e32 v92, 0x40c00000 | |
v_cmp_gt_f32_e32 vcc, v92, v84 | |
v_mov_b32_e32 v92, 0x31800000 | |
v_mul_f32_e32 v90, v94, v90 | |
v_cmp_gt_f32_e64 s[10:11], v92, v84 | |
v_mad_f32 v84, -v91, v90, 1.0 | |
v_cndmask_b32_e32 v84, 1.0, v84, vcc | |
v_cndmask_b32_e64 v84, v84, v86, s[4:5] | |
v_and_b32_e32 v86, s51, v83 | |
v_or_b32_e32 v84, v86, v84 | |
v_mad_f32 v86, v102, v83, v83 | |
v_cndmask_b32_e64 v84, v84, v86, s[8:9] | |
v_mul_f32_e32 v86, 0x3f8375d4, v83 | |
v_mac_f32_e32 v86, 0x41000000, v83 | |
v_mul_f32_e32 v86, 0x3e000000, v86 | |
v_cndmask_b32_e64 v84, v84, v86, s[10:11] | |
v_cmp_u_f32_e32 vcc, v83, v83 | |
v_cndmask_b32_e32 v83, v84, v83, vcc | |
v_subrev_f32_e32 v83, v83, v88 | |
v_mul_f32_e64 v84, s19, -v88 | |
v_mac_f32_e32 v84, v83, v89 | |
v_mac_f32_e32 v5, v84, v87 | |
v_mac_f32_e32 v19, v0, v67 | |
BB6_42: ; %Flow1247 | |
; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[30:31] | |
BB6_43: ; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[12:13] | |
v_lshrrev_b32_e32 v67, 7, v69 | |
v_and_b32_e32 v67, 1, v67 | |
v_cmp_eq_u32_e32 vcc, 1, v67 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[30:31], exec, s[4:5] | |
; mask branch BB6_47 | |
s_cbranch_execz BB6_47 | |
BB6_44: ; in Loop: Header=BB6_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[86:89], v55 offset0:112 offset1:113 | |
s_and_b64 s[4:5], exec, s[0:1] | |
v_cmp_ne_u32_e32 vcc, v38, v77 | |
s_xor_b64 s[4:5], s[4:5], -1 | |
s_or_b64 s[4:5], s[4:5], vcc | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v73, v73, v87 | |
s_and_b64 s[8:9], exec, s[2:3] | |
s_or_b64 s[4:5], s[8:9], s[4:5] | |
v_subrev_f32_e32 v72, v72, v86 | |
v_mul_f32_e32 v77, v73, v73 | |
v_cndmask_b32_e64 v79, 0, 1.0, s[4:5] | |
v_subrev_f32_e32 v74, v74, v88 | |
v_mac_f32_e32 v77, v72, v72 | |
v_mac_f32_e32 v77, v74, v74 | |
v_mul_f32_e32 v79, s22, v79 | |
v_subrev_f32_e32 v67, v75, v89 | |
v_cmp_lt_f32_e32 vcc, v77, v79 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[34:35], exec, s[4:5] | |
; mask branch BB6_46 | |
s_cbranch_execz BB6_46 | |
BB6_45: ; in Loop: Header=BB6_11 Depth=1 | |
v_lshrrev_b32_e32 v79, 7, v65 | |
v_and_b32_e32 v79, 1, v79 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e32 vcc, 1, v79 | |
ds_read_b64 v[79:80], v56 offset:448 | |
v_max_f32_e32 v77, 0x34cd15ae, v77 | |
v_mul_f32_e32 v84, v9, v77 | |
v_mad_f32 v83, -v67, v0, v85 | |
v_mov_b32_e32 v85, 0x3c739487 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v70, v70, v79 | |
v_mul_f32_e32 v71, v71, v80 | |
v_mul_f32_e32 v79, v84, v84 | |
v_mov_b32_e32 v80, 0x3a92b707 | |
v_madak_f32_e32 v80, v80, v79, 0x3ded3cb2 | |
v_rsq_f32_e32 v83, v77 | |
v_madak_f32_e32 v85, v85, v79, 0x3f01e2bc | |
v_mad_f32 v80, v80, v79, 1.0 | |
v_mac_f32_e32 v80, v84, v85 | |
v_mov_b32_e32 v85, 0xb2951928 | |
v_madak_f32_e32 v85, v85, v79, 0xb85ffb93 | |
v_mov_b32_e32 v86, 0x35c55945 | |
v_madak_f32_e32 v86, v86, v79, 0x3a83ca0c | |
v_madak_f32_e32 v85, v85, v79, 0xbc9ded90 | |
v_madak_f32_e32 v86, v86, v79, 0x3d8eaf3b | |
v_madak_f32_e32 v79, v85, v79, 0xbf409397 | |
v_mul_f32_e32 v85, v83, v83 | |
v_mac_f32_e32 v79, v84, v86 | |
v_cndmask_b32_e64 v82, 0, 1.0, vcc | |
v_mul_f32_e32 v84, v85, v85 | |
v_rcp_f32_e32 v80, v80 | |
v_mul_f32_e32 v84, v82, v84 | |
v_mul_f32_e32 v86, v85, v84 | |
v_mad_f32 v84, v84, v85, s23 | |
v_mad_f32 v87, v86, v86, s26 | |
v_mul_f32_e32 v84, 0xbe2aaaab, v84 | |
v_mul_f32_e32 v84, v70, v84 | |
v_mul_f32_e32 v87, v87, v71 | |
v_mul_f32_e32 v77, s18, v77 | |
v_mac_f32_e32 v84, 0x3daaaaaa, v87 | |
v_mul_f32_e32 v80, v37, v80 | |
v_mac_f32_e32 v8, v82, v84 | |
v_mul_f32_e32 v77, v83, v77 | |
v_mul_f32_e32 v79, v79, v80 | |
v_mul_f32_e32 v84, v82, v85 | |
v_mac_f32_e32 v79, v83, v84 | |
v_and_b32_e32 v80, s27, v77 | |
v_mov_b32_e32 v84, 0x3fa00000 | |
v_cmp_gt_f32_e64 s[4:5], v84, v80 | |
v_mul_f32_e32 v84, v80, v80 | |
v_rcp_f32_e32 v87, v84 | |
v_add_f32_e32 v88, -1.0, v80 | |
v_mul_f32_e32 v75, v75, v89 | |
v_mov_b32_e32 v89, 0xbd777f97 | |
v_cndmask_b32_e64 v87, v87, v88, s[4:5] | |
v_mov_b32_e32 v88, 0x3f580000 | |
v_cmp_gt_f32_e64 s[8:9], v88, v80 | |
v_cndmask_b32_e64 v84, v87, v84, s[8:9] | |
v_mov_b32_e32 v88, 0xc11d077e | |
v_mov_b32_e32 v87, 0x4036db6e | |
v_madak_f32_e32 v88, v88, v84, 0xc2a2932b | |
v_cmp_gt_f32_e32 vcc, v87, v80 | |
v_mov_b32_e32 v87, 0xc3f1c275 | |
v_madak_f32_e32 v87, v87, v84, 0xc480230b | |
v_madak_f32_e32 v88, v84, v88, 0xc3389ae7 | |
v_madak_f32_e32 v87, v84, v87, 0xc41f6441 | |
v_madak_f32_e32 v88, v84, v88, 0xc322658c | |
v_madak_f32_e32 v87, v84, v87, 0xc320a2ea | |
v_madak_f32_e32 v88, v84, v88, 0xc2798057 | |
v_madak_f32_e32 v87, v84, v87, 0xc18e104b | |
v_madak_f32_e32 v88, v84, v88, 0xc128f022 | |
v_madak_f32_e32 v87, v84, v87, 0xbf4c9dd4 | |
v_madak_f32_e32 v88, v84, v88, 0xbf31a0b7 | |
v_madak_f32_e32 v87, v84, v87, 0xbc21a092 | |
v_madak_f32_e32 v88, v84, v88, 0xbc21a093 | |
v_madak_f32_e32 v89, v89, v84, 0x40d23f7c | |
v_cndmask_b32_e32 v87, v87, v88, vcc | |
v_mov_b32_e32 v88, 0xc1b38712 | |
v_madak_f32_e32 v88, v88, v84, 0x43ed43a7 | |
v_madak_f32_e32 v89, v84, v89, 0x42d9451f | |
v_madak_f32_e32 v88, v84, v88, 0x451f90ce | |
v_madak_f32_e32 v89, v84, v89, 0x43d6810b | |
v_madak_f32_e32 v88, v84, v88, 0x4547fdbb | |
v_madak_f32_e32 v89, v84, v89, 0x442158c9 | |
v_madak_f32_e32 v88, v84, v88, 0x44c01759 | |
v_madak_f32_e32 v89, v84, v89, 0x43d9486f | |
v_madak_f32_e32 v88, v84, v88, 0x43a2e571 | |
v_madak_f32_e32 v89, v84, v89, 0x4309a863 | |
v_madak_f32_e32 v88, v84, v88, 0x41f2b459 | |
v_madak_f32_e32 v89, v84, v89, 0x419d35ce | |
v_cndmask_b32_e32 v88, v88, v89, vcc | |
v_mov_b32_e32 v89, 0xbb0df9c0 | |
v_madak_f32_e32 v89, v89, v84, 0x3d1151b3 | |
v_madak_f32_e32 v89, v84, v89, 0xbde31cc2 | |
v_madak_f32_e32 v89, v84, v89, 0x3ea2fe54 | |
v_madak_f32_e32 v89, v84, v89, 0xbebe9208 | |
v_madak_f32_e32 v89, v84, v89, 0x3ed46805 | |
v_madak_f32_e32 v89, v84, v89, 0xbb1acdc6 | |
v_cndmask_b32_e64 v87, v87, v89, s[4:5] | |
v_mov_b32_e32 v89, 0x3c445aa3 | |
v_madak_f32_e32 v89, v89, v84, 0x3c5f6e13 | |
v_madak_f32_e32 v89, v84, v89, 0x3e013307 | |
v_madak_f32_e32 v89, v84, v89, 0x3d931ae7 | |
v_madak_f32_e32 v89, v84, v89, 0x3f0a5785 | |
v_madak_f32_e32 v89, v84, v89, 0x3dd9f331 | |
v_cndmask_b32_e64 v88, v88, v89, s[4:5] | |
v_mov_b32_e32 v89, 0xb684e21a | |
v_madak_f32_e32 v89, v89, v84, 0x390aee49 | |
v_madak_f32_e32 v89, v84, v89, 0x3ba68116 | |
v_madak_f32_e32 v89, v84, v89, 0x3d852a63 | |
v_madak_f32_e32 v89, v84, v89, 0x3ecbbbce | |
v_cndmask_b32_e64 v88, v88, v89, s[8:9] | |
v_mad_f32 v88, v84, v88, 1.0 | |
v_mov_b32_e32 v90, 0x6f800000 | |
v_and_b32_e32 v93, s50, v77 | |
v_mov_b32_e32 v94, 0xbf100000 | |
v_cmp_gt_f32_e64 vcc, |v88|, v90 | |
v_mov_b32_e32 v91, 0x2f800000 | |
v_mad_f32 v94, v93, -v93, v94 | |
v_cndmask_b32_e32 v92, 1.0, v91, vcc | |
v_cmp_gt_f32_e32 vcc, 0, v94 | |
v_cndmask_b32_e64 v95, 0.5, -0.5, vcc | |
v_mov_b32_e32 v96, 0x3fb8aa3b | |
v_mac_f32_e32 v95, v96, v94 | |
v_mov_b32_e32 v89, 0xb7c756b1 | |
v_cvt_i32_f32_e32 v95, v95 | |
v_madak_f32_e32 v89, v89, v84, 0xbbbd1489 | |
v_madak_f32_e32 v89, v84, v89, 0xbce9528f | |
v_madak_f32_e32 v89, v84, v89, 0xbea66beb | |
v_mul_f32_e32 v88, v92, v88 | |
v_madak_f32_e32 v84, v84, v89, 0x3e0375d4 | |
v_rcp_f32_e32 v88, v88 | |
v_cvt_f32_i32_e32 v89, v95 | |
v_cndmask_b32_e64 v84, v87, v84, s[8:9] | |
v_mov_b32_e32 v87, 0xbf317180 | |
v_mul_f32_e32 v84, v88, v84 | |
v_mad_f32 v88, v87, v89, v94 | |
v_mov_b32_e32 v97, 0xb717f7d1 | |
v_mad_f32 v98, v97, v89, v88 | |
v_mul_f32_e32 v99, v98, v98 | |
v_mov_b32_e32 v100, 0xb5ddea0e | |
v_mov_b32_e32 v101, 0x3331bb4c | |
v_mad_f32 v102, v101, v99, v100 | |
v_mov_b32_e32 v103, 0x388ab355 | |
v_mad_f32 v102, v102, v99, v103 | |
v_mov_b32_e32 v104, 0xbb360b61 | |
v_mad_f32 v102, v102, v99, v104 | |
v_mov_b32_e32 v105, 0x3e2aaaab | |
v_mad_f32 v102, v102, v99, v105 | |
v_mad_f32 v99, -v99, v102, v98 | |
v_mul_f32_e32 v98, v99, v98 | |
v_sub_f32_e32 v99, 2.0, v99 | |
v_cmp_gt_f32_e64 vcc, |v99|, v90 | |
v_cndmask_b32_e32 v102, 1.0, v91, vcc | |
v_mul_f32_e64 v99, v99, -v102 | |
v_rcp_f32_e32 v99, v99 | |
v_mad_f32 v70, v71, v86, -v70 | |
v_mul_f32_e32 v71, v85, v86 | |
v_mul_f32_e32 v70, v70, v71 | |
v_mul_f32_e32 v98, v99, v98 | |
v_mul_f32_e32 v98, v98, v102 | |
v_mad_f32 v89, -v89, v97, v98 | |
v_subrev_f32_e32 v88, v88, v89 | |
v_lshlrev_b32_e32 v89, 23, v95 | |
v_sub_f32_e32 v88, 1.0, v88 | |
v_add_i32_e32 v88, vcc, v88, v89 | |
v_mov_b32_e32 v89, 0xc2aeac4f | |
v_cmp_nlt_f32_e32 vcc, v94, v89 | |
v_mov_b32_e32 v95, 0x42b17218 | |
v_cndmask_b32_e32 v88, 0, v88, vcc | |
v_cmp_lt_f32_e32 vcc, v94, v95 | |
v_mov_b32_e32 v98, 0x7f800000 | |
v_cndmask_b32_e32 v88, v98, v88, vcc | |
v_cmp_u_f32_e32 vcc, v94, v94 | |
v_cndmask_b32_e32 v88, v88, v94, vcc | |
v_subrev_f32_e32 v94, v80, v93 | |
v_mul_f32_e32 v99, v84, v92 | |
v_add_f32_e32 v93, v80, v93 | |
v_mad_f32 v93, v93, v94, v99 | |
v_cmp_gt_f32_e32 vcc, 0, v93 | |
v_cndmask_b32_e64 v94, 0.5, -0.5, vcc | |
v_mac_f32_e32 v94, v96, v93 | |
v_cvt_i32_f32_e32 v94, v94 | |
v_madak_f32_e32 v84, v92, v84, 0x3f58560b | |
v_mac_f32_e32 v70, v79, v75 | |
v_mad_f32 v81, -v70, v74, v81 | |
v_cvt_f32_i32_e32 v96, v94 | |
v_lshlrev_b32_e32 v94, 23, v94 | |
v_mad_f32 v15, v74, v70, v15 | |
v_mad_f32 v78, -v70, v73, v78 | |
v_mad_f32 v87, v87, v96, v93 | |
v_mad_f32 v102, v97, v96, v87 | |
v_mul_f32_e32 v106, v102, v102 | |
v_mac_f32_e32 v100, v101, v106 | |
v_mac_f32_e32 v103, v100, v106 | |
v_mac_f32_e32 v104, v103, v106 | |
v_mac_f32_e32 v105, v104, v106 | |
v_mad_f32 v100, -v106, v105, v102 | |
v_mul_f32_e32 v101, v100, v102 | |
v_sub_f32_e32 v100, 2.0, v100 | |
v_cmp_gt_f32_e64 vcc, |v100|, v90 | |
v_cndmask_b32_e32 v102, 1.0, v91, vcc | |
v_mul_f32_e64 v100, v100, -v102 | |
v_rcp_f32_e32 v100, v100 | |
v_mad_f32 v14, v73, v70, v14 | |
v_mad_f32 v76, -v70, v72, v76 | |
v_mad_f32 v13, v72, v70, v13 | |
v_mul_f32_e32 v100, v100, v101 | |
v_mul_f32_e32 v100, v100, v102 | |
v_mad_f32 v96, -v96, v97, v100 | |
v_subrev_f32_e32 v87, v87, v96 | |
v_sub_f32_e32 v87, 1.0, v87 | |
v_add_i32_e32 v87, vcc, v87, v94 | |
v_cmp_nlt_f32_e32 vcc, v93, v89 | |
v_cndmask_b32_e32 v87, 0, v87, vcc | |
v_cmp_lt_f32_e32 vcc, v93, v95 | |
v_cndmask_b32_e32 v87, v98, v87, vcc | |
v_cmp_u_f32_e32 vcc, v93, v93 | |
v_cndmask_b32_e32 v87, v87, v93, vcc | |
v_mul_f32_e32 v87, v87, v88 | |
v_mov_b32_e32 v88, 0x40c00000 | |
v_cmp_gt_f32_e64 s[10:11], v88, v80 | |
v_mov_b32_e32 v88, 0x31800000 | |
v_cmp_gt_f32_e64 vcc, |v80|, v90 | |
v_cmp_gt_f32_e64 s[12:13], v88, v80 | |
v_cndmask_b32_e32 v88, 1.0, v91, vcc | |
v_mul_f32_e32 v80, v88, v80 | |
v_rcp_f32_e32 v80, v80 | |
v_cmp_u_f32_e32 vcc, v77, v77 | |
v_mac_f32_e32 v68, v0, v67 | |
v_mul_f32_e32 v80, v80, v87 | |
v_mad_f32 v80, -v88, v80, 1.0 | |
v_cndmask_b32_e64 v80, 1.0, v80, s[10:11] | |
v_cndmask_b32_e64 v80, v80, v84, s[4:5] | |
v_and_b32_e32 v84, s51, v77 | |
v_or_b32_e32 v80, v84, v80 | |
v_mad_f32 v84, v99, v77, v77 | |
v_cndmask_b32_e64 v80, v80, v84, s[8:9] | |
v_mul_f32_e32 v84, 0x3f8375d4, v77 | |
v_mac_f32_e32 v84, 0x41000000, v77 | |
v_mul_f32_e32 v84, 0x3e000000, v84 | |
v_cndmask_b32_e64 v80, v80, v84, s[12:13] | |
v_cndmask_b32_e32 v77, v80, v77, vcc | |
v_subrev_f32_e32 v77, v77, v82 | |
v_mul_f32_e64 v80, s19, -v82 | |
v_mac_f32_e32 v80, v77, v83 | |
v_mac_f32_e32 v5, v80, v75 | |
BB6_46: ; %Flow1246 | |
; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[34:35] | |
BB6_47: ; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[30:31] | |
s_mov_b32 m0, -1 | |
v_cmp_gt_i32_e32 vcc, 3, v2 | |
ds_write_b32 v6, v76 | |
ds_write_b32 v7, v78 | |
ds_write_b32 v12, v81 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[4:5], exec, s[4:5] | |
s_waitcnt lgkmcnt(0) | |
; mask branch BB6_53 | |
s_cbranch_execz BB6_53 | |
BB6_48: ; in Loop: Header=BB6_11 Depth=1 | |
v_lshlrev_b32_e32 v70, 6, v2 | |
v_add_i32_e32 v67, vcc, v11, v70 | |
v_lshlrev_b32_e32 v67, 2, v67 | |
v_add_i32_e32 v71, vcc, s15, v67 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v67, v71 | |
v_add_i32_e32 v72, vcc, 8, v11 | |
v_or_b32_e32 v73, 1, v11 | |
v_cmp_lt_i32_e32 vcc, v73, v72 | |
s_and_saveexec_b64 s[8:9], vcc | |
s_xor_b64 s[8:9], exec, s[8:9] | |
s_waitcnt lgkmcnt(0) | |
; mask branch BB6_50 | |
s_cbranch_execz BB6_50 | |
BB6_49: ; in Loop: Header=BB6_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b32 v[72:73], v71 offset0:1 offset1:2 | |
v_or_b32_e32 v76, 3, v11 | |
v_add_i32_e32 v70, vcc, v76, v70 | |
v_lshlrev_b32_e32 v70, 2, v70 | |
ds_read2_b32 v[74:75], v71 offset0:3 offset1:4 | |
v_add_i32_e32 v70, vcc, s15, v70 | |
ds_read_b32 v77, v71 offset:28 | |
ds_read2_b32 v[70:71], v70 offset0:2 offset1:3 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v67, v67, v72 | |
v_add_f32_e32 v67, v73, v67 | |
v_add_f32_e32 v67, v74, v67 | |
v_add_f32_e32 v67, v75, v67 | |
v_add_f32_e32 v67, v70, v67 | |
v_add_f32_e32 v67, v71, v67 | |
v_add_f32_e32 v67, v77, v67 | |
BB6_50: ; %._crit_edge.i118 | |
; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[8:9] | |
v_mul_lo_i32 v66, v66, 3 | |
v_mov_b32_e32 v74, s29 | |
s_mov_b64 s[30:31], s[46:47] | |
s_mov_b64 s[8:9], 0 | |
v_add_i32_e32 v70, vcc, v66, v2 | |
v_ashrrev_i32_e32 v71, 31, v70 | |
v_lshl_b64 v[72:73], v[70:71], 2 | |
v_add_i32_e32 v70, vcc, s28, v72 | |
v_addc_u32_e32 v71, vcc, v73, v74, vcc | |
buffer_load_dword v73, v[72:73], s[28:31], 0 addr64 | |
s_waitcnt vmcnt(0) | |
BB6_51: ; Parent Loop BB6_11 Depth=1 | |
; => This Inner Loop Header: Depth=2 | |
v_add_f32_e32 v72, v67, v73 | |
v_mov_b32_e32 v75, v73 | |
v_mov_b32_e32 v74, v72 | |
buffer_atomic_cmpswap v[74:75], v[70:71], s[44:47], 0 addr64 glc | |
v_mov_b32_e32 v66, -1 | |
v_mov_b32_e32 v66, 0xf000 | |
s_waitcnt vmcnt(0) expcnt(0) | |
v_cmp_eq_u32_e32 vcc, v74, v73 | |
s_or_b64 s[8:9], vcc, s[8:9] | |
v_mov_b32_e32 v73, v74 | |
s_andn2_b64 exec, exec, s[8:9] | |
s_cbranch_execnz BB6_51 | |
; BB#52: ; %Flow1244 | |
; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[8:9] | |
BB6_53: ; %Flow1245 | |
; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[4:5] | |
BB6_54: ; %Flow1254 | |
; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[54:55] | |
v_and_b32_e32 v66, 0xff00, v69 | |
v_cmp_ne_u32_e32 vcc, 0, v66 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[30:31], exec, s[4:5] | |
; mask branch BB6_94 | |
s_cbranch_execz BB6_94 | |
BB6_55: ; in Loop: Header=BB6_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v77, v54 offset:4 | |
s_mov_b64 s[8:9], s[32:33] | |
s_mov_b64 s[10:11], s[46:47] | |
v_mov_b32_e32 v76, 0 | |
v_mov_b32_e32 v82, v76 | |
s_waitcnt lgkmcnt(0) | |
v_lshlrev_b32_e32 v66, 3, v77 | |
v_add_i32_e32 v66, vcc, v66, v1 | |
v_ashrrev_i32_e32 v67, 31, v66 | |
v_lshl_b64 v[70:71], v[66:67], 4 | |
v_lshl_b64 v[78:79], v[66:67], 3 | |
buffer_load_dwordx4 v[72:75], v[70:71], s[8:11], 0 addr64 | |
s_mov_b64 s[8:9], s[36:37] | |
buffer_load_dwordx2 v[70:71], v[78:79], s[8:11], 0 addr64 | |
v_lshrrev_b32_e32 v67, 8, v69 | |
v_and_b32_e32 v67, 1, v67 | |
v_cmp_eq_u32_e32 vcc, 1, v67 | |
v_mov_b32_e32 v79, v76 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[34:35], exec, s[4:5] | |
; implicit-def: %VGPR83_VGPR84_VGPR85_VGPR86 | |
s_waitcnt vmcnt(0) | |
; mask branch BB6_59 | |
s_cbranch_execz BB6_59 | |
BB6_56: ; in Loop: Header=BB6_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[87:90], v55 offset1:1 | |
s_and_b64 s[4:5], exec, s[0:1] | |
v_cmp_ne_u32_e32 vcc, v41, v77 | |
s_xor_b64 s[4:5], s[4:5], -1 | |
s_or_b64 s[4:5], s[4:5], vcc | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v80, v73, v88 | |
s_and_b64 s[8:9], exec, s[2:3] | |
s_or_b64 s[4:5], s[8:9], s[4:5] | |
v_subrev_f32_e32 v81, v72, v87 | |
v_mul_f32_e32 v83, v80, v80 | |
v_cndmask_b32_e64 v76, 0, 1.0, s[4:5] | |
v_subrev_f32_e32 v78, v74, v89 | |
v_mac_f32_e32 v83, v81, v81 | |
v_mac_f32_e32 v83, v78, v78 | |
v_mul_f32_e32 v76, s22, v76 | |
v_cmp_lt_f32_e32 vcc, v83, v76 | |
v_mov_b32_e32 v76, 0 | |
v_subrev_f32_e32 v67, v75, v90 | |
v_mov_b32_e32 v79, v76 | |
v_mov_b32_e32 v82, v76 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[38:39], exec, s[4:5] | |
; implicit-def: %VGPR83_VGPR84_VGPR85_VGPR86 | |
; mask branch BB6_58 | |
s_cbranch_execz BB6_58 | |
BB6_57: ; in Loop: Header=BB6_11 Depth=1 | |
v_max_f32_e32 v79, 0x34cd15ae, v83 | |
v_mul_f32_e32 v82, v9, v79 | |
v_mul_f32_e32 v83, v82, v82 | |
v_mov_b32_e32 v84, 0x3a92b707 | |
v_madak_f32_e32 v84, v84, v83, 0x3ded3cb2 | |
v_mov_b32_e32 v85, 0x3c739487 | |
v_madak_f32_e32 v85, v85, v83, 0x3f01e2bc | |
v_mad_f32 v84, v84, v83, 1.0 | |
v_mac_f32_e32 v84, v82, v85 | |
v_mov_b32_e32 v85, 0xb2951928 | |
v_madak_f32_e32 v85, v85, v83, 0xb85ffb93 | |
v_mov_b32_e32 v86, 0x35c55945 | |
v_madak_f32_e32 v86, v86, v83, 0x3a83ca0c | |
v_madak_f32_e32 v85, v85, v83, 0xbc9ded90 | |
v_madak_f32_e32 v86, v86, v83, 0x3d8eaf3b | |
v_madak_f32_e32 v85, v85, v83, 0xbf409397 | |
v_mac_f32_e32 v85, v82, v86 | |
v_rsq_f32_e32 v86, v79 | |
v_mul_f32_e32 v79, s18, v79 | |
v_mov_b32_e32 v88, 0x3fa00000 | |
v_mul_f32_e32 v76, v75, v90 | |
v_mul_f32_e32 v79, v86, v79 | |
v_and_b32_e32 v87, s27, v79 | |
v_cmp_gt_f32_e64 s[4:5], v88, v87 | |
v_mul_f32_e32 v88, v87, v87 | |
v_rcp_f32_e32 v89, v88 | |
v_add_f32_e32 v90, -1.0, v87 | |
v_mov_b32_e32 v91, 0xbd777f97 | |
v_lshrrev_b32_e32 v82, 8, v65 | |
v_cndmask_b32_e64 v89, v89, v90, s[4:5] | |
v_mov_b32_e32 v90, 0x3f580000 | |
v_cmp_gt_f32_e64 s[8:9], v90, v87 | |
v_cndmask_b32_e64 v88, v89, v88, s[8:9] | |
v_mov_b32_e32 v90, 0xc11d077e | |
v_mov_b32_e32 v89, 0x4036db6e | |
v_madak_f32_e32 v90, v90, v88, 0xc2a2932b | |
v_cmp_gt_f32_e64 s[10:11], v89, v87 | |
v_mov_b32_e32 v89, 0xc3f1c275 | |
v_madak_f32_e32 v89, v89, v88, 0xc480230b | |
v_madak_f32_e32 v90, v88, v90, 0xc3389ae7 | |
v_madak_f32_e32 v89, v88, v89, 0xc41f6441 | |
v_madak_f32_e32 v90, v88, v90, 0xc322658c | |
v_madak_f32_e32 v89, v88, v89, 0xc320a2ea | |
v_madak_f32_e32 v90, v88, v90, 0xc2798057 | |
v_madak_f32_e32 v89, v88, v89, 0xc18e104b | |
v_madak_f32_e32 v90, v88, v90, 0xc128f022 | |
v_madak_f32_e32 v89, v88, v89, 0xbf4c9dd4 | |
v_madak_f32_e32 v90, v88, v90, 0xbf31a0b7 | |
v_madak_f32_e32 v89, v88, v89, 0xbc21a092 | |
v_madak_f32_e32 v90, v88, v90, 0xbc21a093 | |
v_madak_f32_e32 v91, v91, v88, 0x40d23f7c | |
v_cndmask_b32_e64 v89, v89, v90, s[10:11] | |
v_mov_b32_e32 v90, 0xc1b38712 | |
v_madak_f32_e32 v90, v90, v88, 0x43ed43a7 | |
v_madak_f32_e32 v91, v88, v91, 0x42d9451f | |
v_madak_f32_e32 v90, v88, v90, 0x451f90ce | |
v_madak_f32_e32 v91, v88, v91, 0x43d6810b | |
v_madak_f32_e32 v90, v88, v90, 0x4547fdbb | |
v_madak_f32_e32 v91, v88, v91, 0x442158c9 | |
v_madak_f32_e32 v90, v88, v90, 0x44c01759 | |
v_madak_f32_e32 v91, v88, v91, 0x43d9486f | |
v_madak_f32_e32 v90, v88, v90, 0x43a2e571 | |
v_madak_f32_e32 v91, v88, v91, 0x4309a863 | |
v_madak_f32_e32 v90, v88, v90, 0x41f2b459 | |
v_madak_f32_e32 v91, v88, v91, 0x419d35ce | |
v_cndmask_b32_e64 v90, v90, v91, s[10:11] | |
v_mov_b32_e32 v91, 0xbb0df9c0 | |
v_madak_f32_e32 v91, v91, v88, 0x3d1151b3 | |
v_madak_f32_e32 v91, v88, v91, 0xbde31cc2 | |
v_madak_f32_e32 v91, v88, v91, 0x3ea2fe54 | |
v_madak_f32_e32 v91, v88, v91, 0xbebe9208 | |
v_madak_f32_e32 v91, v88, v91, 0x3ed46805 | |
v_madak_f32_e32 v91, v88, v91, 0xbb1acdc6 | |
v_cndmask_b32_e64 v89, v89, v91, s[4:5] | |
v_mov_b32_e32 v91, 0x3c445aa3 | |
v_madak_f32_e32 v91, v91, v88, 0x3c5f6e13 | |
v_madak_f32_e32 v91, v88, v91, 0x3e013307 | |
v_madak_f32_e32 v91, v88, v91, 0x3d931ae7 | |
v_madak_f32_e32 v91, v88, v91, 0x3f0a5785 | |
v_madak_f32_e32 v91, v88, v91, 0x3dd9f331 | |
v_cndmask_b32_e64 v90, v90, v91, s[4:5] | |
v_mov_b32_e32 v91, 0xb7c756b1 | |
v_madak_f32_e32 v91, v91, v88, 0xbbbd1489 | |
v_madak_f32_e32 v91, v88, v91, 0xbce9528f | |
v_madak_f32_e32 v91, v88, v91, 0xbea66beb | |
v_madak_f32_e32 v91, v88, v91, 0x3e0375d4 | |
v_cndmask_b32_e64 v89, v89, v91, s[8:9] | |
v_mov_b32_e32 v91, 0xb684e21a | |
v_madak_f32_e32 v91, v91, v88, 0x390aee49 | |
v_madak_f32_e32 v91, v88, v91, 0x3ba68116 | |
v_madak_f32_e32 v91, v88, v91, 0x3d852a63 | |
v_madak_f32_e32 v91, v88, v91, 0x3ecbbbce | |
v_cndmask_b32_e64 v90, v90, v91, s[8:9] | |
v_mad_f32 v88, v88, v90, 1.0 | |
v_mov_b32_e32 v90, 0x6f800000 | |
v_cmp_gt_f32_e64 s[10:11], |v88|, v90 | |
v_mov_b32_e32 v91, 0x2f800000 | |
v_cndmask_b32_e64 v92, 1.0, v91, s[10:11] | |
v_mul_f32_e32 v88, v92, v88 | |
v_rcp_f32_e32 v88, v88 | |
v_and_b32_e32 v82, 1, v82 | |
v_and_b32_e32 v97, s50, v79 | |
v_mov_b32_e32 v98, 0xbf100000 | |
v_cmp_eq_u32_e32 vcc, 1, v82 | |
v_mad_f32 v98, v97, -v97, v98 | |
v_mul_f32_e32 v88, v88, v89 | |
v_cndmask_b32_e64 v89, 0, 1.0, vcc | |
v_cmp_gt_f32_e32 vcc, 0, v98 | |
v_cndmask_b32_e64 v99, 0.5, -0.5, vcc | |
v_mov_b32_e32 v100, 0x3fb8aa3b | |
v_mac_f32_e32 v99, v100, v98 | |
v_cvt_i32_f32_e32 v99, v99 | |
v_mov_b32_e32 v103, 0xbf317180 | |
v_mov_b32_e32 v105, 0xb717f7d1 | |
v_subrev_f32_e32 v114, v87, v97 | |
v_cvt_f32_i32_e32 v101, v99 | |
v_mul_f32_e32 v102, v88, v92 | |
v_add_f32_e32 v97, v87, v97 | |
v_mad_f32 v97, v97, v114, v102 | |
v_mad_f32 v104, v103, v101, v98 | |
v_mad_f32 v106, v105, v101, v104 | |
v_mul_f32_e32 v107, v106, v106 | |
v_mov_b32_e32 v108, 0xb5ddea0e | |
v_mov_b32_e32 v109, 0x3331bb4c | |
v_cmp_gt_f32_e64 s[10:11], 0, v97 | |
v_mad_f32 v110, v109, v107, v108 | |
v_mov_b32_e32 v111, 0x388ab355 | |
v_cndmask_b32_e64 v114, 0.5, -0.5, s[10:11] | |
v_mad_f32 v110, v110, v107, v111 | |
v_mov_b32_e32 v112, 0xbb360b61 | |
v_mac_f32_e32 v114, v100, v97 | |
v_mad_f32 v110, v110, v107, v112 | |
v_mov_b32_e32 v113, 0x3e2aaaab | |
v_mad_f32 v110, v110, v107, v113 | |
v_cvt_i32_f32_e32 v100, v114 | |
v_mad_f32 v107, -v107, v110, v106 | |
v_sub_f32_e32 v110, 2.0, v107 | |
v_cmp_gt_f32_e64 vcc, |v110|, v90 | |
v_cvt_f32_i32_e32 v115, v100 | |
v_cndmask_b32_e32 v114, 1.0, v91, vcc | |
v_mul_f32_e64 v110, v110, -v114 | |
v_rcp_f32_e32 v110, v110 | |
v_mad_f32 v103, v103, v115, v97 | |
v_mul_f32_e32 v106, v107, v106 | |
v_mad_f32 v107, v105, v115, v103 | |
v_mul_f32_e32 v106, v110, v106 | |
v_mul_f32_e32 v110, v107, v107 | |
v_mac_f32_e32 v108, v109, v110 | |
v_mac_f32_e32 v111, v108, v110 | |
s_mov_b32 m0, -1 | |
v_mac_f32_e32 v112, v111, v110 | |
ds_read_b64 v[82:83], v56 | |
v_mac_f32_e32 v113, v112, v110 | |
v_rcp_f32_e32 v84, v84 | |
v_mul_f32_e32 v93, v86, v86 | |
v_mad_f32 v108, -v110, v113, v107 | |
v_mul_f32_e32 v94, v93, v93 | |
v_sub_f32_e32 v109, 2.0, v108 | |
v_mul_f32_e32 v94, v89, v94 | |
v_cmp_gt_f32_e64 vcc, |v109|, v90 | |
v_mul_f32_e32 v95, v93, v94 | |
v_mad_f32 v94, v94, v93, s23 | |
v_cndmask_b32_e32 v110, 1.0, v91, vcc | |
v_mul_f32_e64 v109, v109, -v110 | |
v_mul_f32_e32 v84, v37, v84 | |
v_mul_f32_e32 v106, v106, v114 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v82, v70, v82 | |
v_mul_f32_e32 v94, 0xbe2aaaab, v94 | |
v_mul_f32_e32 v83, v71, v83 | |
v_mad_f32 v96, v95, v95, s26 | |
v_mad_f32 v101, -v101, v105, v106 | |
v_mul_f32_e32 v84, v85, v84 | |
v_mul_f32_e32 v85, v89, v93 | |
v_mul_f32_e32 v96, v96, v83 | |
v_mul_f32_e32 v94, v82, v94 | |
v_rcp_f32_e32 v109, v109 | |
v_mac_f32_e32 v94, 0x3daaaaaa, v96 | |
v_mac_f32_e32 v84, v86, v85 | |
v_subrev_f32_e32 v85, v104, v101 | |
v_mac_f32_e32 v8, v89, v94 | |
v_lshlrev_b32_e32 v94, 23, v99 | |
v_sub_f32_e32 v85, 1.0, v85 | |
v_mul_f32_e32 v106, v108, v107 | |
v_add_i32_e32 v85, vcc, v85, v94 | |
v_mov_b32_e32 v94, 0xc2aeac4f | |
v_mul_f32_e32 v106, v109, v106 | |
v_cmp_nlt_f32_e32 vcc, v98, v94 | |
v_mov_b32_e32 v96, 0x42b17218 | |
v_mul_f32_e32 v106, v106, v110 | |
v_cndmask_b32_e32 v85, 0, v85, vcc | |
v_cmp_lt_f32_e32 vcc, v98, v96 | |
v_mov_b32_e32 v99, 0x7f800000 | |
v_cndmask_b32_e32 v85, v99, v85, vcc | |
v_cmp_u_f32_e32 vcc, v98, v98 | |
v_mad_f32 v105, -v115, v105, v106 | |
v_cndmask_b32_e32 v85, v85, v98, vcc | |
v_subrev_f32_e32 v98, v103, v105 | |
v_sub_f32_e32 v98, 1.0, v98 | |
v_lshlrev_b32_e32 v100, 23, v100 | |
v_add_i32_e32 v98, vcc, v98, v100 | |
v_cmp_nlt_f32_e32 vcc, v97, v94 | |
v_cndmask_b32_e32 v94, 0, v98, vcc | |
v_cmp_lt_f32_e32 vcc, v97, v96 | |
v_cndmask_b32_e32 v94, v99, v94, vcc | |
v_cmp_u_f32_e32 vcc, v97, v97 | |
v_cndmask_b32_e32 v94, v94, v97, vcc | |
v_cmp_gt_f32_e64 vcc, |v87|, v90 | |
v_mov_b32_e32 v90, 0x40c00000 | |
v_cmp_gt_f32_e64 s[10:11], v90, v87 | |
v_mov_b32_e32 v90, 0x31800000 | |
v_cmp_gt_f32_e64 s[12:13], v90, v87 | |
v_cndmask_b32_e32 v90, 1.0, v91, vcc | |
v_mul_f32_e32 v87, v90, v87 | |
v_rcp_f32_e32 v87, v87 | |
v_mul_f32_e32 v85, v94, v85 | |
v_cmp_u_f32_e32 vcc, v79, v79 | |
v_mac_f32_e32 v51, v0, v67 | |
v_mul_f32_e32 v85, v87, v85 | |
v_mad_f32 v85, -v90, v85, 1.0 | |
v_madak_f32_e32 v87, v92, v88, 0x3f58560b | |
v_cndmask_b32_e64 v85, 1.0, v85, s[10:11] | |
v_cndmask_b32_e64 v85, v85, v87, s[4:5] | |
v_and_b32_e32 v87, s51, v79 | |
v_or_b32_e32 v85, v87, v85 | |
v_mad_f32 v87, v102, v79, v79 | |
v_cndmask_b32_e64 v85, v85, v87, s[8:9] | |
v_mul_f32_e32 v87, 0x3f8375d4, v79 | |
v_mac_f32_e32 v87, 0x41000000, v79 | |
v_mul_f32_e32 v87, 0x3e000000, v87 | |
v_cndmask_b32_e64 v85, v85, v87, s[12:13] | |
v_cndmask_b32_e32 v79, v85, v79, vcc | |
v_subrev_f32_e32 v79, v79, v89 | |
v_mul_f32_e64 v85, s19, -v89 | |
v_mac_f32_e32 v85, v79, v86 | |
v_mad_f32 v79, v83, v95, -v82 | |
v_mul_f32_e32 v82, v93, v95 | |
v_mul_f32_e32 v83, v79, v82 | |
v_mac_f32_e32 v83, v84, v76 | |
v_mac_f32_e32 v5, v85, v76 | |
v_mad_f32 v50, v78, v83, v50 | |
v_mad_f32 v49, v80, v83, v49 | |
v_mad_f32 v48, v81, v83, v48 | |
v_mul_f32_e64 v82, v83, -v78 | |
v_mul_f32_e64 v79, v83, -v80 | |
v_mul_f32_e64 v76, v83, -v81 | |
v_mul_f32_e64 v86, v67, -v0 | |
BB6_58: ; %Flow1242 | |
; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[38:39] | |
BB6_59: ; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[34:35] | |
v_lshrrev_b32_e32 v67, 9, v69 | |
v_and_b32_e32 v67, 1, v67 | |
v_cmp_eq_u32_e32 vcc, 1, v67 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[12:13], exec, s[4:5] | |
; mask branch BB6_63 | |
s_cbranch_execz BB6_63 | |
BB6_60: ; in Loop: Header=BB6_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[87:90], v55 offset0:16 offset1:17 | |
s_and_b64 s[4:5], exec, s[0:1] | |
v_cmp_ne_u32_e32 vcc, v64, v77 | |
s_xor_b64 s[4:5], s[4:5], -1 | |
s_or_b64 s[4:5], s[4:5], vcc | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v80, v73, v88 | |
s_and_b64 s[8:9], exec, s[2:3] | |
s_or_b64 s[4:5], s[8:9], s[4:5] | |
v_subrev_f32_e32 v78, v72, v87 | |
v_mul_f32_e32 v83, v80, v80 | |
v_cndmask_b32_e64 v84, 0, 1.0, s[4:5] | |
v_subrev_f32_e32 v81, v74, v89 | |
v_mac_f32_e32 v83, v78, v78 | |
v_mac_f32_e32 v83, v81, v81 | |
v_mul_f32_e32 v84, s22, v84 | |
v_subrev_f32_e32 v67, v75, v90 | |
v_cmp_lt_f32_e32 vcc, v83, v84 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[34:35], exec, s[4:5] | |
; mask branch BB6_62 | |
s_cbranch_execz BB6_62 | |
BB6_61: ; in Loop: Header=BB6_11 Depth=1 | |
v_max_f32_e32 v85, 0x34cd15ae, v83 | |
v_rsq_f32_e32 v89, v85 | |
v_lshrrev_b32_e32 v83, 9, v65 | |
v_and_b32_e32 v83, 1, v83 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e32 vcc, 1, v83 | |
ds_read_b64 v[83:84], v56 offset:64 | |
v_mul_f32_e32 v93, v89, v89 | |
v_cndmask_b32_e64 v88, 0, 1.0, vcc | |
v_mul_f32_e32 v94, v93, v93 | |
v_mul_f32_e32 v94, v88, v94 | |
v_mul_f32_e32 v95, v93, v94 | |
v_mad_f32 v94, v94, v93, s23 | |
v_mul_f32_e32 v87, v75, v90 | |
v_mul_f32_e32 v90, v9, v85 | |
v_mul_f32_e32 v91, v90, v90 | |
v_mov_b32_e32 v92, 0x3a92b707 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v83, v70, v83 | |
v_mul_f32_e32 v84, v71, v84 | |
v_mul_f32_e32 v94, 0xbe2aaaab, v94 | |
v_madak_f32_e32 v92, v92, v91, 0x3ded3cb2 | |
v_mad_f32 v96, v84, v95, -v83 | |
v_mul_f32_e32 v97, v93, v95 | |
v_mad_f32 v95, v95, v95, s26 | |
v_mul_f32_e32 v83, v83, v94 | |
v_mov_b32_e32 v94, 0x3c739487 | |
v_mul_f32_e32 v84, v95, v84 | |
v_madak_f32_e32 v94, v94, v91, 0x3f01e2bc | |
v_mad_f32 v92, v92, v91, 1.0 | |
v_mac_f32_e32 v83, 0x3daaaaaa, v84 | |
v_mac_f32_e32 v92, v90, v94 | |
v_mov_b32_e32 v94, 0xb2951928 | |
v_mac_f32_e32 v8, v88, v83 | |
v_rcp_f32_e32 v83, v92 | |
v_madak_f32_e32 v94, v94, v91, 0xb85ffb93 | |
v_mov_b32_e32 v95, 0x35c55945 | |
v_madak_f32_e32 v95, v95, v91, 0x3a83ca0c | |
v_madak_f32_e32 v94, v94, v91, 0xbc9ded90 | |
v_madak_f32_e32 v95, v95, v91, 0x3d8eaf3b | |
v_madak_f32_e32 v91, v94, v91, 0xbf409397 | |
v_mac_f32_e32 v91, v90, v95 | |
v_mul_f32_e32 v83, v37, v83 | |
v_mul_f32_e32 v84, v88, v93 | |
v_mul_f32_e32 v83, v91, v83 | |
v_mul_f32_e32 v96, v96, v97 | |
v_mac_f32_e32 v83, v89, v84 | |
v_mac_f32_e32 v96, v83, v87 | |
v_mul_f32_e32 v83, s18, v85 | |
v_mul_f32_e32 v83, v89, v83 | |
v_and_b32_e32 v84, s27, v83 | |
v_mov_b32_e32 v85, 0x3fa00000 | |
v_cmp_gt_f32_e64 s[4:5], v85, v84 | |
v_mul_f32_e32 v85, v84, v84 | |
v_rcp_f32_e32 v90, v85 | |
v_add_f32_e32 v91, -1.0, v84 | |
v_mov_b32_e32 v93, 0xbd777f97 | |
v_mov_b32_e32 v94, 0x4036db6e | |
v_cndmask_b32_e64 v90, v90, v91, s[4:5] | |
v_mov_b32_e32 v91, 0x3f580000 | |
v_cmp_gt_f32_e64 s[8:9], v91, v84 | |
v_cndmask_b32_e64 v85, v90, v85, s[8:9] | |
v_mov_b32_e32 v91, 0xc1b38712 | |
v_madak_f32_e32 v93, v93, v85, 0x40d23f7c | |
v_madak_f32_e32 v91, v91, v85, 0x43ed43a7 | |
v_madak_f32_e32 v93, v85, v93, 0x42d9451f | |
v_madak_f32_e32 v91, v85, v91, 0x451f90ce | |
v_madak_f32_e32 v93, v85, v93, 0x43d6810b | |
v_madak_f32_e32 v91, v85, v91, 0x4547fdbb | |
v_madak_f32_e32 v93, v85, v93, 0x442158c9 | |
v_madak_f32_e32 v91, v85, v91, 0x44c01759 | |
v_madak_f32_e32 v93, v85, v93, 0x43d9486f | |
v_madak_f32_e32 v91, v85, v91, 0x43a2e571 | |
v_madak_f32_e32 v93, v85, v93, 0x4309a863 | |
v_cmp_gt_f32_e32 vcc, v94, v84 | |
v_madak_f32_e32 v91, v85, v91, 0x41f2b459 | |
v_madak_f32_e32 v93, v85, v93, 0x419d35ce | |
v_cndmask_b32_e32 v91, v91, v93, vcc | |
v_mov_b32_e32 v93, 0x3c445aa3 | |
v_madak_f32_e32 v93, v93, v85, 0x3c5f6e13 | |
v_madak_f32_e32 v93, v85, v93, 0x3e013307 | |
v_madak_f32_e32 v93, v85, v93, 0x3d931ae7 | |
v_madak_f32_e32 v93, v85, v93, 0x3f0a5785 | |
v_mov_b32_e32 v92, 0xc11d077e | |
v_madak_f32_e32 v93, v85, v93, 0x3dd9f331 | |
v_mov_b32_e32 v90, 0xc3f1c275 | |
v_madak_f32_e32 v92, v92, v85, 0xc2a2932b | |
v_cndmask_b32_e64 v91, v91, v93, s[4:5] | |
v_mov_b32_e32 v93, 0xb684e21a | |
v_madak_f32_e32 v90, v90, v85, 0xc480230b | |
v_madak_f32_e32 v92, v85, v92, 0xc3389ae7 | |
v_madak_f32_e32 v93, v93, v85, 0x390aee49 | |
v_madak_f32_e32 v90, v85, v90, 0xc41f6441 | |
v_madak_f32_e32 v92, v85, v92, 0xc322658c | |
v_madak_f32_e32 v93, v85, v93, 0x3ba68116 | |
v_madak_f32_e32 v90, v85, v90, 0xc320a2ea | |
v_madak_f32_e32 v92, v85, v92, 0xc2798057 | |
v_madak_f32_e32 v93, v85, v93, 0x3d852a63 | |
v_madak_f32_e32 v90, v85, v90, 0xc18e104b | |
v_madak_f32_e32 v92, v85, v92, 0xc128f022 | |
v_madak_f32_e32 v93, v85, v93, 0x3ecbbbce | |
v_madak_f32_e32 v90, v85, v90, 0xbf4c9dd4 | |
v_madak_f32_e32 v92, v85, v92, 0xbf31a0b7 | |
v_cndmask_b32_e64 v91, v91, v93, s[8:9] | |
v_and_b32_e32 v93, s50, v83 | |
v_mov_b32_e32 v94, 0xbf100000 | |
v_madak_f32_e32 v90, v85, v90, 0xbc21a092 | |
v_madak_f32_e32 v92, v85, v92, 0xbc21a093 | |
v_mad_f32 v94, v93, -v93, v94 | |
v_cndmask_b32_e32 v90, v90, v92, vcc | |
v_cmp_gt_f32_e32 vcc, 0, v94 | |
v_cndmask_b32_e64 v95, 0.5, -0.5, vcc | |
v_mov_b32_e32 v97, 0x3fb8aa3b | |
v_mac_f32_e32 v95, v97, v94 | |
v_cvt_i32_f32_e32 v95, v95 | |
v_mov_b32_e32 v92, 0xbb0df9c0 | |
v_madak_f32_e32 v92, v92, v85, 0x3d1151b3 | |
v_madak_f32_e32 v92, v85, v92, 0xbde31cc2 | |
v_cvt_f32_i32_e32 v100, v95 | |
v_madak_f32_e32 v92, v85, v92, 0x3ea2fe54 | |
v_mov_b32_e32 v101, 0xbf317180 | |
v_madak_f32_e32 v92, v85, v92, 0xbebe9208 | |
v_mad_f32 v102, v101, v100, v94 | |
v_mov_b32_e32 v103, 0xb717f7d1 | |
v_madak_f32_e32 v92, v85, v92, 0x3ed46805 | |
v_mad_f32 v104, v103, v100, v102 | |
v_madak_f32_e32 v92, v85, v92, 0xbb1acdc6 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mov_b32_e32 v106, 0xb5ddea0e | |
v_mov_b32_e32 v107, 0x3331bb4c | |
v_cndmask_b32_e64 v90, v90, v92, s[4:5] | |
v_mov_b32_e32 v92, 0xb7c756b1 | |
v_mad_f32 v108, v107, v105, v106 | |
v_mov_b32_e32 v109, 0x388ab355 | |
v_madak_f32_e32 v92, v92, v85, 0xbbbd1489 | |
v_mad_f32 v108, v108, v105, v109 | |
v_mov_b32_e32 v110, 0xbb360b61 | |
v_madak_f32_e32 v92, v85, v92, 0xbce9528f | |
v_mad_f32 v108, v108, v105, v110 | |
v_mov_b32_e32 v111, 0x3e2aaaab | |
v_madak_f32_e32 v92, v85, v92, 0xbea66beb | |
v_mad_f32 v108, v108, v105, v111 | |
v_madak_f32_e32 v92, v85, v92, 0x3e0375d4 | |
v_mad_f32 v105, -v105, v108, v104 | |
v_mad_f32 v85, v85, v91, 1.0 | |
v_mov_b32_e32 v91, 0x6f800000 | |
v_cmp_gt_f32_e64 vcc, |v85|, v91 | |
v_mov_b32_e32 v98, 0x2f800000 | |
v_sub_f32_e32 v108, 2.0, v105 | |
v_cndmask_b32_e32 v99, 1.0, v98, vcc | |
v_cmp_gt_f32_e64 vcc, |v108|, v91 | |
v_mul_f32_e32 v85, v99, v85 | |
v_cndmask_b32_e32 v112, 1.0, v98, vcc | |
v_mul_f32_e64 v108, v108, -v112 | |
v_rcp_f32_e32 v85, v85 | |
v_rcp_f32_e32 v108, v108 | |
v_cndmask_b32_e64 v90, v90, v92, s[8:9] | |
v_lshlrev_b32_e32 v92, 23, v95 | |
v_mul_f32_e32 v85, v85, v90 | |
v_mul_f32_e32 v90, v105, v104 | |
v_mul_f32_e32 v90, v108, v90 | |
v_mul_f32_e32 v90, v90, v112 | |
v_mad_f32 v90, -v100, v103, v90 | |
v_subrev_f32_e32 v90, v102, v90 | |
v_sub_f32_e32 v90, 1.0, v90 | |
v_add_i32_e32 v90, vcc, v90, v92 | |
v_mov_b32_e32 v92, 0xc2aeac4f | |
v_cmp_nlt_f32_e32 vcc, v94, v92 | |
v_mov_b32_e32 v95, 0x42b17218 | |
v_cndmask_b32_e32 v90, 0, v90, vcc | |
v_cmp_lt_f32_e32 vcc, v94, v95 | |
v_mov_b32_e32 v100, 0x7f800000 | |
v_cndmask_b32_e32 v90, v100, v90, vcc | |
v_cmp_u_f32_e32 vcc, v94, v94 | |
v_cndmask_b32_e32 v90, v90, v94, vcc | |
v_subrev_f32_e32 v94, v84, v93 | |
v_mul_f32_e32 v102, v85, v99 | |
v_add_f32_e32 v93, v84, v93 | |
v_mad_f32 v93, v93, v94, v102 | |
v_cmp_gt_f32_e32 vcc, 0, v93 | |
v_cndmask_b32_e64 v94, 0.5, -0.5, vcc | |
v_mac_f32_e32 v94, v97, v93 | |
v_cvt_i32_f32_e32 v94, v94 | |
v_madak_f32_e32 v85, v99, v85, 0x3f58560b | |
v_mad_f32 v82, -v96, v81, v82 | |
v_mad_f32 v44, v81, v96, v44 | |
v_cvt_f32_i32_e32 v97, v94 | |
v_lshlrev_b32_e32 v94, 23, v94 | |
v_mad_f32 v79, -v96, v80, v79 | |
v_mad_f32 v43, v80, v96, v43 | |
v_mad_f32 v101, v101, v97, v93 | |
v_mad_f32 v104, v103, v97, v101 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mac_f32_e32 v106, v107, v105 | |
v_mac_f32_e32 v109, v106, v105 | |
v_mac_f32_e32 v110, v109, v105 | |
v_mac_f32_e32 v111, v110, v105 | |
v_mad_f32 v105, -v105, v111, v104 | |
v_mul_f32_e32 v104, v105, v104 | |
v_sub_f32_e32 v105, 2.0, v105 | |
v_cmp_gt_f32_e64 vcc, |v105|, v91 | |
v_cndmask_b32_e32 v106, 1.0, v98, vcc | |
v_mul_f32_e64 v105, v105, -v106 | |
v_rcp_f32_e32 v105, v105 | |
v_mad_f32 v76, -v96, v78, v76 | |
v_mad_f32 v42, v78, v96, v42 | |
v_mad_f32 v86, -v67, v0, v86 | |
v_mul_f32_e32 v104, v105, v104 | |
v_mul_f32_e32 v104, v104, v106 | |
v_mad_f32 v97, -v97, v103, v104 | |
v_subrev_f32_e32 v97, v101, v97 | |
v_sub_f32_e32 v97, 1.0, v97 | |
v_add_i32_e32 v94, vcc, v97, v94 | |
v_cmp_nlt_f32_e32 vcc, v93, v92 | |
v_cndmask_b32_e32 v92, 0, v94, vcc | |
v_cmp_lt_f32_e32 vcc, v93, v95 | |
v_cndmask_b32_e32 v92, v100, v92, vcc | |
v_cmp_gt_f32_e64 vcc, |v84|, v91 | |
v_cndmask_b32_e32 v91, 1.0, v98, vcc | |
v_mul_f32_e32 v94, v91, v84 | |
v_rcp_f32_e32 v94, v94 | |
v_cmp_u_f32_e32 vcc, v93, v93 | |
v_cndmask_b32_e32 v92, v92, v93, vcc | |
v_mul_f32_e32 v90, v92, v90 | |
v_mov_b32_e32 v92, 0x40c00000 | |
v_cmp_gt_f32_e32 vcc, v92, v84 | |
v_mov_b32_e32 v92, 0x31800000 | |
v_mul_f32_e32 v90, v94, v90 | |
v_cmp_gt_f32_e64 s[10:11], v92, v84 | |
v_mad_f32 v84, -v91, v90, 1.0 | |
v_cndmask_b32_e32 v84, 1.0, v84, vcc | |
v_cndmask_b32_e64 v84, v84, v85, s[4:5] | |
v_and_b32_e32 v85, s51, v83 | |
v_or_b32_e32 v84, v85, v84 | |
v_mad_f32 v85, v102, v83, v83 | |
v_cndmask_b32_e64 v84, v84, v85, s[8:9] | |
v_mul_f32_e32 v85, 0x3f8375d4, v83 | |
v_mac_f32_e32 v85, 0x41000000, v83 | |
v_mul_f32_e32 v85, 0x3e000000, v85 | |
v_cndmask_b32_e64 v84, v84, v85, s[10:11] | |
v_cmp_u_f32_e32 vcc, v83, v83 | |
v_cndmask_b32_e32 v83, v84, v83, vcc | |
v_subrev_f32_e32 v83, v83, v88 | |
v_mul_f32_e64 v84, s19, -v88 | |
v_mac_f32_e32 v84, v83, v89 | |
v_mac_f32_e32 v5, v84, v87 | |
v_mac_f32_e32 v45, v0, v67 | |
BB6_62: ; %Flow1241 | |
; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[34:35] | |
BB6_63: ; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[12:13] | |
v_lshrrev_b32_e32 v67, 10, v69 | |
v_and_b32_e32 v67, 1, v67 | |
v_cmp_eq_u32_e32 vcc, 1, v67 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[12:13], exec, s[4:5] | |
; mask branch BB6_67 | |
s_cbranch_execz BB6_67 | |
BB6_64: ; in Loop: Header=BB6_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[87:90], v55 offset0:32 offset1:33 | |
s_and_b64 s[4:5], exec, s[0:1] | |
v_cmp_ne_u32_e32 vcc, v63, v77 | |
s_xor_b64 s[4:5], s[4:5], -1 | |
s_or_b64 s[4:5], s[4:5], vcc | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v80, v73, v88 | |
s_and_b64 s[8:9], exec, s[2:3] | |
s_or_b64 s[4:5], s[8:9], s[4:5] | |
v_subrev_f32_e32 v78, v72, v87 | |
v_mul_f32_e32 v83, v80, v80 | |
v_cndmask_b32_e64 v84, 0, 1.0, s[4:5] | |
v_subrev_f32_e32 v81, v74, v89 | |
v_mac_f32_e32 v83, v78, v78 | |
v_mac_f32_e32 v83, v81, v81 | |
v_mul_f32_e32 v84, s22, v84 | |
v_subrev_f32_e32 v67, v75, v90 | |
v_cmp_lt_f32_e32 vcc, v83, v84 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[34:35], exec, s[4:5] | |
; mask branch BB6_66 | |
s_cbranch_execz BB6_66 | |
BB6_65: ; in Loop: Header=BB6_11 Depth=1 | |
v_max_f32_e32 v85, 0x34cd15ae, v83 | |
v_rsq_f32_e32 v89, v85 | |
v_lshrrev_b32_e32 v83, 10, v65 | |
v_and_b32_e32 v83, 1, v83 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e32 vcc, 1, v83 | |
ds_read_b64 v[83:84], v56 offset:128 | |
v_mul_f32_e32 v93, v89, v89 | |
v_cndmask_b32_e64 v88, 0, 1.0, vcc | |
v_mul_f32_e32 v94, v93, v93 | |
v_mul_f32_e32 v94, v88, v94 | |
v_mul_f32_e32 v95, v93, v94 | |
v_mad_f32 v94, v94, v93, s23 | |
v_mul_f32_e32 v87, v75, v90 | |
v_mul_f32_e32 v90, v9, v85 | |
v_mul_f32_e32 v91, v90, v90 | |
v_mov_b32_e32 v92, 0x3a92b707 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v83, v70, v83 | |
v_mul_f32_e32 v84, v71, v84 | |
v_mul_f32_e32 v94, 0xbe2aaaab, v94 | |
v_madak_f32_e32 v92, v92, v91, 0x3ded3cb2 | |
v_mad_f32 v96, v84, v95, -v83 | |
v_mul_f32_e32 v97, v93, v95 | |
v_mad_f32 v95, v95, v95, s26 | |
v_mul_f32_e32 v83, v83, v94 | |
v_mov_b32_e32 v94, 0x3c739487 | |
v_mul_f32_e32 v84, v95, v84 | |
v_madak_f32_e32 v94, v94, v91, 0x3f01e2bc | |
v_mad_f32 v92, v92, v91, 1.0 | |
v_mac_f32_e32 v83, 0x3daaaaaa, v84 | |
v_mac_f32_e32 v92, v90, v94 | |
v_mov_b32_e32 v94, 0xb2951928 | |
v_mac_f32_e32 v8, v88, v83 | |
v_rcp_f32_e32 v83, v92 | |
v_madak_f32_e32 v94, v94, v91, 0xb85ffb93 | |
v_mov_b32_e32 v95, 0x35c55945 | |
v_madak_f32_e32 v95, v95, v91, 0x3a83ca0c | |
v_madak_f32_e32 v94, v94, v91, 0xbc9ded90 | |
v_madak_f32_e32 v95, v95, v91, 0x3d8eaf3b | |
v_madak_f32_e32 v91, v94, v91, 0xbf409397 | |
v_mac_f32_e32 v91, v90, v95 | |
v_mul_f32_e32 v83, v37, v83 | |
v_mul_f32_e32 v84, v88, v93 | |
v_mul_f32_e32 v83, v91, v83 | |
v_mul_f32_e32 v96, v96, v97 | |
v_mac_f32_e32 v83, v89, v84 | |
v_mac_f32_e32 v96, v83, v87 | |
v_mul_f32_e32 v83, s18, v85 | |
v_mul_f32_e32 v83, v89, v83 | |
v_and_b32_e32 v84, s27, v83 | |
v_mov_b32_e32 v85, 0x3fa00000 | |
v_cmp_gt_f32_e64 s[4:5], v85, v84 | |
v_mul_f32_e32 v85, v84, v84 | |
v_rcp_f32_e32 v90, v85 | |
v_add_f32_e32 v91, -1.0, v84 | |
v_mov_b32_e32 v93, 0xbd777f97 | |
v_mov_b32_e32 v94, 0x4036db6e | |
v_cndmask_b32_e64 v90, v90, v91, s[4:5] | |
v_mov_b32_e32 v91, 0x3f580000 | |
v_cmp_gt_f32_e64 s[8:9], v91, v84 | |
v_cndmask_b32_e64 v85, v90, v85, s[8:9] | |
v_mov_b32_e32 v91, 0xc1b38712 | |
v_madak_f32_e32 v93, v93, v85, 0x40d23f7c | |
v_madak_f32_e32 v91, v91, v85, 0x43ed43a7 | |
v_madak_f32_e32 v93, v85, v93, 0x42d9451f | |
v_madak_f32_e32 v91, v85, v91, 0x451f90ce | |
v_madak_f32_e32 v93, v85, v93, 0x43d6810b | |
v_madak_f32_e32 v91, v85, v91, 0x4547fdbb | |
v_madak_f32_e32 v93, v85, v93, 0x442158c9 | |
v_madak_f32_e32 v91, v85, v91, 0x44c01759 | |
v_madak_f32_e32 v93, v85, v93, 0x43d9486f | |
v_madak_f32_e32 v91, v85, v91, 0x43a2e571 | |
v_madak_f32_e32 v93, v85, v93, 0x4309a863 | |
v_cmp_gt_f32_e32 vcc, v94, v84 | |
v_madak_f32_e32 v91, v85, v91, 0x41f2b459 | |
v_madak_f32_e32 v93, v85, v93, 0x419d35ce | |
v_cndmask_b32_e32 v91, v91, v93, vcc | |
v_mov_b32_e32 v93, 0x3c445aa3 | |
v_madak_f32_e32 v93, v93, v85, 0x3c5f6e13 | |
v_madak_f32_e32 v93, v85, v93, 0x3e013307 | |
v_madak_f32_e32 v93, v85, v93, 0x3d931ae7 | |
v_madak_f32_e32 v93, v85, v93, 0x3f0a5785 | |
v_mov_b32_e32 v92, 0xc11d077e | |
v_madak_f32_e32 v93, v85, v93, 0x3dd9f331 | |
v_mov_b32_e32 v90, 0xc3f1c275 | |
v_madak_f32_e32 v92, v92, v85, 0xc2a2932b | |
v_cndmask_b32_e64 v91, v91, v93, s[4:5] | |
v_mov_b32_e32 v93, 0xb684e21a | |
v_madak_f32_e32 v90, v90, v85, 0xc480230b | |
v_madak_f32_e32 v92, v85, v92, 0xc3389ae7 | |
v_madak_f32_e32 v93, v93, v85, 0x390aee49 | |
v_madak_f32_e32 v90, v85, v90, 0xc41f6441 | |
v_madak_f32_e32 v92, v85, v92, 0xc322658c | |
v_madak_f32_e32 v93, v85, v93, 0x3ba68116 | |
v_madak_f32_e32 v90, v85, v90, 0xc320a2ea | |
v_madak_f32_e32 v92, v85, v92, 0xc2798057 | |
v_madak_f32_e32 v93, v85, v93, 0x3d852a63 | |
v_madak_f32_e32 v90, v85, v90, 0xc18e104b | |
v_madak_f32_e32 v92, v85, v92, 0xc128f022 | |
v_madak_f32_e32 v93, v85, v93, 0x3ecbbbce | |
v_madak_f32_e32 v90, v85, v90, 0xbf4c9dd4 | |
v_madak_f32_e32 v92, v85, v92, 0xbf31a0b7 | |
v_cndmask_b32_e64 v91, v91, v93, s[8:9] | |
v_and_b32_e32 v93, s50, v83 | |
v_mov_b32_e32 v94, 0xbf100000 | |
v_madak_f32_e32 v90, v85, v90, 0xbc21a092 | |
v_madak_f32_e32 v92, v85, v92, 0xbc21a093 | |
v_mad_f32 v94, v93, -v93, v94 | |
v_cndmask_b32_e32 v90, v90, v92, vcc | |
v_cmp_gt_f32_e32 vcc, 0, v94 | |
v_cndmask_b32_e64 v95, 0.5, -0.5, vcc | |
v_mov_b32_e32 v97, 0x3fb8aa3b | |
v_mac_f32_e32 v95, v97, v94 | |
v_cvt_i32_f32_e32 v95, v95 | |
v_mov_b32_e32 v92, 0xbb0df9c0 | |
v_madak_f32_e32 v92, v92, v85, 0x3d1151b3 | |
v_madak_f32_e32 v92, v85, v92, 0xbde31cc2 | |
v_cvt_f32_i32_e32 v100, v95 | |
v_madak_f32_e32 v92, v85, v92, 0x3ea2fe54 | |
v_mov_b32_e32 v101, 0xbf317180 | |
v_madak_f32_e32 v92, v85, v92, 0xbebe9208 | |
v_mad_f32 v102, v101, v100, v94 | |
v_mov_b32_e32 v103, 0xb717f7d1 | |
v_madak_f32_e32 v92, v85, v92, 0x3ed46805 | |
v_mad_f32 v104, v103, v100, v102 | |
v_madak_f32_e32 v92, v85, v92, 0xbb1acdc6 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mov_b32_e32 v106, 0xb5ddea0e | |
v_mov_b32_e32 v107, 0x3331bb4c | |
v_cndmask_b32_e64 v90, v90, v92, s[4:5] | |
v_mov_b32_e32 v92, 0xb7c756b1 | |
v_mad_f32 v108, v107, v105, v106 | |
v_mov_b32_e32 v109, 0x388ab355 | |
v_madak_f32_e32 v92, v92, v85, 0xbbbd1489 | |
v_mad_f32 v108, v108, v105, v109 | |
v_mov_b32_e32 v110, 0xbb360b61 | |
v_madak_f32_e32 v92, v85, v92, 0xbce9528f | |
v_mad_f32 v108, v108, v105, v110 | |
v_mov_b32_e32 v111, 0x3e2aaaab | |
v_madak_f32_e32 v92, v85, v92, 0xbea66beb | |
v_mad_f32 v108, v108, v105, v111 | |
v_madak_f32_e32 v92, v85, v92, 0x3e0375d4 | |
v_mad_f32 v105, -v105, v108, v104 | |
v_mad_f32 v85, v85, v91, 1.0 | |
v_mov_b32_e32 v91, 0x6f800000 | |
v_cmp_gt_f32_e64 vcc, |v85|, v91 | |
v_mov_b32_e32 v98, 0x2f800000 | |
v_sub_f32_e32 v108, 2.0, v105 | |
v_cndmask_b32_e32 v99, 1.0, v98, vcc | |
v_cmp_gt_f32_e64 vcc, |v108|, v91 | |
v_mul_f32_e32 v85, v99, v85 | |
v_cndmask_b32_e32 v112, 1.0, v98, vcc | |
v_mul_f32_e64 v108, v108, -v112 | |
v_rcp_f32_e32 v85, v85 | |
v_rcp_f32_e32 v108, v108 | |
v_cndmask_b32_e64 v90, v90, v92, s[8:9] | |
v_lshlrev_b32_e32 v92, 23, v95 | |
v_mul_f32_e32 v85, v85, v90 | |
v_mul_f32_e32 v90, v105, v104 | |
v_mul_f32_e32 v90, v108, v90 | |
v_mul_f32_e32 v90, v90, v112 | |
v_mad_f32 v90, -v100, v103, v90 | |
v_subrev_f32_e32 v90, v102, v90 | |
v_sub_f32_e32 v90, 1.0, v90 | |
v_add_i32_e32 v90, vcc, v90, v92 | |
v_mov_b32_e32 v92, 0xc2aeac4f | |
v_cmp_nlt_f32_e32 vcc, v94, v92 | |
v_mov_b32_e32 v95, 0x42b17218 | |
v_cndmask_b32_e32 v90, 0, v90, vcc | |
v_cmp_lt_f32_e32 vcc, v94, v95 | |
v_mov_b32_e32 v100, 0x7f800000 | |
v_cndmask_b32_e32 v90, v100, v90, vcc | |
v_cmp_u_f32_e32 vcc, v94, v94 | |
v_cndmask_b32_e32 v90, v90, v94, vcc | |
v_subrev_f32_e32 v94, v84, v93 | |
v_mul_f32_e32 v102, v85, v99 | |
v_add_f32_e32 v93, v84, v93 | |
v_mad_f32 v93, v93, v94, v102 | |
v_cmp_gt_f32_e32 vcc, 0, v93 | |
v_cndmask_b32_e64 v94, 0.5, -0.5, vcc | |
v_mac_f32_e32 v94, v97, v93 | |
v_cvt_i32_f32_e32 v94, v94 | |
v_madak_f32_e32 v85, v99, v85, 0x3f58560b | |
v_mad_f32 v82, -v96, v81, v82 | |
v_mad_f32 v34, v81, v96, v34 | |
v_cvt_f32_i32_e32 v97, v94 | |
v_lshlrev_b32_e32 v94, 23, v94 | |
v_mad_f32 v79, -v96, v80, v79 | |
v_mad_f32 v33, v80, v96, v33 | |
v_mad_f32 v101, v101, v97, v93 | |
v_mad_f32 v104, v103, v97, v101 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mac_f32_e32 v106, v107, v105 | |
v_mac_f32_e32 v109, v106, v105 | |
v_mac_f32_e32 v110, v109, v105 | |
v_mac_f32_e32 v111, v110, v105 | |
v_mad_f32 v105, -v105, v111, v104 | |
v_mul_f32_e32 v104, v105, v104 | |
v_sub_f32_e32 v105, 2.0, v105 | |
v_cmp_gt_f32_e64 vcc, |v105|, v91 | |
v_cndmask_b32_e32 v106, 1.0, v98, vcc | |
v_mul_f32_e64 v105, v105, -v106 | |
v_rcp_f32_e32 v105, v105 | |
v_mad_f32 v76, -v96, v78, v76 | |
v_mad_f32 v32, v78, v96, v32 | |
v_mad_f32 v86, -v67, v0, v86 | |
v_mul_f32_e32 v104, v105, v104 | |
v_mul_f32_e32 v104, v104, v106 | |
v_mad_f32 v97, -v97, v103, v104 | |
v_subrev_f32_e32 v97, v101, v97 | |
v_sub_f32_e32 v97, 1.0, v97 | |
v_add_i32_e32 v94, vcc, v97, v94 | |
v_cmp_nlt_f32_e32 vcc, v93, v92 | |
v_cndmask_b32_e32 v92, 0, v94, vcc | |
v_cmp_lt_f32_e32 vcc, v93, v95 | |
v_cndmask_b32_e32 v92, v100, v92, vcc | |
v_cmp_gt_f32_e64 vcc, |v84|, v91 | |
v_cndmask_b32_e32 v91, 1.0, v98, vcc | |
v_mul_f32_e32 v94, v91, v84 | |
v_rcp_f32_e32 v94, v94 | |
v_cmp_u_f32_e32 vcc, v93, v93 | |
v_cndmask_b32_e32 v92, v92, v93, vcc | |
v_mul_f32_e32 v90, v92, v90 | |
v_mov_b32_e32 v92, 0x40c00000 | |
v_cmp_gt_f32_e32 vcc, v92, v84 | |
v_mov_b32_e32 v92, 0x31800000 | |
v_mul_f32_e32 v90, v94, v90 | |
v_cmp_gt_f32_e64 s[10:11], v92, v84 | |
v_mad_f32 v84, -v91, v90, 1.0 | |
v_cndmask_b32_e32 v84, 1.0, v84, vcc | |
v_cndmask_b32_e64 v84, v84, v85, s[4:5] | |
v_and_b32_e32 v85, s51, v83 | |
v_or_b32_e32 v84, v85, v84 | |
v_mad_f32 v85, v102, v83, v83 | |
v_cndmask_b32_e64 v84, v84, v85, s[8:9] | |
v_mul_f32_e32 v85, 0x3f8375d4, v83 | |
v_mac_f32_e32 v85, 0x41000000, v83 | |
v_mul_f32_e32 v85, 0x3e000000, v85 | |
v_cndmask_b32_e64 v84, v84, v85, s[10:11] | |
v_cmp_u_f32_e32 vcc, v83, v83 | |
v_cndmask_b32_e32 v83, v84, v83, vcc | |
v_subrev_f32_e32 v83, v83, v88 | |
v_mul_f32_e64 v84, s19, -v88 | |
v_mac_f32_e32 v84, v83, v89 | |
v_mac_f32_e32 v5, v84, v87 | |
v_mac_f32_e32 v35, v0, v67 | |
BB6_66: ; %Flow1240 | |
; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[34:35] | |
BB6_67: ; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[12:13] | |
v_lshrrev_b32_e32 v67, 11, v69 | |
v_and_b32_e32 v67, 1, v67 | |
v_cmp_eq_u32_e32 vcc, 1, v67 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[12:13], exec, s[4:5] | |
; mask branch BB6_71 | |
s_cbranch_execz BB6_71 | |
BB6_68: ; in Loop: Header=BB6_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[87:90], v55 offset0:48 offset1:49 | |
s_and_b64 s[4:5], exec, s[0:1] | |
v_cmp_ne_u32_e32 vcc, v62, v77 | |
s_xor_b64 s[4:5], s[4:5], -1 | |
s_or_b64 s[4:5], s[4:5], vcc | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v80, v73, v88 | |
s_and_b64 s[8:9], exec, s[2:3] | |
s_or_b64 s[4:5], s[8:9], s[4:5] | |
v_subrev_f32_e32 v78, v72, v87 | |
v_mul_f32_e32 v83, v80, v80 | |
v_cndmask_b32_e64 v84, 0, 1.0, s[4:5] | |
v_subrev_f32_e32 v81, v74, v89 | |
v_mac_f32_e32 v83, v78, v78 | |
v_mac_f32_e32 v83, v81, v81 | |
v_mul_f32_e32 v84, s22, v84 | |
v_subrev_f32_e32 v67, v75, v90 | |
v_cmp_lt_f32_e32 vcc, v83, v84 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[34:35], exec, s[4:5] | |
; mask branch BB6_70 | |
s_cbranch_execz BB6_70 | |
BB6_69: ; in Loop: Header=BB6_11 Depth=1 | |
v_max_f32_e32 v85, 0x34cd15ae, v83 | |
v_rsq_f32_e32 v89, v85 | |
v_lshrrev_b32_e32 v83, 11, v65 | |
v_and_b32_e32 v83, 1, v83 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e32 vcc, 1, v83 | |
ds_read_b64 v[83:84], v56 offset:192 | |
v_mul_f32_e32 v93, v89, v89 | |
v_cndmask_b32_e64 v88, 0, 1.0, vcc | |
v_mul_f32_e32 v94, v93, v93 | |
v_mul_f32_e32 v94, v88, v94 | |
v_mul_f32_e32 v95, v93, v94 | |
v_mad_f32 v94, v94, v93, s23 | |
v_mul_f32_e32 v87, v75, v90 | |
v_mul_f32_e32 v90, v9, v85 | |
v_mul_f32_e32 v91, v90, v90 | |
v_mov_b32_e32 v92, 0x3a92b707 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v83, v70, v83 | |
v_mul_f32_e32 v84, v71, v84 | |
v_mul_f32_e32 v94, 0xbe2aaaab, v94 | |
v_madak_f32_e32 v92, v92, v91, 0x3ded3cb2 | |
v_mad_f32 v96, v84, v95, -v83 | |
v_mul_f32_e32 v97, v93, v95 | |
v_mad_f32 v95, v95, v95, s26 | |
v_mul_f32_e32 v83, v83, v94 | |
v_mov_b32_e32 v94, 0x3c739487 | |
v_mul_f32_e32 v84, v95, v84 | |
v_madak_f32_e32 v94, v94, v91, 0x3f01e2bc | |
v_mad_f32 v92, v92, v91, 1.0 | |
v_mac_f32_e32 v83, 0x3daaaaaa, v84 | |
v_mac_f32_e32 v92, v90, v94 | |
v_mov_b32_e32 v94, 0xb2951928 | |
v_mac_f32_e32 v8, v88, v83 | |
v_rcp_f32_e32 v83, v92 | |
v_madak_f32_e32 v94, v94, v91, 0xb85ffb93 | |
v_mov_b32_e32 v95, 0x35c55945 | |
v_madak_f32_e32 v95, v95, v91, 0x3a83ca0c | |
v_madak_f32_e32 v94, v94, v91, 0xbc9ded90 | |
v_madak_f32_e32 v95, v95, v91, 0x3d8eaf3b | |
v_madak_f32_e32 v91, v94, v91, 0xbf409397 | |
v_mac_f32_e32 v91, v90, v95 | |
v_mul_f32_e32 v83, v37, v83 | |
v_mul_f32_e32 v84, v88, v93 | |
v_mul_f32_e32 v83, v91, v83 | |
v_mul_f32_e32 v96, v96, v97 | |
v_mac_f32_e32 v83, v89, v84 | |
v_mac_f32_e32 v96, v83, v87 | |
v_mul_f32_e32 v83, s18, v85 | |
v_mul_f32_e32 v83, v89, v83 | |
v_and_b32_e32 v84, s27, v83 | |
v_mov_b32_e32 v85, 0x3fa00000 | |
v_cmp_gt_f32_e64 s[4:5], v85, v84 | |
v_mul_f32_e32 v85, v84, v84 | |
v_rcp_f32_e32 v90, v85 | |
v_add_f32_e32 v91, -1.0, v84 | |
v_mov_b32_e32 v93, 0xbd777f97 | |
v_mov_b32_e32 v94, 0x4036db6e | |
v_cndmask_b32_e64 v90, v90, v91, s[4:5] | |
v_mov_b32_e32 v91, 0x3f580000 | |
v_cmp_gt_f32_e64 s[8:9], v91, v84 | |
v_cndmask_b32_e64 v85, v90, v85, s[8:9] | |
v_mov_b32_e32 v91, 0xc1b38712 | |
v_madak_f32_e32 v93, v93, v85, 0x40d23f7c | |
v_madak_f32_e32 v91, v91, v85, 0x43ed43a7 | |
v_madak_f32_e32 v93, v85, v93, 0x42d9451f | |
v_madak_f32_e32 v91, v85, v91, 0x451f90ce | |
v_madak_f32_e32 v93, v85, v93, 0x43d6810b | |
v_madak_f32_e32 v91, v85, v91, 0x4547fdbb | |
v_madak_f32_e32 v93, v85, v93, 0x442158c9 | |
v_madak_f32_e32 v91, v85, v91, 0x44c01759 | |
v_madak_f32_e32 v93, v85, v93, 0x43d9486f | |
v_madak_f32_e32 v91, v85, v91, 0x43a2e571 | |
v_madak_f32_e32 v93, v85, v93, 0x4309a863 | |
v_cmp_gt_f32_e32 vcc, v94, v84 | |
v_madak_f32_e32 v91, v85, v91, 0x41f2b459 | |
v_madak_f32_e32 v93, v85, v93, 0x419d35ce | |
v_cndmask_b32_e32 v91, v91, v93, vcc | |
v_mov_b32_e32 v93, 0x3c445aa3 | |
v_madak_f32_e32 v93, v93, v85, 0x3c5f6e13 | |
v_madak_f32_e32 v93, v85, v93, 0x3e013307 | |
v_madak_f32_e32 v93, v85, v93, 0x3d931ae7 | |
v_madak_f32_e32 v93, v85, v93, 0x3f0a5785 | |
v_mov_b32_e32 v92, 0xc11d077e | |
v_madak_f32_e32 v93, v85, v93, 0x3dd9f331 | |
v_mov_b32_e32 v90, 0xc3f1c275 | |
v_madak_f32_e32 v92, v92, v85, 0xc2a2932b | |
v_cndmask_b32_e64 v91, v91, v93, s[4:5] | |
v_mov_b32_e32 v93, 0xb684e21a | |
v_madak_f32_e32 v90, v90, v85, 0xc480230b | |
v_madak_f32_e32 v92, v85, v92, 0xc3389ae7 | |
v_madak_f32_e32 v93, v93, v85, 0x390aee49 | |
v_madak_f32_e32 v90, v85, v90, 0xc41f6441 | |
v_madak_f32_e32 v92, v85, v92, 0xc322658c | |
v_madak_f32_e32 v93, v85, v93, 0x3ba68116 | |
v_madak_f32_e32 v90, v85, v90, 0xc320a2ea | |
v_madak_f32_e32 v92, v85, v92, 0xc2798057 | |
v_madak_f32_e32 v93, v85, v93, 0x3d852a63 | |
v_madak_f32_e32 v90, v85, v90, 0xc18e104b | |
v_madak_f32_e32 v92, v85, v92, 0xc128f022 | |
v_madak_f32_e32 v93, v85, v93, 0x3ecbbbce | |
v_madak_f32_e32 v90, v85, v90, 0xbf4c9dd4 | |
v_madak_f32_e32 v92, v85, v92, 0xbf31a0b7 | |
v_cndmask_b32_e64 v91, v91, v93, s[8:9] | |
v_and_b32_e32 v93, s50, v83 | |
v_mov_b32_e32 v94, 0xbf100000 | |
v_madak_f32_e32 v90, v85, v90, 0xbc21a092 | |
v_madak_f32_e32 v92, v85, v92, 0xbc21a093 | |
v_mad_f32 v94, v93, -v93, v94 | |
v_cndmask_b32_e32 v90, v90, v92, vcc | |
v_cmp_gt_f32_e32 vcc, 0, v94 | |
v_cndmask_b32_e64 v95, 0.5, -0.5, vcc | |
v_mov_b32_e32 v97, 0x3fb8aa3b | |
v_mac_f32_e32 v95, v97, v94 | |
v_cvt_i32_f32_e32 v95, v95 | |
v_mov_b32_e32 v92, 0xbb0df9c0 | |
v_madak_f32_e32 v92, v92, v85, 0x3d1151b3 | |
v_madak_f32_e32 v92, v85, v92, 0xbde31cc2 | |
v_cvt_f32_i32_e32 v100, v95 | |
v_madak_f32_e32 v92, v85, v92, 0x3ea2fe54 | |
v_mov_b32_e32 v101, 0xbf317180 | |
v_madak_f32_e32 v92, v85, v92, 0xbebe9208 | |
v_mad_f32 v102, v101, v100, v94 | |
v_mov_b32_e32 v103, 0xb717f7d1 | |
v_madak_f32_e32 v92, v85, v92, 0x3ed46805 | |
v_mad_f32 v104, v103, v100, v102 | |
v_madak_f32_e32 v92, v85, v92, 0xbb1acdc6 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mov_b32_e32 v106, 0xb5ddea0e | |
v_mov_b32_e32 v107, 0x3331bb4c | |
v_cndmask_b32_e64 v90, v90, v92, s[4:5] | |
v_mov_b32_e32 v92, 0xb7c756b1 | |
v_mad_f32 v108, v107, v105, v106 | |
v_mov_b32_e32 v109, 0x388ab355 | |
v_madak_f32_e32 v92, v92, v85, 0xbbbd1489 | |
v_mad_f32 v108, v108, v105, v109 | |
v_mov_b32_e32 v110, 0xbb360b61 | |
v_madak_f32_e32 v92, v85, v92, 0xbce9528f | |
v_mad_f32 v108, v108, v105, v110 | |
v_mov_b32_e32 v111, 0x3e2aaaab | |
v_madak_f32_e32 v92, v85, v92, 0xbea66beb | |
v_mad_f32 v108, v108, v105, v111 | |
v_madak_f32_e32 v92, v85, v92, 0x3e0375d4 | |
v_mad_f32 v105, -v105, v108, v104 | |
v_mad_f32 v85, v85, v91, 1.0 | |
v_mov_b32_e32 v91, 0x6f800000 | |
v_cmp_gt_f32_e64 vcc, |v85|, v91 | |
v_mov_b32_e32 v98, 0x2f800000 | |
v_sub_f32_e32 v108, 2.0, v105 | |
v_cndmask_b32_e32 v99, 1.0, v98, vcc | |
v_cmp_gt_f32_e64 vcc, |v108|, v91 | |
v_mul_f32_e32 v85, v99, v85 | |
v_cndmask_b32_e32 v112, 1.0, v98, vcc | |
v_mul_f32_e64 v108, v108, -v112 | |
v_rcp_f32_e32 v85, v85 | |
v_rcp_f32_e32 v108, v108 | |
v_cndmask_b32_e64 v90, v90, v92, s[8:9] | |
v_lshlrev_b32_e32 v92, 23, v95 | |
v_mul_f32_e32 v85, v85, v90 | |
v_mul_f32_e32 v90, v105, v104 | |
v_mul_f32_e32 v90, v108, v90 | |
v_mul_f32_e32 v90, v90, v112 | |
v_mad_f32 v90, -v100, v103, v90 | |
v_subrev_f32_e32 v90, v102, v90 | |
v_sub_f32_e32 v90, 1.0, v90 | |
v_add_i32_e32 v90, vcc, v90, v92 | |
v_mov_b32_e32 v92, 0xc2aeac4f | |
v_cmp_nlt_f32_e32 vcc, v94, v92 | |
v_mov_b32_e32 v95, 0x42b17218 | |
v_cndmask_b32_e32 v90, 0, v90, vcc | |
v_cmp_lt_f32_e32 vcc, v94, v95 | |
v_mov_b32_e32 v100, 0x7f800000 | |
v_cndmask_b32_e32 v90, v100, v90, vcc | |
v_cmp_u_f32_e32 vcc, v94, v94 | |
v_cndmask_b32_e32 v90, v90, v94, vcc | |
v_subrev_f32_e32 v94, v84, v93 | |
v_mul_f32_e32 v102, v85, v99 | |
v_add_f32_e32 v93, v84, v93 | |
v_mad_f32 v93, v93, v94, v102 | |
v_cmp_gt_f32_e32 vcc, 0, v93 | |
v_cndmask_b32_e64 v94, 0.5, -0.5, vcc | |
v_mac_f32_e32 v94, v97, v93 | |
v_cvt_i32_f32_e32 v94, v94 | |
v_madak_f32_e32 v85, v99, v85, 0x3f58560b | |
v_mad_f32 v82, -v96, v81, v82 | |
v_mad_f32 v30, v81, v96, v30 | |
v_cvt_f32_i32_e32 v97, v94 | |
v_lshlrev_b32_e32 v94, 23, v94 | |
v_mad_f32 v79, -v96, v80, v79 | |
v_mad_f32 v29, v80, v96, v29 | |
v_mad_f32 v101, v101, v97, v93 | |
v_mad_f32 v104, v103, v97, v101 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mac_f32_e32 v106, v107, v105 | |
v_mac_f32_e32 v109, v106, v105 | |
v_mac_f32_e32 v110, v109, v105 | |
v_mac_f32_e32 v111, v110, v105 | |
v_mad_f32 v105, -v105, v111, v104 | |
v_mul_f32_e32 v104, v105, v104 | |
v_sub_f32_e32 v105, 2.0, v105 | |
v_cmp_gt_f32_e64 vcc, |v105|, v91 | |
v_cndmask_b32_e32 v106, 1.0, v98, vcc | |
v_mul_f32_e64 v105, v105, -v106 | |
v_rcp_f32_e32 v105, v105 | |
v_mad_f32 v76, -v96, v78, v76 | |
v_mad_f32 v28, v78, v96, v28 | |
v_mad_f32 v86, -v67, v0, v86 | |
v_mul_f32_e32 v104, v105, v104 | |
v_mul_f32_e32 v104, v104, v106 | |
v_mad_f32 v97, -v97, v103, v104 | |
v_subrev_f32_e32 v97, v101, v97 | |
v_sub_f32_e32 v97, 1.0, v97 | |
v_add_i32_e32 v94, vcc, v97, v94 | |
v_cmp_nlt_f32_e32 vcc, v93, v92 | |
v_cndmask_b32_e32 v92, 0, v94, vcc | |
v_cmp_lt_f32_e32 vcc, v93, v95 | |
v_cndmask_b32_e32 v92, v100, v92, vcc | |
v_cmp_gt_f32_e64 vcc, |v84|, v91 | |
v_cndmask_b32_e32 v91, 1.0, v98, vcc | |
v_mul_f32_e32 v94, v91, v84 | |
v_rcp_f32_e32 v94, v94 | |
v_cmp_u_f32_e32 vcc, v93, v93 | |
v_cndmask_b32_e32 v92, v92, v93, vcc | |
v_mul_f32_e32 v90, v92, v90 | |
v_mov_b32_e32 v92, 0x40c00000 | |
v_cmp_gt_f32_e32 vcc, v92, v84 | |
v_mov_b32_e32 v92, 0x31800000 | |
v_mul_f32_e32 v90, v94, v90 | |
v_cmp_gt_f32_e64 s[10:11], v92, v84 | |
v_mad_f32 v84, -v91, v90, 1.0 | |
v_cndmask_b32_e32 v84, 1.0, v84, vcc | |
v_cndmask_b32_e64 v84, v84, v85, s[4:5] | |
v_and_b32_e32 v85, s51, v83 | |
v_or_b32_e32 v84, v85, v84 | |
v_mad_f32 v85, v102, v83, v83 | |
v_cndmask_b32_e64 v84, v84, v85, s[8:9] | |
v_mul_f32_e32 v85, 0x3f8375d4, v83 | |
v_mac_f32_e32 v85, 0x41000000, v83 | |
v_mul_f32_e32 v85, 0x3e000000, v85 | |
v_cndmask_b32_e64 v84, v84, v85, s[10:11] | |
v_cmp_u_f32_e32 vcc, v83, v83 | |
v_cndmask_b32_e32 v83, v84, v83, vcc | |
v_subrev_f32_e32 v83, v83, v88 | |
v_mul_f32_e64 v84, s19, -v88 | |
v_mac_f32_e32 v84, v83, v89 | |
v_mac_f32_e32 v5, v84, v87 | |
v_mac_f32_e32 v31, v0, v67 | |
BB6_70: ; %Flow1239 | |
; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[34:35] | |
BB6_71: ; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[12:13] | |
v_lshrrev_b32_e32 v67, 12, v69 | |
v_and_b32_e32 v67, 1, v67 | |
v_cmp_eq_u32_e32 vcc, 1, v67 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[12:13], exec, s[4:5] | |
; mask branch BB6_75 | |
s_cbranch_execz BB6_75 | |
BB6_72: ; in Loop: Header=BB6_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[87:90], v55 offset0:64 offset1:65 | |
s_and_b64 s[4:5], exec, s[0:1] | |
v_cmp_ne_u32_e32 vcc, v61, v77 | |
s_xor_b64 s[4:5], s[4:5], -1 | |
s_or_b64 s[4:5], s[4:5], vcc | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v80, v73, v88 | |
s_and_b64 s[8:9], exec, s[2:3] | |
s_or_b64 s[4:5], s[8:9], s[4:5] | |
v_subrev_f32_e32 v78, v72, v87 | |
v_mul_f32_e32 v83, v80, v80 | |
v_cndmask_b32_e64 v84, 0, 1.0, s[4:5] | |
v_subrev_f32_e32 v81, v74, v89 | |
v_mac_f32_e32 v83, v78, v78 | |
v_mac_f32_e32 v83, v81, v81 | |
v_mul_f32_e32 v84, s22, v84 | |
v_subrev_f32_e32 v67, v75, v90 | |
v_cmp_lt_f32_e32 vcc, v83, v84 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[34:35], exec, s[4:5] | |
; mask branch BB6_74 | |
s_cbranch_execz BB6_74 | |
BB6_73: ; in Loop: Header=BB6_11 Depth=1 | |
v_max_f32_e32 v85, 0x34cd15ae, v83 | |
v_rsq_f32_e32 v89, v85 | |
v_lshrrev_b32_e32 v83, 12, v65 | |
v_and_b32_e32 v83, 1, v83 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e32 vcc, 1, v83 | |
ds_read_b64 v[83:84], v56 offset:256 | |
v_mul_f32_e32 v93, v89, v89 | |
v_cndmask_b32_e64 v88, 0, 1.0, vcc | |
v_mul_f32_e32 v94, v93, v93 | |
v_mul_f32_e32 v94, v88, v94 | |
v_mul_f32_e32 v95, v93, v94 | |
v_mad_f32 v94, v94, v93, s23 | |
v_mul_f32_e32 v87, v75, v90 | |
v_mul_f32_e32 v90, v9, v85 | |
v_mul_f32_e32 v91, v90, v90 | |
v_mov_b32_e32 v92, 0x3a92b707 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v83, v70, v83 | |
v_mul_f32_e32 v84, v71, v84 | |
v_mul_f32_e32 v94, 0xbe2aaaab, v94 | |
v_madak_f32_e32 v92, v92, v91, 0x3ded3cb2 | |
v_mad_f32 v96, v84, v95, -v83 | |
v_mul_f32_e32 v97, v93, v95 | |
v_mad_f32 v95, v95, v95, s26 | |
v_mul_f32_e32 v83, v83, v94 | |
v_mov_b32_e32 v94, 0x3c739487 | |
v_mul_f32_e32 v84, v95, v84 | |
v_madak_f32_e32 v94, v94, v91, 0x3f01e2bc | |
v_mad_f32 v92, v92, v91, 1.0 | |
v_mac_f32_e32 v83, 0x3daaaaaa, v84 | |
v_mac_f32_e32 v92, v90, v94 | |
v_mov_b32_e32 v94, 0xb2951928 | |
v_mac_f32_e32 v8, v88, v83 | |
v_rcp_f32_e32 v83, v92 | |
v_madak_f32_e32 v94, v94, v91, 0xb85ffb93 | |
v_mov_b32_e32 v95, 0x35c55945 | |
v_madak_f32_e32 v95, v95, v91, 0x3a83ca0c | |
v_madak_f32_e32 v94, v94, v91, 0xbc9ded90 | |
v_madak_f32_e32 v95, v95, v91, 0x3d8eaf3b | |
v_madak_f32_e32 v91, v94, v91, 0xbf409397 | |
v_mac_f32_e32 v91, v90, v95 | |
v_mul_f32_e32 v83, v37, v83 | |
v_mul_f32_e32 v84, v88, v93 | |
v_mul_f32_e32 v83, v91, v83 | |
v_mul_f32_e32 v96, v96, v97 | |
v_mac_f32_e32 v83, v89, v84 | |
v_mac_f32_e32 v96, v83, v87 | |
v_mul_f32_e32 v83, s18, v85 | |
v_mul_f32_e32 v83, v89, v83 | |
v_and_b32_e32 v84, s27, v83 | |
v_mov_b32_e32 v85, 0x3fa00000 | |
v_cmp_gt_f32_e64 s[4:5], v85, v84 | |
v_mul_f32_e32 v85, v84, v84 | |
v_rcp_f32_e32 v90, v85 | |
v_add_f32_e32 v91, -1.0, v84 | |
v_mov_b32_e32 v93, 0xbd777f97 | |
v_mov_b32_e32 v94, 0x4036db6e | |
v_cndmask_b32_e64 v90, v90, v91, s[4:5] | |
v_mov_b32_e32 v91, 0x3f580000 | |
v_cmp_gt_f32_e64 s[8:9], v91, v84 | |
v_cndmask_b32_e64 v85, v90, v85, s[8:9] | |
v_mov_b32_e32 v91, 0xc1b38712 | |
v_madak_f32_e32 v93, v93, v85, 0x40d23f7c | |
v_madak_f32_e32 v91, v91, v85, 0x43ed43a7 | |
v_madak_f32_e32 v93, v85, v93, 0x42d9451f | |
v_madak_f32_e32 v91, v85, v91, 0x451f90ce | |
v_madak_f32_e32 v93, v85, v93, 0x43d6810b | |
v_madak_f32_e32 v91, v85, v91, 0x4547fdbb | |
v_madak_f32_e32 v93, v85, v93, 0x442158c9 | |
v_madak_f32_e32 v91, v85, v91, 0x44c01759 | |
v_madak_f32_e32 v93, v85, v93, 0x43d9486f | |
v_madak_f32_e32 v91, v85, v91, 0x43a2e571 | |
v_madak_f32_e32 v93, v85, v93, 0x4309a863 | |
v_cmp_gt_f32_e32 vcc, v94, v84 | |
v_madak_f32_e32 v91, v85, v91, 0x41f2b459 | |
v_madak_f32_e32 v93, v85, v93, 0x419d35ce | |
v_cndmask_b32_e32 v91, v91, v93, vcc | |
v_mov_b32_e32 v93, 0x3c445aa3 | |
v_madak_f32_e32 v93, v93, v85, 0x3c5f6e13 | |
v_madak_f32_e32 v93, v85, v93, 0x3e013307 | |
v_madak_f32_e32 v93, v85, v93, 0x3d931ae7 | |
v_madak_f32_e32 v93, v85, v93, 0x3f0a5785 | |
v_mov_b32_e32 v92, 0xc11d077e | |
v_madak_f32_e32 v93, v85, v93, 0x3dd9f331 | |
v_mov_b32_e32 v90, 0xc3f1c275 | |
v_madak_f32_e32 v92, v92, v85, 0xc2a2932b | |
v_cndmask_b32_e64 v91, v91, v93, s[4:5] | |
v_mov_b32_e32 v93, 0xb684e21a | |
v_madak_f32_e32 v90, v90, v85, 0xc480230b | |
v_madak_f32_e32 v92, v85, v92, 0xc3389ae7 | |
v_madak_f32_e32 v93, v93, v85, 0x390aee49 | |
v_madak_f32_e32 v90, v85, v90, 0xc41f6441 | |
v_madak_f32_e32 v92, v85, v92, 0xc322658c | |
v_madak_f32_e32 v93, v85, v93, 0x3ba68116 | |
v_madak_f32_e32 v90, v85, v90, 0xc320a2ea | |
v_madak_f32_e32 v92, v85, v92, 0xc2798057 | |
v_madak_f32_e32 v93, v85, v93, 0x3d852a63 | |
v_madak_f32_e32 v90, v85, v90, 0xc18e104b | |
v_madak_f32_e32 v92, v85, v92, 0xc128f022 | |
v_madak_f32_e32 v93, v85, v93, 0x3ecbbbce | |
v_madak_f32_e32 v90, v85, v90, 0xbf4c9dd4 | |
v_madak_f32_e32 v92, v85, v92, 0xbf31a0b7 | |
v_cndmask_b32_e64 v91, v91, v93, s[8:9] | |
v_and_b32_e32 v93, s50, v83 | |
v_mov_b32_e32 v94, 0xbf100000 | |
v_madak_f32_e32 v90, v85, v90, 0xbc21a092 | |
v_madak_f32_e32 v92, v85, v92, 0xbc21a093 | |
v_mad_f32 v94, v93, -v93, v94 | |
v_cndmask_b32_e32 v90, v90, v92, vcc | |
v_cmp_gt_f32_e32 vcc, 0, v94 | |
v_cndmask_b32_e64 v95, 0.5, -0.5, vcc | |
v_mov_b32_e32 v97, 0x3fb8aa3b | |
v_mac_f32_e32 v95, v97, v94 | |
v_cvt_i32_f32_e32 v95, v95 | |
v_mov_b32_e32 v92, 0xbb0df9c0 | |
v_madak_f32_e32 v92, v92, v85, 0x3d1151b3 | |
v_madak_f32_e32 v92, v85, v92, 0xbde31cc2 | |
v_cvt_f32_i32_e32 v100, v95 | |
v_madak_f32_e32 v92, v85, v92, 0x3ea2fe54 | |
v_mov_b32_e32 v101, 0xbf317180 | |
v_madak_f32_e32 v92, v85, v92, 0xbebe9208 | |
v_mad_f32 v102, v101, v100, v94 | |
v_mov_b32_e32 v103, 0xb717f7d1 | |
v_madak_f32_e32 v92, v85, v92, 0x3ed46805 | |
v_mad_f32 v104, v103, v100, v102 | |
v_madak_f32_e32 v92, v85, v92, 0xbb1acdc6 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mov_b32_e32 v106, 0xb5ddea0e | |
v_mov_b32_e32 v107, 0x3331bb4c | |
v_cndmask_b32_e64 v90, v90, v92, s[4:5] | |
v_mov_b32_e32 v92, 0xb7c756b1 | |
v_mad_f32 v108, v107, v105, v106 | |
v_mov_b32_e32 v109, 0x388ab355 | |
v_madak_f32_e32 v92, v92, v85, 0xbbbd1489 | |
v_mad_f32 v108, v108, v105, v109 | |
v_mov_b32_e32 v110, 0xbb360b61 | |
v_madak_f32_e32 v92, v85, v92, 0xbce9528f | |
v_mad_f32 v108, v108, v105, v110 | |
v_mov_b32_e32 v111, 0x3e2aaaab | |
v_madak_f32_e32 v92, v85, v92, 0xbea66beb | |
v_mad_f32 v108, v108, v105, v111 | |
v_madak_f32_e32 v92, v85, v92, 0x3e0375d4 | |
v_mad_f32 v105, -v105, v108, v104 | |
v_mad_f32 v85, v85, v91, 1.0 | |
v_mov_b32_e32 v91, 0x6f800000 | |
v_cmp_gt_f32_e64 vcc, |v85|, v91 | |
v_mov_b32_e32 v98, 0x2f800000 | |
v_sub_f32_e32 v108, 2.0, v105 | |
v_cndmask_b32_e32 v99, 1.0, v98, vcc | |
v_cmp_gt_f32_e64 vcc, |v108|, v91 | |
v_mul_f32_e32 v85, v99, v85 | |
v_cndmask_b32_e32 v112, 1.0, v98, vcc | |
v_mul_f32_e64 v108, v108, -v112 | |
v_rcp_f32_e32 v85, v85 | |
v_rcp_f32_e32 v108, v108 | |
v_cndmask_b32_e64 v90, v90, v92, s[8:9] | |
v_lshlrev_b32_e32 v92, 23, v95 | |
v_mul_f32_e32 v85, v85, v90 | |
v_mul_f32_e32 v90, v105, v104 | |
v_mul_f32_e32 v90, v108, v90 | |
v_mul_f32_e32 v90, v90, v112 | |
v_mad_f32 v90, -v100, v103, v90 | |
v_subrev_f32_e32 v90, v102, v90 | |
v_sub_f32_e32 v90, 1.0, v90 | |
v_add_i32_e32 v90, vcc, v90, v92 | |
v_mov_b32_e32 v92, 0xc2aeac4f | |
v_cmp_nlt_f32_e32 vcc, v94, v92 | |
v_mov_b32_e32 v95, 0x42b17218 | |
v_cndmask_b32_e32 v90, 0, v90, vcc | |
v_cmp_lt_f32_e32 vcc, v94, v95 | |
v_mov_b32_e32 v100, 0x7f800000 | |
v_cndmask_b32_e32 v90, v100, v90, vcc | |
v_cmp_u_f32_e32 vcc, v94, v94 | |
v_cndmask_b32_e32 v90, v90, v94, vcc | |
v_subrev_f32_e32 v94, v84, v93 | |
v_mul_f32_e32 v102, v85, v99 | |
v_add_f32_e32 v93, v84, v93 | |
v_mad_f32 v93, v93, v94, v102 | |
v_cmp_gt_f32_e32 vcc, 0, v93 | |
v_cndmask_b32_e64 v94, 0.5, -0.5, vcc | |
v_mac_f32_e32 v94, v97, v93 | |
v_cvt_i32_f32_e32 v94, v94 | |
v_madak_f32_e32 v85, v99, v85, 0x3f58560b | |
v_mad_f32 v82, -v96, v81, v82 | |
v_mad_f32 v26, v81, v96, v26 | |
v_cvt_f32_i32_e32 v97, v94 | |
v_lshlrev_b32_e32 v94, 23, v94 | |
v_mad_f32 v79, -v96, v80, v79 | |
v_mad_f32 v25, v80, v96, v25 | |
v_mad_f32 v101, v101, v97, v93 | |
v_mad_f32 v104, v103, v97, v101 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mac_f32_e32 v106, v107, v105 | |
v_mac_f32_e32 v109, v106, v105 | |
v_mac_f32_e32 v110, v109, v105 | |
v_mac_f32_e32 v111, v110, v105 | |
v_mad_f32 v105, -v105, v111, v104 | |
v_mul_f32_e32 v104, v105, v104 | |
v_sub_f32_e32 v105, 2.0, v105 | |
v_cmp_gt_f32_e64 vcc, |v105|, v91 | |
v_cndmask_b32_e32 v106, 1.0, v98, vcc | |
v_mul_f32_e64 v105, v105, -v106 | |
v_rcp_f32_e32 v105, v105 | |
v_mad_f32 v76, -v96, v78, v76 | |
v_mad_f32 v24, v78, v96, v24 | |
v_mad_f32 v86, -v67, v0, v86 | |
v_mul_f32_e32 v104, v105, v104 | |
v_mul_f32_e32 v104, v104, v106 | |
v_mad_f32 v97, -v97, v103, v104 | |
v_subrev_f32_e32 v97, v101, v97 | |
v_sub_f32_e32 v97, 1.0, v97 | |
v_add_i32_e32 v94, vcc, v97, v94 | |
v_cmp_nlt_f32_e32 vcc, v93, v92 | |
v_cndmask_b32_e32 v92, 0, v94, vcc | |
v_cmp_lt_f32_e32 vcc, v93, v95 | |
v_cndmask_b32_e32 v92, v100, v92, vcc | |
v_cmp_gt_f32_e64 vcc, |v84|, v91 | |
v_cndmask_b32_e32 v91, 1.0, v98, vcc | |
v_mul_f32_e32 v94, v91, v84 | |
v_rcp_f32_e32 v94, v94 | |
v_cmp_u_f32_e32 vcc, v93, v93 | |
v_cndmask_b32_e32 v92, v92, v93, vcc | |
v_mul_f32_e32 v90, v92, v90 | |
v_mov_b32_e32 v92, 0x40c00000 | |
v_cmp_gt_f32_e32 vcc, v92, v84 | |
v_mov_b32_e32 v92, 0x31800000 | |
v_mul_f32_e32 v90, v94, v90 | |
v_cmp_gt_f32_e64 s[10:11], v92, v84 | |
v_mad_f32 v84, -v91, v90, 1.0 | |
v_cndmask_b32_e32 v84, 1.0, v84, vcc | |
v_cndmask_b32_e64 v84, v84, v85, s[4:5] | |
v_and_b32_e32 v85, s51, v83 | |
v_or_b32_e32 v84, v85, v84 | |
v_mad_f32 v85, v102, v83, v83 | |
v_cndmask_b32_e64 v84, v84, v85, s[8:9] | |
v_mul_f32_e32 v85, 0x3f8375d4, v83 | |
v_mac_f32_e32 v85, 0x41000000, v83 | |
v_mul_f32_e32 v85, 0x3e000000, v85 | |
v_cndmask_b32_e64 v84, v84, v85, s[10:11] | |
v_cmp_u_f32_e32 vcc, v83, v83 | |
v_cndmask_b32_e32 v83, v84, v83, vcc | |
v_subrev_f32_e32 v83, v83, v88 | |
v_mul_f32_e64 v84, s19, -v88 | |
v_mac_f32_e32 v84, v83, v89 | |
v_mac_f32_e32 v5, v84, v87 | |
v_mac_f32_e32 v27, v0, v67 | |
BB6_74: ; %Flow1238 | |
; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[34:35] | |
BB6_75: ; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[12:13] | |
v_lshrrev_b32_e32 v67, 13, v69 | |
v_and_b32_e32 v67, 1, v67 | |
v_cmp_eq_u32_e32 vcc, 1, v67 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[12:13], exec, s[4:5] | |
; mask branch BB6_79 | |
s_cbranch_execz BB6_79 | |
BB6_76: ; in Loop: Header=BB6_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[87:90], v55 offset0:80 offset1:81 | |
s_and_b64 s[4:5], exec, s[0:1] | |
v_cmp_ne_u32_e32 vcc, v60, v77 | |
s_xor_b64 s[4:5], s[4:5], -1 | |
s_or_b64 s[4:5], s[4:5], vcc | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v80, v73, v88 | |
s_and_b64 s[8:9], exec, s[2:3] | |
s_or_b64 s[4:5], s[8:9], s[4:5] | |
v_subrev_f32_e32 v78, v72, v87 | |
v_mul_f32_e32 v83, v80, v80 | |
v_cndmask_b32_e64 v84, 0, 1.0, s[4:5] | |
v_subrev_f32_e32 v81, v74, v89 | |
v_mac_f32_e32 v83, v78, v78 | |
v_mac_f32_e32 v83, v81, v81 | |
v_mul_f32_e32 v84, s22, v84 | |
v_subrev_f32_e32 v67, v75, v90 | |
v_cmp_lt_f32_e32 vcc, v83, v84 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[34:35], exec, s[4:5] | |
; mask branch BB6_78 | |
s_cbranch_execz BB6_78 | |
BB6_77: ; in Loop: Header=BB6_11 Depth=1 | |
v_max_f32_e32 v85, 0x34cd15ae, v83 | |
v_rsq_f32_e32 v89, v85 | |
v_lshrrev_b32_e32 v83, 13, v65 | |
v_and_b32_e32 v83, 1, v83 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e32 vcc, 1, v83 | |
ds_read_b64 v[83:84], v56 offset:320 | |
v_mul_f32_e32 v93, v89, v89 | |
v_cndmask_b32_e64 v88, 0, 1.0, vcc | |
v_mul_f32_e32 v94, v93, v93 | |
v_mul_f32_e32 v94, v88, v94 | |
v_mul_f32_e32 v95, v93, v94 | |
v_mad_f32 v94, v94, v93, s23 | |
v_mul_f32_e32 v87, v75, v90 | |
v_mul_f32_e32 v90, v9, v85 | |
v_mul_f32_e32 v91, v90, v90 | |
v_mov_b32_e32 v92, 0x3a92b707 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v83, v70, v83 | |
v_mul_f32_e32 v84, v71, v84 | |
v_mul_f32_e32 v94, 0xbe2aaaab, v94 | |
v_madak_f32_e32 v92, v92, v91, 0x3ded3cb2 | |
v_mad_f32 v96, v84, v95, -v83 | |
v_mul_f32_e32 v97, v93, v95 | |
v_mad_f32 v95, v95, v95, s26 | |
v_mul_f32_e32 v83, v83, v94 | |
v_mov_b32_e32 v94, 0x3c739487 | |
v_mul_f32_e32 v84, v95, v84 | |
v_madak_f32_e32 v94, v94, v91, 0x3f01e2bc | |
v_mad_f32 v92, v92, v91, 1.0 | |
v_mac_f32_e32 v83, 0x3daaaaaa, v84 | |
v_mac_f32_e32 v92, v90, v94 | |
v_mov_b32_e32 v94, 0xb2951928 | |
v_mac_f32_e32 v8, v88, v83 | |
v_rcp_f32_e32 v83, v92 | |
v_madak_f32_e32 v94, v94, v91, 0xb85ffb93 | |
v_mov_b32_e32 v95, 0x35c55945 | |
v_madak_f32_e32 v95, v95, v91, 0x3a83ca0c | |
v_madak_f32_e32 v94, v94, v91, 0xbc9ded90 | |
v_madak_f32_e32 v95, v95, v91, 0x3d8eaf3b | |
v_madak_f32_e32 v91, v94, v91, 0xbf409397 | |
v_mac_f32_e32 v91, v90, v95 | |
v_mul_f32_e32 v83, v37, v83 | |
v_mul_f32_e32 v84, v88, v93 | |
v_mul_f32_e32 v83, v91, v83 | |
v_mul_f32_e32 v96, v96, v97 | |
v_mac_f32_e32 v83, v89, v84 | |
v_mac_f32_e32 v96, v83, v87 | |
v_mul_f32_e32 v83, s18, v85 | |
v_mul_f32_e32 v83, v89, v83 | |
v_and_b32_e32 v84, s27, v83 | |
v_mov_b32_e32 v85, 0x3fa00000 | |
v_cmp_gt_f32_e64 s[4:5], v85, v84 | |
v_mul_f32_e32 v85, v84, v84 | |
v_rcp_f32_e32 v90, v85 | |
v_add_f32_e32 v91, -1.0, v84 | |
v_mov_b32_e32 v93, 0xbd777f97 | |
v_mov_b32_e32 v94, 0x4036db6e | |
v_cndmask_b32_e64 v90, v90, v91, s[4:5] | |
v_mov_b32_e32 v91, 0x3f580000 | |
v_cmp_gt_f32_e64 s[8:9], v91, v84 | |
v_cndmask_b32_e64 v85, v90, v85, s[8:9] | |
v_mov_b32_e32 v91, 0xc1b38712 | |
v_madak_f32_e32 v93, v93, v85, 0x40d23f7c | |
v_madak_f32_e32 v91, v91, v85, 0x43ed43a7 | |
v_madak_f32_e32 v93, v85, v93, 0x42d9451f | |
v_madak_f32_e32 v91, v85, v91, 0x451f90ce | |
v_madak_f32_e32 v93, v85, v93, 0x43d6810b | |
v_madak_f32_e32 v91, v85, v91, 0x4547fdbb | |
v_madak_f32_e32 v93, v85, v93, 0x442158c9 | |
v_madak_f32_e32 v91, v85, v91, 0x44c01759 | |
v_madak_f32_e32 v93, v85, v93, 0x43d9486f | |
v_madak_f32_e32 v91, v85, v91, 0x43a2e571 | |
v_madak_f32_e32 v93, v85, v93, 0x4309a863 | |
v_cmp_gt_f32_e32 vcc, v94, v84 | |
v_madak_f32_e32 v91, v85, v91, 0x41f2b459 | |
v_madak_f32_e32 v93, v85, v93, 0x419d35ce | |
v_cndmask_b32_e32 v91, v91, v93, vcc | |
v_mov_b32_e32 v93, 0x3c445aa3 | |
v_madak_f32_e32 v93, v93, v85, 0x3c5f6e13 | |
v_madak_f32_e32 v93, v85, v93, 0x3e013307 | |
v_madak_f32_e32 v93, v85, v93, 0x3d931ae7 | |
v_madak_f32_e32 v93, v85, v93, 0x3f0a5785 | |
v_mov_b32_e32 v92, 0xc11d077e | |
v_madak_f32_e32 v93, v85, v93, 0x3dd9f331 | |
v_mov_b32_e32 v90, 0xc3f1c275 | |
v_madak_f32_e32 v92, v92, v85, 0xc2a2932b | |
v_cndmask_b32_e64 v91, v91, v93, s[4:5] | |
v_mov_b32_e32 v93, 0xb684e21a | |
v_madak_f32_e32 v90, v90, v85, 0xc480230b | |
v_madak_f32_e32 v92, v85, v92, 0xc3389ae7 | |
v_madak_f32_e32 v93, v93, v85, 0x390aee49 | |
v_madak_f32_e32 v90, v85, v90, 0xc41f6441 | |
v_madak_f32_e32 v92, v85, v92, 0xc322658c | |
v_madak_f32_e32 v93, v85, v93, 0x3ba68116 | |
v_madak_f32_e32 v90, v85, v90, 0xc320a2ea | |
v_madak_f32_e32 v92, v85, v92, 0xc2798057 | |
v_madak_f32_e32 v93, v85, v93, 0x3d852a63 | |
v_madak_f32_e32 v90, v85, v90, 0xc18e104b | |
v_madak_f32_e32 v92, v85, v92, 0xc128f022 | |
v_madak_f32_e32 v93, v85, v93, 0x3ecbbbce | |
v_madak_f32_e32 v90, v85, v90, 0xbf4c9dd4 | |
v_madak_f32_e32 v92, v85, v92, 0xbf31a0b7 | |
v_cndmask_b32_e64 v91, v91, v93, s[8:9] | |
v_and_b32_e32 v93, s50, v83 | |
v_mov_b32_e32 v94, 0xbf100000 | |
v_madak_f32_e32 v90, v85, v90, 0xbc21a092 | |
v_madak_f32_e32 v92, v85, v92, 0xbc21a093 | |
v_mad_f32 v94, v93, -v93, v94 | |
v_cndmask_b32_e32 v90, v90, v92, vcc | |
v_cmp_gt_f32_e32 vcc, 0, v94 | |
v_cndmask_b32_e64 v95, 0.5, -0.5, vcc | |
v_mov_b32_e32 v97, 0x3fb8aa3b | |
v_mac_f32_e32 v95, v97, v94 | |
v_cvt_i32_f32_e32 v95, v95 | |
v_mov_b32_e32 v92, 0xbb0df9c0 | |
v_madak_f32_e32 v92, v92, v85, 0x3d1151b3 | |
v_madak_f32_e32 v92, v85, v92, 0xbde31cc2 | |
v_cvt_f32_i32_e32 v100, v95 | |
v_madak_f32_e32 v92, v85, v92, 0x3ea2fe54 | |
v_mov_b32_e32 v101, 0xbf317180 | |
v_madak_f32_e32 v92, v85, v92, 0xbebe9208 | |
v_mad_f32 v102, v101, v100, v94 | |
v_mov_b32_e32 v103, 0xb717f7d1 | |
v_madak_f32_e32 v92, v85, v92, 0x3ed46805 | |
v_mad_f32 v104, v103, v100, v102 | |
v_madak_f32_e32 v92, v85, v92, 0xbb1acdc6 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mov_b32_e32 v106, 0xb5ddea0e | |
v_mov_b32_e32 v107, 0x3331bb4c | |
v_cndmask_b32_e64 v90, v90, v92, s[4:5] | |
v_mov_b32_e32 v92, 0xb7c756b1 | |
v_mad_f32 v108, v107, v105, v106 | |
v_mov_b32_e32 v109, 0x388ab355 | |
v_madak_f32_e32 v92, v92, v85, 0xbbbd1489 | |
v_mad_f32 v108, v108, v105, v109 | |
v_mov_b32_e32 v110, 0xbb360b61 | |
v_madak_f32_e32 v92, v85, v92, 0xbce9528f | |
v_mad_f32 v108, v108, v105, v110 | |
v_mov_b32_e32 v111, 0x3e2aaaab | |
v_madak_f32_e32 v92, v85, v92, 0xbea66beb | |
v_mad_f32 v108, v108, v105, v111 | |
v_madak_f32_e32 v92, v85, v92, 0x3e0375d4 | |
v_mad_f32 v105, -v105, v108, v104 | |
v_mad_f32 v85, v85, v91, 1.0 | |
v_mov_b32_e32 v91, 0x6f800000 | |
v_cmp_gt_f32_e64 vcc, |v85|, v91 | |
v_mov_b32_e32 v98, 0x2f800000 | |
v_sub_f32_e32 v108, 2.0, v105 | |
v_cndmask_b32_e32 v99, 1.0, v98, vcc | |
v_cmp_gt_f32_e64 vcc, |v108|, v91 | |
v_mul_f32_e32 v85, v99, v85 | |
v_cndmask_b32_e32 v112, 1.0, v98, vcc | |
v_mul_f32_e64 v108, v108, -v112 | |
v_rcp_f32_e32 v85, v85 | |
v_rcp_f32_e32 v108, v108 | |
v_cndmask_b32_e64 v90, v90, v92, s[8:9] | |
v_lshlrev_b32_e32 v92, 23, v95 | |
v_mul_f32_e32 v85, v85, v90 | |
v_mul_f32_e32 v90, v105, v104 | |
v_mul_f32_e32 v90, v108, v90 | |
v_mul_f32_e32 v90, v90, v112 | |
v_mad_f32 v90, -v100, v103, v90 | |
v_subrev_f32_e32 v90, v102, v90 | |
v_sub_f32_e32 v90, 1.0, v90 | |
v_add_i32_e32 v90, vcc, v90, v92 | |
v_mov_b32_e32 v92, 0xc2aeac4f | |
v_cmp_nlt_f32_e32 vcc, v94, v92 | |
v_mov_b32_e32 v95, 0x42b17218 | |
v_cndmask_b32_e32 v90, 0, v90, vcc | |
v_cmp_lt_f32_e32 vcc, v94, v95 | |
v_mov_b32_e32 v100, 0x7f800000 | |
v_cndmask_b32_e32 v90, v100, v90, vcc | |
v_cmp_u_f32_e32 vcc, v94, v94 | |
v_cndmask_b32_e32 v90, v90, v94, vcc | |
v_subrev_f32_e32 v94, v84, v93 | |
v_mul_f32_e32 v102, v85, v99 | |
v_add_f32_e32 v93, v84, v93 | |
v_mad_f32 v93, v93, v94, v102 | |
v_cmp_gt_f32_e32 vcc, 0, v93 | |
v_cndmask_b32_e64 v94, 0.5, -0.5, vcc | |
v_mac_f32_e32 v94, v97, v93 | |
v_cvt_i32_f32_e32 v94, v94 | |
v_madak_f32_e32 v85, v99, v85, 0x3f58560b | |
v_mad_f32 v82, -v96, v81, v82 | |
v_mad_f32 v22, v81, v96, v22 | |
v_cvt_f32_i32_e32 v97, v94 | |
v_lshlrev_b32_e32 v94, 23, v94 | |
v_mad_f32 v79, -v96, v80, v79 | |
v_mad_f32 v21, v80, v96, v21 | |
v_mad_f32 v101, v101, v97, v93 | |
v_mad_f32 v104, v103, v97, v101 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mac_f32_e32 v106, v107, v105 | |
v_mac_f32_e32 v109, v106, v105 | |
v_mac_f32_e32 v110, v109, v105 | |
v_mac_f32_e32 v111, v110, v105 | |
v_mad_f32 v105, -v105, v111, v104 | |
v_mul_f32_e32 v104, v105, v104 | |
v_sub_f32_e32 v105, 2.0, v105 | |
v_cmp_gt_f32_e64 vcc, |v105|, v91 | |
v_cndmask_b32_e32 v106, 1.0, v98, vcc | |
v_mul_f32_e64 v105, v105, -v106 | |
v_rcp_f32_e32 v105, v105 | |
v_mad_f32 v76, -v96, v78, v76 | |
v_mad_f32 v20, v78, v96, v20 | |
v_mad_f32 v86, -v67, v0, v86 | |
v_mul_f32_e32 v104, v105, v104 | |
v_mul_f32_e32 v104, v104, v106 | |
v_mad_f32 v97, -v97, v103, v104 | |
v_subrev_f32_e32 v97, v101, v97 | |
v_sub_f32_e32 v97, 1.0, v97 | |
v_add_i32_e32 v94, vcc, v97, v94 | |
v_cmp_nlt_f32_e32 vcc, v93, v92 | |
v_cndmask_b32_e32 v92, 0, v94, vcc | |
v_cmp_lt_f32_e32 vcc, v93, v95 | |
v_cndmask_b32_e32 v92, v100, v92, vcc | |
v_cmp_gt_f32_e64 vcc, |v84|, v91 | |
v_cndmask_b32_e32 v91, 1.0, v98, vcc | |
v_mul_f32_e32 v94, v91, v84 | |
v_rcp_f32_e32 v94, v94 | |
v_cmp_u_f32_e32 vcc, v93, v93 | |
v_cndmask_b32_e32 v92, v92, v93, vcc | |
v_mul_f32_e32 v90, v92, v90 | |
v_mov_b32_e32 v92, 0x40c00000 | |
v_cmp_gt_f32_e32 vcc, v92, v84 | |
v_mov_b32_e32 v92, 0x31800000 | |
v_mul_f32_e32 v90, v94, v90 | |
v_cmp_gt_f32_e64 s[10:11], v92, v84 | |
v_mad_f32 v84, -v91, v90, 1.0 | |
v_cndmask_b32_e32 v84, 1.0, v84, vcc | |
v_cndmask_b32_e64 v84, v84, v85, s[4:5] | |
v_and_b32_e32 v85, s51, v83 | |
v_or_b32_e32 v84, v85, v84 | |
v_mad_f32 v85, v102, v83, v83 | |
v_cndmask_b32_e64 v84, v84, v85, s[8:9] | |
v_mul_f32_e32 v85, 0x3f8375d4, v83 | |
v_mac_f32_e32 v85, 0x41000000, v83 | |
v_mul_f32_e32 v85, 0x3e000000, v85 | |
v_cndmask_b32_e64 v84, v84, v85, s[10:11] | |
v_cmp_u_f32_e32 vcc, v83, v83 | |
v_cndmask_b32_e32 v83, v84, v83, vcc | |
v_subrev_f32_e32 v83, v83, v88 | |
v_mul_f32_e64 v84, s19, -v88 | |
v_mac_f32_e32 v84, v83, v89 | |
v_mac_f32_e32 v5, v84, v87 | |
v_mac_f32_e32 v23, v0, v67 | |
BB6_78: ; %Flow1237 | |
; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[34:35] | |
BB6_79: ; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[12:13] | |
v_lshrrev_b32_e32 v67, 14, v69 | |
v_and_b32_e32 v67, 1, v67 | |
v_cmp_eq_u32_e32 vcc, 1, v67 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[12:13], exec, s[4:5] | |
; mask branch BB6_83 | |
s_cbranch_execz BB6_83 | |
BB6_80: ; in Loop: Header=BB6_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[87:90], v55 offset0:96 offset1:97 | |
s_and_b64 s[4:5], exec, s[0:1] | |
v_cmp_ne_u32_e32 vcc, v59, v77 | |
s_xor_b64 s[4:5], s[4:5], -1 | |
s_or_b64 s[4:5], s[4:5], vcc | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v80, v73, v88 | |
s_and_b64 s[8:9], exec, s[2:3] | |
s_or_b64 s[4:5], s[8:9], s[4:5] | |
v_subrev_f32_e32 v78, v72, v87 | |
v_mul_f32_e32 v83, v80, v80 | |
v_cndmask_b32_e64 v84, 0, 1.0, s[4:5] | |
v_subrev_f32_e32 v81, v74, v89 | |
v_mac_f32_e32 v83, v78, v78 | |
v_mac_f32_e32 v83, v81, v81 | |
v_mul_f32_e32 v84, s22, v84 | |
v_subrev_f32_e32 v67, v75, v90 | |
v_cmp_lt_f32_e32 vcc, v83, v84 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[34:35], exec, s[4:5] | |
; mask branch BB6_82 | |
s_cbranch_execz BB6_82 | |
BB6_81: ; in Loop: Header=BB6_11 Depth=1 | |
v_max_f32_e32 v85, 0x34cd15ae, v83 | |
v_rsq_f32_e32 v89, v85 | |
v_lshrrev_b32_e32 v83, 14, v65 | |
v_and_b32_e32 v83, 1, v83 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e32 vcc, 1, v83 | |
ds_read_b64 v[83:84], v56 offset:384 | |
v_mul_f32_e32 v93, v89, v89 | |
v_cndmask_b32_e64 v88, 0, 1.0, vcc | |
v_mul_f32_e32 v94, v93, v93 | |
v_mul_f32_e32 v94, v88, v94 | |
v_mul_f32_e32 v95, v93, v94 | |
v_mad_f32 v94, v94, v93, s23 | |
v_mul_f32_e32 v87, v75, v90 | |
v_mul_f32_e32 v90, v9, v85 | |
v_mul_f32_e32 v91, v90, v90 | |
v_mov_b32_e32 v92, 0x3a92b707 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v83, v70, v83 | |
v_mul_f32_e32 v84, v71, v84 | |
v_mul_f32_e32 v94, 0xbe2aaaab, v94 | |
v_madak_f32_e32 v92, v92, v91, 0x3ded3cb2 | |
v_mad_f32 v96, v84, v95, -v83 | |
v_mul_f32_e32 v97, v93, v95 | |
v_mad_f32 v95, v95, v95, s26 | |
v_mul_f32_e32 v83, v83, v94 | |
v_mov_b32_e32 v94, 0x3c739487 | |
v_mul_f32_e32 v84, v95, v84 | |
v_madak_f32_e32 v94, v94, v91, 0x3f01e2bc | |
v_mad_f32 v92, v92, v91, 1.0 | |
v_mac_f32_e32 v83, 0x3daaaaaa, v84 | |
v_mac_f32_e32 v92, v90, v94 | |
v_mov_b32_e32 v94, 0xb2951928 | |
v_mac_f32_e32 v8, v88, v83 | |
v_rcp_f32_e32 v83, v92 | |
v_madak_f32_e32 v94, v94, v91, 0xb85ffb93 | |
v_mov_b32_e32 v95, 0x35c55945 | |
v_madak_f32_e32 v95, v95, v91, 0x3a83ca0c | |
v_madak_f32_e32 v94, v94, v91, 0xbc9ded90 | |
v_madak_f32_e32 v95, v95, v91, 0x3d8eaf3b | |
v_madak_f32_e32 v91, v94, v91, 0xbf409397 | |
v_mac_f32_e32 v91, v90, v95 | |
v_mul_f32_e32 v83, v37, v83 | |
v_mul_f32_e32 v84, v88, v93 | |
v_mul_f32_e32 v83, v91, v83 | |
v_mul_f32_e32 v96, v96, v97 | |
v_mac_f32_e32 v83, v89, v84 | |
v_mac_f32_e32 v96, v83, v87 | |
v_mul_f32_e32 v83, s18, v85 | |
v_mul_f32_e32 v83, v89, v83 | |
v_and_b32_e32 v84, s27, v83 | |
v_mov_b32_e32 v85, 0x3fa00000 | |
v_cmp_gt_f32_e64 s[4:5], v85, v84 | |
v_mul_f32_e32 v85, v84, v84 | |
v_rcp_f32_e32 v90, v85 | |
v_add_f32_e32 v91, -1.0, v84 | |
v_mov_b32_e32 v93, 0xbd777f97 | |
v_mov_b32_e32 v94, 0x4036db6e | |
v_cndmask_b32_e64 v90, v90, v91, s[4:5] | |
v_mov_b32_e32 v91, 0x3f580000 | |
v_cmp_gt_f32_e64 s[8:9], v91, v84 | |
v_cndmask_b32_e64 v85, v90, v85, s[8:9] | |
v_mov_b32_e32 v91, 0xc1b38712 | |
v_madak_f32_e32 v93, v93, v85, 0x40d23f7c | |
v_madak_f32_e32 v91, v91, v85, 0x43ed43a7 | |
v_madak_f32_e32 v93, v85, v93, 0x42d9451f | |
v_madak_f32_e32 v91, v85, v91, 0x451f90ce | |
v_madak_f32_e32 v93, v85, v93, 0x43d6810b | |
v_madak_f32_e32 v91, v85, v91, 0x4547fdbb | |
v_madak_f32_e32 v93, v85, v93, 0x442158c9 | |
v_madak_f32_e32 v91, v85, v91, 0x44c01759 | |
v_madak_f32_e32 v93, v85, v93, 0x43d9486f | |
v_madak_f32_e32 v91, v85, v91, 0x43a2e571 | |
v_madak_f32_e32 v93, v85, v93, 0x4309a863 | |
v_cmp_gt_f32_e32 vcc, v94, v84 | |
v_madak_f32_e32 v91, v85, v91, 0x41f2b459 | |
v_madak_f32_e32 v93, v85, v93, 0x419d35ce | |
v_cndmask_b32_e32 v91, v91, v93, vcc | |
v_mov_b32_e32 v93, 0x3c445aa3 | |
v_madak_f32_e32 v93, v93, v85, 0x3c5f6e13 | |
v_madak_f32_e32 v93, v85, v93, 0x3e013307 | |
v_madak_f32_e32 v93, v85, v93, 0x3d931ae7 | |
v_madak_f32_e32 v93, v85, v93, 0x3f0a5785 | |
v_mov_b32_e32 v92, 0xc11d077e | |
v_madak_f32_e32 v93, v85, v93, 0x3dd9f331 | |
v_mov_b32_e32 v90, 0xc3f1c275 | |
v_madak_f32_e32 v92, v92, v85, 0xc2a2932b | |
v_cndmask_b32_e64 v91, v91, v93, s[4:5] | |
v_mov_b32_e32 v93, 0xb684e21a | |
v_madak_f32_e32 v90, v90, v85, 0xc480230b | |
v_madak_f32_e32 v92, v85, v92, 0xc3389ae7 | |
v_madak_f32_e32 v93, v93, v85, 0x390aee49 | |
v_madak_f32_e32 v90, v85, v90, 0xc41f6441 | |
v_madak_f32_e32 v92, v85, v92, 0xc322658c | |
v_madak_f32_e32 v93, v85, v93, 0x3ba68116 | |
v_madak_f32_e32 v90, v85, v90, 0xc320a2ea | |
v_madak_f32_e32 v92, v85, v92, 0xc2798057 | |
v_madak_f32_e32 v93, v85, v93, 0x3d852a63 | |
v_madak_f32_e32 v90, v85, v90, 0xc18e104b | |
v_madak_f32_e32 v92, v85, v92, 0xc128f022 | |
v_madak_f32_e32 v93, v85, v93, 0x3ecbbbce | |
v_madak_f32_e32 v90, v85, v90, 0xbf4c9dd4 | |
v_madak_f32_e32 v92, v85, v92, 0xbf31a0b7 | |
v_cndmask_b32_e64 v91, v91, v93, s[8:9] | |
v_and_b32_e32 v93, s50, v83 | |
v_mov_b32_e32 v94, 0xbf100000 | |
v_madak_f32_e32 v90, v85, v90, 0xbc21a092 | |
v_madak_f32_e32 v92, v85, v92, 0xbc21a093 | |
v_mad_f32 v94, v93, -v93, v94 | |
v_cndmask_b32_e32 v90, v90, v92, vcc | |
v_cmp_gt_f32_e32 vcc, 0, v94 | |
v_cndmask_b32_e64 v95, 0.5, -0.5, vcc | |
v_mov_b32_e32 v97, 0x3fb8aa3b | |
v_mac_f32_e32 v95, v97, v94 | |
v_cvt_i32_f32_e32 v95, v95 | |
v_mov_b32_e32 v92, 0xbb0df9c0 | |
v_madak_f32_e32 v92, v92, v85, 0x3d1151b3 | |
v_madak_f32_e32 v92, v85, v92, 0xbde31cc2 | |
v_cvt_f32_i32_e32 v100, v95 | |
v_madak_f32_e32 v92, v85, v92, 0x3ea2fe54 | |
v_mov_b32_e32 v101, 0xbf317180 | |
v_madak_f32_e32 v92, v85, v92, 0xbebe9208 | |
v_mad_f32 v102, v101, v100, v94 | |
v_mov_b32_e32 v103, 0xb717f7d1 | |
v_madak_f32_e32 v92, v85, v92, 0x3ed46805 | |
v_mad_f32 v104, v103, v100, v102 | |
v_madak_f32_e32 v92, v85, v92, 0xbb1acdc6 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mov_b32_e32 v106, 0xb5ddea0e | |
v_mov_b32_e32 v107, 0x3331bb4c | |
v_cndmask_b32_e64 v90, v90, v92, s[4:5] | |
v_mov_b32_e32 v92, 0xb7c756b1 | |
v_mad_f32 v108, v107, v105, v106 | |
v_mov_b32_e32 v109, 0x388ab355 | |
v_madak_f32_e32 v92, v92, v85, 0xbbbd1489 | |
v_mad_f32 v108, v108, v105, v109 | |
v_mov_b32_e32 v110, 0xbb360b61 | |
v_madak_f32_e32 v92, v85, v92, 0xbce9528f | |
v_mad_f32 v108, v108, v105, v110 | |
v_mov_b32_e32 v111, 0x3e2aaaab | |
v_madak_f32_e32 v92, v85, v92, 0xbea66beb | |
v_mad_f32 v108, v108, v105, v111 | |
v_madak_f32_e32 v92, v85, v92, 0x3e0375d4 | |
v_mad_f32 v105, -v105, v108, v104 | |
v_mad_f32 v85, v85, v91, 1.0 | |
v_mov_b32_e32 v91, 0x6f800000 | |
v_cmp_gt_f32_e64 vcc, |v85|, v91 | |
v_mov_b32_e32 v98, 0x2f800000 | |
v_sub_f32_e32 v108, 2.0, v105 | |
v_cndmask_b32_e32 v99, 1.0, v98, vcc | |
v_cmp_gt_f32_e64 vcc, |v108|, v91 | |
v_mul_f32_e32 v85, v99, v85 | |
v_cndmask_b32_e32 v112, 1.0, v98, vcc | |
v_mul_f32_e64 v108, v108, -v112 | |
v_rcp_f32_e32 v85, v85 | |
v_rcp_f32_e32 v108, v108 | |
v_cndmask_b32_e64 v90, v90, v92, s[8:9] | |
v_lshlrev_b32_e32 v92, 23, v95 | |
v_mul_f32_e32 v85, v85, v90 | |
v_mul_f32_e32 v90, v105, v104 | |
v_mul_f32_e32 v90, v108, v90 | |
v_mul_f32_e32 v90, v90, v112 | |
v_mad_f32 v90, -v100, v103, v90 | |
v_subrev_f32_e32 v90, v102, v90 | |
v_sub_f32_e32 v90, 1.0, v90 | |
v_add_i32_e32 v90, vcc, v90, v92 | |
v_mov_b32_e32 v92, 0xc2aeac4f | |
v_cmp_nlt_f32_e32 vcc, v94, v92 | |
v_mov_b32_e32 v95, 0x42b17218 | |
v_cndmask_b32_e32 v90, 0, v90, vcc | |
v_cmp_lt_f32_e32 vcc, v94, v95 | |
v_mov_b32_e32 v100, 0x7f800000 | |
v_cndmask_b32_e32 v90, v100, v90, vcc | |
v_cmp_u_f32_e32 vcc, v94, v94 | |
v_cndmask_b32_e32 v90, v90, v94, vcc | |
v_subrev_f32_e32 v94, v84, v93 | |
v_mul_f32_e32 v102, v85, v99 | |
v_add_f32_e32 v93, v84, v93 | |
v_mad_f32 v93, v93, v94, v102 | |
v_cmp_gt_f32_e32 vcc, 0, v93 | |
v_cndmask_b32_e64 v94, 0.5, -0.5, vcc | |
v_mac_f32_e32 v94, v97, v93 | |
v_cvt_i32_f32_e32 v94, v94 | |
v_madak_f32_e32 v85, v99, v85, 0x3f58560b | |
v_mad_f32 v82, -v96, v81, v82 | |
v_mad_f32 v18, v81, v96, v18 | |
v_cvt_f32_i32_e32 v97, v94 | |
v_lshlrev_b32_e32 v94, 23, v94 | |
v_mad_f32 v79, -v96, v80, v79 | |
v_mad_f32 v17, v80, v96, v17 | |
v_mad_f32 v101, v101, v97, v93 | |
v_mad_f32 v104, v103, v97, v101 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mac_f32_e32 v106, v107, v105 | |
v_mac_f32_e32 v109, v106, v105 | |
v_mac_f32_e32 v110, v109, v105 | |
v_mac_f32_e32 v111, v110, v105 | |
v_mad_f32 v105, -v105, v111, v104 | |
v_mul_f32_e32 v104, v105, v104 | |
v_sub_f32_e32 v105, 2.0, v105 | |
v_cmp_gt_f32_e64 vcc, |v105|, v91 | |
v_cndmask_b32_e32 v106, 1.0, v98, vcc | |
v_mul_f32_e64 v105, v105, -v106 | |
v_rcp_f32_e32 v105, v105 | |
v_mad_f32 v76, -v96, v78, v76 | |
v_mad_f32 v16, v78, v96, v16 | |
v_mad_f32 v86, -v67, v0, v86 | |
v_mul_f32_e32 v104, v105, v104 | |
v_mul_f32_e32 v104, v104, v106 | |
v_mad_f32 v97, -v97, v103, v104 | |
v_subrev_f32_e32 v97, v101, v97 | |
v_sub_f32_e32 v97, 1.0, v97 | |
v_add_i32_e32 v94, vcc, v97, v94 | |
v_cmp_nlt_f32_e32 vcc, v93, v92 | |
v_cndmask_b32_e32 v92, 0, v94, vcc | |
v_cmp_lt_f32_e32 vcc, v93, v95 | |
v_cndmask_b32_e32 v92, v100, v92, vcc | |
v_cmp_gt_f32_e64 vcc, |v84|, v91 | |
v_cndmask_b32_e32 v91, 1.0, v98, vcc | |
v_mul_f32_e32 v94, v91, v84 | |
v_rcp_f32_e32 v94, v94 | |
v_cmp_u_f32_e32 vcc, v93, v93 | |
v_cndmask_b32_e32 v92, v92, v93, vcc | |
v_mul_f32_e32 v90, v92, v90 | |
v_mov_b32_e32 v92, 0x40c00000 | |
v_cmp_gt_f32_e32 vcc, v92, v84 | |
v_mov_b32_e32 v92, 0x31800000 | |
v_mul_f32_e32 v90, v94, v90 | |
v_cmp_gt_f32_e64 s[10:11], v92, v84 | |
v_mad_f32 v84, -v91, v90, 1.0 | |
v_cndmask_b32_e32 v84, 1.0, v84, vcc | |
v_cndmask_b32_e64 v84, v84, v85, s[4:5] | |
v_and_b32_e32 v85, s51, v83 | |
v_or_b32_e32 v84, v85, v84 | |
v_mad_f32 v85, v102, v83, v83 | |
v_cndmask_b32_e64 v84, v84, v85, s[8:9] | |
v_mul_f32_e32 v85, 0x3f8375d4, v83 | |
v_mac_f32_e32 v85, 0x41000000, v83 | |
v_mul_f32_e32 v85, 0x3e000000, v85 | |
v_cndmask_b32_e64 v84, v84, v85, s[10:11] | |
v_cmp_u_f32_e32 vcc, v83, v83 | |
v_cndmask_b32_e32 v83, v84, v83, vcc | |
v_subrev_f32_e32 v83, v83, v88 | |
v_mul_f32_e64 v84, s19, -v88 | |
v_mac_f32_e32 v84, v83, v89 | |
v_mac_f32_e32 v5, v84, v87 | |
v_mac_f32_e32 v19, v0, v67 | |
BB6_82: ; %Flow1236 | |
; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[34:35] | |
BB6_83: ; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[12:13] | |
v_lshrrev_b32_e32 v67, 15, v69 | |
v_and_b32_e32 v67, 1, v67 | |
v_cmp_eq_u32_e32 vcc, 1, v67 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[34:35], exec, s[4:5] | |
; mask branch BB6_87 | |
s_cbranch_execz BB6_87 | |
BB6_84: ; in Loop: Header=BB6_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[87:90], v55 offset0:112 offset1:113 | |
s_and_b64 s[4:5], exec, s[0:1] | |
v_cmp_ne_u32_e32 vcc, v38, v77 | |
s_xor_b64 s[4:5], s[4:5], -1 | |
s_or_b64 s[4:5], s[4:5], vcc | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v73, v73, v88 | |
s_and_b64 s[8:9], exec, s[2:3] | |
s_or_b64 s[4:5], s[8:9], s[4:5] | |
v_subrev_f32_e32 v72, v72, v87 | |
v_mul_f32_e32 v77, v73, v73 | |
v_cndmask_b32_e64 v78, 0, 1.0, s[4:5] | |
v_subrev_f32_e32 v74, v74, v89 | |
v_mac_f32_e32 v77, v72, v72 | |
v_mac_f32_e32 v77, v74, v74 | |
v_mul_f32_e32 v78, s22, v78 | |
v_subrev_f32_e32 v67, v75, v90 | |
v_cmp_lt_f32_e32 vcc, v77, v78 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[38:39], exec, s[4:5] | |
; mask branch BB6_86 | |
s_cbranch_execz BB6_86 | |
BB6_85: ; in Loop: Header=BB6_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read_b64 v[80:81], v56 offset:448 | |
v_max_f32_e32 v77, 0x34cd15ae, v77 | |
v_mul_f32_e32 v84, v9, v77 | |
v_mad_f32 v83, -v67, v0, v86 | |
v_mov_b32_e32 v85, 0x3c739487 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v70, v70, v80 | |
v_mul_f32_e32 v71, v71, v81 | |
v_mul_f32_e32 v80, v84, v84 | |
v_mov_b32_e32 v81, 0x3a92b707 | |
v_madak_f32_e32 v81, v81, v80, 0x3ded3cb2 | |
v_rsq_f32_e32 v83, v77 | |
v_madak_f32_e32 v85, v85, v80, 0x3f01e2bc | |
v_mad_f32 v81, v81, v80, 1.0 | |
v_mac_f32_e32 v81, v84, v85 | |
v_mov_b32_e32 v85, 0xb2951928 | |
v_lshrrev_b32_e32 v78, 15, v65 | |
v_madak_f32_e32 v85, v85, v80, 0xb85ffb93 | |
v_mov_b32_e32 v86, 0x35c55945 | |
v_madak_f32_e32 v86, v86, v80, 0x3a83ca0c | |
v_and_b32_e32 v78, 1, v78 | |
v_madak_f32_e32 v85, v85, v80, 0xbc9ded90 | |
v_cmp_eq_u32_e32 vcc, 1, v78 | |
v_madak_f32_e32 v86, v86, v80, 0x3d8eaf3b | |
v_madak_f32_e32 v80, v85, v80, 0xbf409397 | |
v_mul_f32_e32 v85, v83, v83 | |
v_mac_f32_e32 v80, v84, v86 | |
v_cndmask_b32_e64 v78, 0, 1.0, vcc | |
v_mul_f32_e32 v84, v85, v85 | |
v_rcp_f32_e32 v81, v81 | |
v_mul_f32_e32 v84, v78, v84 | |
v_mul_f32_e32 v86, v85, v84 | |
v_mad_f32 v84, v84, v85, s23 | |
v_mad_f32 v87, v86, v86, s26 | |
v_mul_f32_e32 v84, 0xbe2aaaab, v84 | |
v_mul_f32_e32 v84, v70, v84 | |
v_mul_f32_e32 v87, v87, v71 | |
v_mul_f32_e32 v77, s18, v77 | |
v_mac_f32_e32 v84, 0x3daaaaaa, v87 | |
v_mul_f32_e32 v81, v37, v81 | |
v_mac_f32_e32 v8, v78, v84 | |
v_mul_f32_e32 v77, v83, v77 | |
v_mul_f32_e32 v80, v80, v81 | |
v_mul_f32_e32 v84, v78, v85 | |
v_mac_f32_e32 v80, v83, v84 | |
v_and_b32_e32 v81, s27, v77 | |
v_mov_b32_e32 v84, 0x3fa00000 | |
v_cmp_gt_f32_e64 s[4:5], v84, v81 | |
v_mul_f32_e32 v84, v81, v81 | |
v_rcp_f32_e32 v87, v84 | |
v_add_f32_e32 v88, -1.0, v81 | |
v_mov_b32_e32 v89, 0xbd777f97 | |
v_mul_f32_e32 v75, v75, v90 | |
v_cndmask_b32_e64 v87, v87, v88, s[4:5] | |
v_mov_b32_e32 v88, 0x3f580000 | |
v_cmp_gt_f32_e64 s[8:9], v88, v81 | |
v_cndmask_b32_e64 v84, v87, v84, s[8:9] | |
v_mov_b32_e32 v88, 0xc11d077e | |
v_mov_b32_e32 v87, 0x4036db6e | |
v_madak_f32_e32 v88, v88, v84, 0xc2a2932b | |
v_cmp_gt_f32_e32 vcc, v87, v81 | |
v_mov_b32_e32 v87, 0xc3f1c275 | |
v_madak_f32_e32 v87, v87, v84, 0xc480230b | |
v_madak_f32_e32 v88, v84, v88, 0xc3389ae7 | |
v_madak_f32_e32 v87, v84, v87, 0xc41f6441 | |
v_madak_f32_e32 v88, v84, v88, 0xc322658c | |
v_madak_f32_e32 v87, v84, v87, 0xc320a2ea | |
v_madak_f32_e32 v88, v84, v88, 0xc2798057 | |
v_madak_f32_e32 v87, v84, v87, 0xc18e104b | |
v_madak_f32_e32 v88, v84, v88, 0xc128f022 | |
v_madak_f32_e32 v87, v84, v87, 0xbf4c9dd4 | |
v_madak_f32_e32 v88, v84, v88, 0xbf31a0b7 | |
v_madak_f32_e32 v87, v84, v87, 0xbc21a092 | |
v_madak_f32_e32 v88, v84, v88, 0xbc21a093 | |
v_madak_f32_e32 v89, v89, v84, 0x40d23f7c | |
v_cndmask_b32_e32 v87, v87, v88, vcc | |
v_mov_b32_e32 v88, 0xc1b38712 | |
v_madak_f32_e32 v88, v88, v84, 0x43ed43a7 | |
v_madak_f32_e32 v89, v84, v89, 0x42d9451f | |
v_madak_f32_e32 v88, v84, v88, 0x451f90ce | |
v_madak_f32_e32 v89, v84, v89, 0x43d6810b | |
v_madak_f32_e32 v88, v84, v88, 0x4547fdbb | |
v_madak_f32_e32 v89, v84, v89, 0x442158c9 | |
v_madak_f32_e32 v88, v84, v88, 0x44c01759 | |
v_madak_f32_e32 v89, v84, v89, 0x43d9486f | |
v_madak_f32_e32 v88, v84, v88, 0x43a2e571 | |
v_madak_f32_e32 v89, v84, v89, 0x4309a863 | |
v_madak_f32_e32 v88, v84, v88, 0x41f2b459 | |
v_madak_f32_e32 v89, v84, v89, 0x419d35ce | |
v_cndmask_b32_e32 v88, v88, v89, vcc | |
v_mov_b32_e32 v89, 0xbb0df9c0 | |
v_madak_f32_e32 v89, v89, v84, 0x3d1151b3 | |
v_madak_f32_e32 v89, v84, v89, 0xbde31cc2 | |
v_madak_f32_e32 v89, v84, v89, 0x3ea2fe54 | |
v_madak_f32_e32 v89, v84, v89, 0xbebe9208 | |
v_madak_f32_e32 v89, v84, v89, 0x3ed46805 | |
v_madak_f32_e32 v89, v84, v89, 0xbb1acdc6 | |
v_cndmask_b32_e64 v87, v87, v89, s[4:5] | |
v_mov_b32_e32 v89, 0x3c445aa3 | |
v_madak_f32_e32 v89, v89, v84, 0x3c5f6e13 | |
v_madak_f32_e32 v89, v84, v89, 0x3e013307 | |
v_madak_f32_e32 v89, v84, v89, 0x3d931ae7 | |
v_madak_f32_e32 v89, v84, v89, 0x3f0a5785 | |
v_madak_f32_e32 v89, v84, v89, 0x3dd9f331 | |
v_cndmask_b32_e64 v88, v88, v89, s[4:5] | |
v_mov_b32_e32 v89, 0xb684e21a | |
v_madak_f32_e32 v89, v89, v84, 0x390aee49 | |
v_madak_f32_e32 v89, v84, v89, 0x3ba68116 | |
v_madak_f32_e32 v89, v84, v89, 0x3d852a63 | |
v_madak_f32_e32 v89, v84, v89, 0x3ecbbbce | |
v_cndmask_b32_e64 v88, v88, v89, s[8:9] | |
v_mad_f32 v88, v84, v88, 1.0 | |
v_mov_b32_e32 v90, 0x6f800000 | |
v_and_b32_e32 v93, s50, v77 | |
v_mov_b32_e32 v94, 0xbf100000 | |
v_cmp_gt_f32_e64 vcc, |v88|, v90 | |
v_mov_b32_e32 v91, 0x2f800000 | |
v_mad_f32 v94, v93, -v93, v94 | |
v_cndmask_b32_e32 v92, 1.0, v91, vcc | |
v_cmp_gt_f32_e32 vcc, 0, v94 | |
v_cndmask_b32_e64 v95, 0.5, -0.5, vcc | |
v_mov_b32_e32 v96, 0x3fb8aa3b | |
v_mac_f32_e32 v95, v96, v94 | |
v_mov_b32_e32 v89, 0xb7c756b1 | |
v_cvt_i32_f32_e32 v95, v95 | |
v_madak_f32_e32 v89, v89, v84, 0xbbbd1489 | |
v_madak_f32_e32 v89, v84, v89, 0xbce9528f | |
v_madak_f32_e32 v89, v84, v89, 0xbea66beb | |
v_mul_f32_e32 v88, v92, v88 | |
v_madak_f32_e32 v84, v84, v89, 0x3e0375d4 | |
v_rcp_f32_e32 v88, v88 | |
v_cvt_f32_i32_e32 v89, v95 | |
v_cndmask_b32_e64 v84, v87, v84, s[8:9] | |
v_mov_b32_e32 v87, 0xbf317180 | |
v_mul_f32_e32 v84, v88, v84 | |
v_mad_f32 v88, v87, v89, v94 | |
v_mov_b32_e32 v97, 0xb717f7d1 | |
v_mad_f32 v98, v97, v89, v88 | |
v_mul_f32_e32 v99, v98, v98 | |
v_mov_b32_e32 v100, 0xb5ddea0e | |
v_mov_b32_e32 v101, 0x3331bb4c | |
v_mad_f32 v102, v101, v99, v100 | |
v_mov_b32_e32 v103, 0x388ab355 | |
v_mad_f32 v102, v102, v99, v103 | |
v_mov_b32_e32 v104, 0xbb360b61 | |
v_mad_f32 v102, v102, v99, v104 | |
v_mov_b32_e32 v105, 0x3e2aaaab | |
v_mad_f32 v102, v102, v99, v105 | |
v_mad_f32 v99, -v99, v102, v98 | |
v_mul_f32_e32 v98, v99, v98 | |
v_sub_f32_e32 v99, 2.0, v99 | |
v_cmp_gt_f32_e64 vcc, |v99|, v90 | |
v_cndmask_b32_e32 v102, 1.0, v91, vcc | |
v_mul_f32_e64 v99, v99, -v102 | |
v_rcp_f32_e32 v99, v99 | |
v_mad_f32 v70, v71, v86, -v70 | |
v_mul_f32_e32 v71, v85, v86 | |
v_mul_f32_e32 v70, v70, v71 | |
v_mul_f32_e32 v98, v99, v98 | |
v_mul_f32_e32 v98, v98, v102 | |
v_mad_f32 v89, -v89, v97, v98 | |
v_subrev_f32_e32 v88, v88, v89 | |
v_lshlrev_b32_e32 v89, 23, v95 | |
v_sub_f32_e32 v88, 1.0, v88 | |
v_add_i32_e32 v88, vcc, v88, v89 | |
v_mov_b32_e32 v89, 0xc2aeac4f | |
v_cmp_nlt_f32_e32 vcc, v94, v89 | |
v_mov_b32_e32 v95, 0x42b17218 | |
v_cndmask_b32_e32 v88, 0, v88, vcc | |
v_cmp_lt_f32_e32 vcc, v94, v95 | |
v_mov_b32_e32 v98, 0x7f800000 | |
v_cndmask_b32_e32 v88, v98, v88, vcc | |
v_cmp_u_f32_e32 vcc, v94, v94 | |
v_cndmask_b32_e32 v88, v88, v94, vcc | |
v_subrev_f32_e32 v94, v81, v93 | |
v_mul_f32_e32 v99, v84, v92 | |
v_add_f32_e32 v93, v81, v93 | |
v_mad_f32 v93, v93, v94, v99 | |
v_cmp_gt_f32_e32 vcc, 0, v93 | |
v_cndmask_b32_e64 v94, 0.5, -0.5, vcc | |
v_mac_f32_e32 v94, v96, v93 | |
v_cvt_i32_f32_e32 v94, v94 | |
v_madak_f32_e32 v84, v92, v84, 0x3f58560b | |
v_mac_f32_e32 v70, v80, v75 | |
v_mad_f32 v82, -v70, v74, v82 | |
v_cvt_f32_i32_e32 v96, v94 | |
v_lshlrev_b32_e32 v94, 23, v94 | |
v_mad_f32 v15, v74, v70, v15 | |
v_mad_f32 v79, -v70, v73, v79 | |
v_mad_f32 v87, v87, v96, v93 | |
v_mad_f32 v102, v97, v96, v87 | |
v_mul_f32_e32 v106, v102, v102 | |
v_mac_f32_e32 v100, v101, v106 | |
v_mac_f32_e32 v103, v100, v106 | |
v_mac_f32_e32 v104, v103, v106 | |
v_mac_f32_e32 v105, v104, v106 | |
v_mad_f32 v100, -v106, v105, v102 | |
v_mul_f32_e32 v101, v100, v102 | |
v_sub_f32_e32 v100, 2.0, v100 | |
v_cmp_gt_f32_e64 vcc, |v100|, v90 | |
v_cndmask_b32_e32 v102, 1.0, v91, vcc | |
v_mul_f32_e64 v100, v100, -v102 | |
v_rcp_f32_e32 v100, v100 | |
v_mad_f32 v14, v73, v70, v14 | |
v_mad_f32 v76, -v70, v72, v76 | |
v_mad_f32 v13, v72, v70, v13 | |
v_mul_f32_e32 v100, v100, v101 | |
v_mul_f32_e32 v100, v100, v102 | |
v_mad_f32 v96, -v96, v97, v100 | |
v_subrev_f32_e32 v87, v87, v96 | |
v_sub_f32_e32 v87, 1.0, v87 | |
v_add_i32_e32 v87, vcc, v87, v94 | |
v_cmp_nlt_f32_e32 vcc, v93, v89 | |
v_cndmask_b32_e32 v87, 0, v87, vcc | |
v_cmp_lt_f32_e32 vcc, v93, v95 | |
v_cndmask_b32_e32 v87, v98, v87, vcc | |
v_cmp_u_f32_e32 vcc, v93, v93 | |
v_cndmask_b32_e32 v87, v87, v93, vcc | |
v_mul_f32_e32 v87, v87, v88 | |
v_mov_b32_e32 v88, 0x40c00000 | |
v_cmp_gt_f32_e64 s[10:11], v88, v81 | |
v_mov_b32_e32 v88, 0x31800000 | |
v_cmp_gt_f32_e64 vcc, |v81|, v90 | |
v_cmp_gt_f32_e64 s[12:13], v88, v81 | |
v_cndmask_b32_e32 v88, 1.0, v91, vcc | |
v_mul_f32_e32 v81, v88, v81 | |
v_rcp_f32_e32 v81, v81 | |
v_cmp_u_f32_e32 vcc, v77, v77 | |
v_mac_f32_e32 v68, v0, v67 | |
v_mul_f32_e32 v81, v81, v87 | |
v_mad_f32 v81, -v88, v81, 1.0 | |
v_cndmask_b32_e64 v81, 1.0, v81, s[10:11] | |
v_cndmask_b32_e64 v81, v81, v84, s[4:5] | |
v_and_b32_e32 v84, s51, v77 | |
v_or_b32_e32 v81, v84, v81 | |
v_mad_f32 v84, v99, v77, v77 | |
v_cndmask_b32_e64 v81, v81, v84, s[8:9] | |
v_mul_f32_e32 v84, 0x3f8375d4, v77 | |
v_mac_f32_e32 v84, 0x41000000, v77 | |
v_mul_f32_e32 v84, 0x3e000000, v84 | |
v_cndmask_b32_e64 v81, v81, v84, s[12:13] | |
v_cndmask_b32_e32 v77, v81, v77, vcc | |
v_subrev_f32_e32 v77, v77, v78 | |
v_mul_f32_e64 v78, s19, -v78 | |
v_mac_f32_e32 v78, v77, v83 | |
v_mac_f32_e32 v5, v78, v75 | |
BB6_86: ; %Flow1235 | |
; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[38:39] | |
BB6_87: ; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[34:35] | |
s_mov_b32 m0, -1 | |
v_cmp_gt_i32_e32 vcc, 3, v2 | |
ds_write_b32 v6, v76 | |
ds_write_b32 v7, v79 | |
ds_write_b32 v12, v82 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[4:5], exec, s[4:5] | |
s_waitcnt lgkmcnt(0) | |
; mask branch BB6_93 | |
s_cbranch_execz BB6_93 | |
BB6_88: ; in Loop: Header=BB6_11 Depth=1 | |
v_lshlrev_b32_e32 v70, 6, v2 | |
v_add_i32_e32 v67, vcc, v11, v70 | |
v_lshlrev_b32_e32 v67, 2, v67 | |
v_add_i32_e32 v71, vcc, s15, v67 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v67, v71 | |
v_add_i32_e32 v72, vcc, 8, v11 | |
v_or_b32_e32 v73, 1, v11 | |
v_cmp_lt_i32_e32 vcc, v73, v72 | |
s_and_saveexec_b64 s[8:9], vcc | |
s_xor_b64 s[8:9], exec, s[8:9] | |
s_waitcnt lgkmcnt(0) | |
; mask branch BB6_90 | |
s_cbranch_execz BB6_90 | |
BB6_89: ; in Loop: Header=BB6_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b32 v[72:73], v71 offset0:1 offset1:2 | |
v_or_b32_e32 v76, 3, v11 | |
v_add_i32_e32 v70, vcc, v76, v70 | |
v_lshlrev_b32_e32 v70, 2, v70 | |
ds_read2_b32 v[74:75], v71 offset0:3 offset1:4 | |
v_add_i32_e32 v70, vcc, s15, v70 | |
ds_read_b32 v77, v71 offset:28 | |
ds_read2_b32 v[70:71], v70 offset0:2 offset1:3 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v67, v67, v72 | |
v_add_f32_e32 v67, v73, v67 | |
v_add_f32_e32 v67, v74, v67 | |
v_add_f32_e32 v67, v75, v67 | |
v_add_f32_e32 v67, v70, v67 | |
v_add_f32_e32 v67, v71, v67 | |
v_add_f32_e32 v67, v77, v67 | |
BB6_90: ; %._crit_edge.i72 | |
; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[8:9] | |
v_mul_lo_i32 v66, v66, 3 | |
v_mov_b32_e32 v74, s29 | |
s_mov_b64 s[8:9], s[28:29] | |
s_mov_b64 s[10:11], s[46:47] | |
v_add_i32_e32 v70, vcc, v66, v2 | |
v_ashrrev_i32_e32 v71, 31, v70 | |
v_lshl_b64 v[72:73], v[70:71], 2 | |
v_add_i32_e32 v70, vcc, s28, v72 | |
v_addc_u32_e32 v71, vcc, v73, v74, vcc | |
buffer_load_dword v73, v[72:73], s[8:11], 0 addr64 | |
s_mov_b64 s[8:9], 0 | |
s_waitcnt vmcnt(0) | |
BB6_91: ; Parent Loop BB6_11 Depth=1 | |
; => This Inner Loop Header: Depth=2 | |
v_add_f32_e32 v72, v67, v73 | |
v_mov_b32_e32 v75, v73 | |
v_mov_b32_e32 v74, v72 | |
buffer_atomic_cmpswap v[74:75], v[70:71], s[44:47], 0 addr64 glc | |
v_mov_b32_e32 v66, -1 | |
v_mov_b32_e32 v66, 0xf000 | |
s_waitcnt vmcnt(0) expcnt(0) | |
v_cmp_eq_u32_e32 vcc, v74, v73 | |
s_or_b64 s[8:9], vcc, s[8:9] | |
v_mov_b32_e32 v73, v74 | |
s_andn2_b64 exec, exec, s[8:9] | |
s_cbranch_execnz BB6_91 | |
; BB#92: ; %Flow1233 | |
; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[8:9] | |
BB6_93: ; %Flow1234 | |
; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[4:5] | |
BB6_94: ; %Flow1243 | |
; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[30:31] | |
v_and_b32_e32 v66, 0xff0000, v69 | |
v_cmp_ne_u32_e32 vcc, 0, v66 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[30:31], exec, s[4:5] | |
; mask branch BB6_134 | |
s_cbranch_execz BB6_134 | |
BB6_95: ; in Loop: Header=BB6_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v77, v54 offset:8 | |
s_mov_b64 s[8:9], s[32:33] | |
s_mov_b64 s[10:11], s[46:47] | |
v_mov_b32_e32 v76, 0 | |
v_mov_b32_e32 v82, v76 | |
s_waitcnt lgkmcnt(0) | |
v_lshlrev_b32_e32 v66, 3, v77 | |
v_add_i32_e32 v66, vcc, v66, v1 | |
v_ashrrev_i32_e32 v67, 31, v66 | |
v_lshl_b64 v[70:71], v[66:67], 4 | |
v_lshl_b64 v[78:79], v[66:67], 3 | |
buffer_load_dwordx4 v[72:75], v[70:71], s[8:11], 0 addr64 | |
s_mov_b64 s[8:9], s[36:37] | |
buffer_load_dwordx2 v[70:71], v[78:79], s[8:11], 0 addr64 | |
v_lshrrev_b32_e32 v67, 16, v69 | |
v_and_b32_e32 v67, 1, v67 | |
v_cmp_eq_u32_e32 vcc, 1, v67 | |
v_mov_b32_e32 v79, v76 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[34:35], exec, s[4:5] | |
; implicit-def: %VGPR83_VGPR84_VGPR85_VGPR86 | |
s_waitcnt vmcnt(0) | |
; mask branch BB6_99 | |
s_cbranch_execz BB6_99 | |
BB6_96: ; in Loop: Header=BB6_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[87:90], v55 offset1:1 | |
s_and_b64 s[4:5], exec, s[0:1] | |
v_cmp_ne_u32_e32 vcc, v41, v77 | |
s_xor_b64 s[4:5], s[4:5], -1 | |
s_or_b64 s[4:5], s[4:5], vcc | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v80, v73, v88 | |
s_and_b64 s[8:9], exec, s[2:3] | |
s_or_b64 s[4:5], s[8:9], s[4:5] | |
v_subrev_f32_e32 v81, v72, v87 | |
v_mul_f32_e32 v83, v80, v80 | |
v_cndmask_b32_e64 v76, 0, 1.0, s[4:5] | |
v_subrev_f32_e32 v78, v74, v89 | |
v_mac_f32_e32 v83, v81, v81 | |
v_mac_f32_e32 v83, v78, v78 | |
v_mul_f32_e32 v76, s22, v76 | |
v_cmp_lt_f32_e32 vcc, v83, v76 | |
v_mov_b32_e32 v76, 0 | |
v_subrev_f32_e32 v67, v75, v90 | |
v_mov_b32_e32 v79, v76 | |
v_mov_b32_e32 v82, v76 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[38:39], exec, s[4:5] | |
; implicit-def: %VGPR83_VGPR84_VGPR85_VGPR86 | |
; mask branch BB6_98 | |
s_cbranch_execz BB6_98 | |
BB6_97: ; in Loop: Header=BB6_11 Depth=1 | |
v_max_f32_e32 v79, 0x34cd15ae, v83 | |
v_mul_f32_e32 v82, v9, v79 | |
v_mul_f32_e32 v83, v82, v82 | |
v_mov_b32_e32 v84, 0x3a92b707 | |
v_madak_f32_e32 v84, v84, v83, 0x3ded3cb2 | |
v_mov_b32_e32 v85, 0x3c739487 | |
v_madak_f32_e32 v85, v85, v83, 0x3f01e2bc | |
v_mad_f32 v84, v84, v83, 1.0 | |
v_mac_f32_e32 v84, v82, v85 | |
v_mov_b32_e32 v85, 0xb2951928 | |
v_madak_f32_e32 v85, v85, v83, 0xb85ffb93 | |
v_mov_b32_e32 v86, 0x35c55945 | |
v_madak_f32_e32 v86, v86, v83, 0x3a83ca0c | |
v_madak_f32_e32 v85, v85, v83, 0xbc9ded90 | |
v_madak_f32_e32 v86, v86, v83, 0x3d8eaf3b | |
v_madak_f32_e32 v85, v85, v83, 0xbf409397 | |
v_mac_f32_e32 v85, v82, v86 | |
v_rsq_f32_e32 v86, v79 | |
v_mul_f32_e32 v79, s18, v79 | |
v_mov_b32_e32 v88, 0x3fa00000 | |
v_mul_f32_e32 v76, v75, v90 | |
v_mul_f32_e32 v79, v86, v79 | |
v_and_b32_e32 v87, s27, v79 | |
v_cmp_gt_f32_e64 s[4:5], v88, v87 | |
v_mul_f32_e32 v88, v87, v87 | |
v_rcp_f32_e32 v89, v88 | |
v_add_f32_e32 v90, -1.0, v87 | |
v_mov_b32_e32 v91, 0xbd777f97 | |
v_lshrrev_b32_e32 v82, 16, v65 | |
v_cndmask_b32_e64 v89, v89, v90, s[4:5] | |
v_mov_b32_e32 v90, 0x3f580000 | |
v_cmp_gt_f32_e64 s[8:9], v90, v87 | |
v_cndmask_b32_e64 v88, v89, v88, s[8:9] | |
v_mov_b32_e32 v90, 0xc11d077e | |
v_mov_b32_e32 v89, 0x4036db6e | |
v_madak_f32_e32 v90, v90, v88, 0xc2a2932b | |
v_cmp_gt_f32_e64 s[10:11], v89, v87 | |
v_mov_b32_e32 v89, 0xc3f1c275 | |
v_madak_f32_e32 v89, v89, v88, 0xc480230b | |
v_madak_f32_e32 v90, v88, v90, 0xc3389ae7 | |
v_madak_f32_e32 v89, v88, v89, 0xc41f6441 | |
v_madak_f32_e32 v90, v88, v90, 0xc322658c | |
v_madak_f32_e32 v89, v88, v89, 0xc320a2ea | |
v_madak_f32_e32 v90, v88, v90, 0xc2798057 | |
v_madak_f32_e32 v89, v88, v89, 0xc18e104b | |
v_madak_f32_e32 v90, v88, v90, 0xc128f022 | |
v_madak_f32_e32 v89, v88, v89, 0xbf4c9dd4 | |
v_madak_f32_e32 v90, v88, v90, 0xbf31a0b7 | |
v_madak_f32_e32 v89, v88, v89, 0xbc21a092 | |
v_madak_f32_e32 v90, v88, v90, 0xbc21a093 | |
v_madak_f32_e32 v91, v91, v88, 0x40d23f7c | |
v_cndmask_b32_e64 v89, v89, v90, s[10:11] | |
v_mov_b32_e32 v90, 0xc1b38712 | |
v_madak_f32_e32 v90, v90, v88, 0x43ed43a7 | |
v_madak_f32_e32 v91, v88, v91, 0x42d9451f | |
v_madak_f32_e32 v90, v88, v90, 0x451f90ce | |
v_madak_f32_e32 v91, v88, v91, 0x43d6810b | |
v_madak_f32_e32 v90, v88, v90, 0x4547fdbb | |
v_madak_f32_e32 v91, v88, v91, 0x442158c9 | |
v_madak_f32_e32 v90, v88, v90, 0x44c01759 | |
v_madak_f32_e32 v91, v88, v91, 0x43d9486f | |
v_madak_f32_e32 v90, v88, v90, 0x43a2e571 | |
v_madak_f32_e32 v91, v88, v91, 0x4309a863 | |
v_madak_f32_e32 v90, v88, v90, 0x41f2b459 | |
v_madak_f32_e32 v91, v88, v91, 0x419d35ce | |
v_cndmask_b32_e64 v90, v90, v91, s[10:11] | |
v_mov_b32_e32 v91, 0xbb0df9c0 | |
v_madak_f32_e32 v91, v91, v88, 0x3d1151b3 | |
v_madak_f32_e32 v91, v88, v91, 0xbde31cc2 | |
v_madak_f32_e32 v91, v88, v91, 0x3ea2fe54 | |
v_madak_f32_e32 v91, v88, v91, 0xbebe9208 | |
v_madak_f32_e32 v91, v88, v91, 0x3ed46805 | |
v_madak_f32_e32 v91, v88, v91, 0xbb1acdc6 | |
v_cndmask_b32_e64 v89, v89, v91, s[4:5] | |
v_mov_b32_e32 v91, 0x3c445aa3 | |
v_madak_f32_e32 v91, v91, v88, 0x3c5f6e13 | |
v_madak_f32_e32 v91, v88, v91, 0x3e013307 | |
v_madak_f32_e32 v91, v88, v91, 0x3d931ae7 | |
v_madak_f32_e32 v91, v88, v91, 0x3f0a5785 | |
v_madak_f32_e32 v91, v88, v91, 0x3dd9f331 | |
v_cndmask_b32_e64 v90, v90, v91, s[4:5] | |
v_mov_b32_e32 v91, 0xb7c756b1 | |
v_madak_f32_e32 v91, v91, v88, 0xbbbd1489 | |
v_madak_f32_e32 v91, v88, v91, 0xbce9528f | |
v_madak_f32_e32 v91, v88, v91, 0xbea66beb | |
v_madak_f32_e32 v91, v88, v91, 0x3e0375d4 | |
v_cndmask_b32_e64 v89, v89, v91, s[8:9] | |
v_mov_b32_e32 v91, 0xb684e21a | |
v_madak_f32_e32 v91, v91, v88, 0x390aee49 | |
v_madak_f32_e32 v91, v88, v91, 0x3ba68116 | |
v_madak_f32_e32 v91, v88, v91, 0x3d852a63 | |
v_madak_f32_e32 v91, v88, v91, 0x3ecbbbce | |
v_cndmask_b32_e64 v90, v90, v91, s[8:9] | |
v_mad_f32 v88, v88, v90, 1.0 | |
v_mov_b32_e32 v90, 0x6f800000 | |
v_cmp_gt_f32_e64 s[10:11], |v88|, v90 | |
v_mov_b32_e32 v91, 0x2f800000 | |
v_cndmask_b32_e64 v92, 1.0, v91, s[10:11] | |
v_mul_f32_e32 v88, v92, v88 | |
v_rcp_f32_e32 v88, v88 | |
v_and_b32_e32 v82, 1, v82 | |
v_and_b32_e32 v97, s50, v79 | |
v_mov_b32_e32 v98, 0xbf100000 | |
v_cmp_eq_u32_e32 vcc, 1, v82 | |
v_mad_f32 v98, v97, -v97, v98 | |
v_mul_f32_e32 v88, v88, v89 | |
v_cndmask_b32_e64 v89, 0, 1.0, vcc | |
v_cmp_gt_f32_e32 vcc, 0, v98 | |
v_cndmask_b32_e64 v99, 0.5, -0.5, vcc | |
v_mov_b32_e32 v100, 0x3fb8aa3b | |
v_mac_f32_e32 v99, v100, v98 | |
v_cvt_i32_f32_e32 v99, v99 | |
v_mov_b32_e32 v103, 0xbf317180 | |
v_mov_b32_e32 v105, 0xb717f7d1 | |
v_subrev_f32_e32 v114, v87, v97 | |
v_cvt_f32_i32_e32 v101, v99 | |
v_mul_f32_e32 v102, v88, v92 | |
v_add_f32_e32 v97, v87, v97 | |
v_mad_f32 v97, v97, v114, v102 | |
v_mad_f32 v104, v103, v101, v98 | |
v_mad_f32 v106, v105, v101, v104 | |
v_mul_f32_e32 v107, v106, v106 | |
v_mov_b32_e32 v108, 0xb5ddea0e | |
v_mov_b32_e32 v109, 0x3331bb4c | |
v_cmp_gt_f32_e64 s[10:11], 0, v97 | |
v_mad_f32 v110, v109, v107, v108 | |
v_mov_b32_e32 v111, 0x388ab355 | |
v_cndmask_b32_e64 v114, 0.5, -0.5, s[10:11] | |
v_mad_f32 v110, v110, v107, v111 | |
v_mov_b32_e32 v112, 0xbb360b61 | |
v_mac_f32_e32 v114, v100, v97 | |
v_mad_f32 v110, v110, v107, v112 | |
v_mov_b32_e32 v113, 0x3e2aaaab | |
v_mad_f32 v110, v110, v107, v113 | |
v_cvt_i32_f32_e32 v100, v114 | |
v_mad_f32 v107, -v107, v110, v106 | |
v_sub_f32_e32 v110, 2.0, v107 | |
v_cmp_gt_f32_e64 vcc, |v110|, v90 | |
v_cvt_f32_i32_e32 v115, v100 | |
v_cndmask_b32_e32 v114, 1.0, v91, vcc | |
v_mul_f32_e64 v110, v110, -v114 | |
v_rcp_f32_e32 v110, v110 | |
v_mad_f32 v103, v103, v115, v97 | |
v_mul_f32_e32 v106, v107, v106 | |
v_mad_f32 v107, v105, v115, v103 | |
v_mul_f32_e32 v106, v110, v106 | |
v_mul_f32_e32 v110, v107, v107 | |
v_mac_f32_e32 v108, v109, v110 | |
v_mac_f32_e32 v111, v108, v110 | |
s_mov_b32 m0, -1 | |
v_mac_f32_e32 v112, v111, v110 | |
ds_read_b64 v[82:83], v56 | |
v_mac_f32_e32 v113, v112, v110 | |
v_rcp_f32_e32 v84, v84 | |
v_mul_f32_e32 v93, v86, v86 | |
v_mad_f32 v108, -v110, v113, v107 | |
v_mul_f32_e32 v94, v93, v93 | |
v_sub_f32_e32 v109, 2.0, v108 | |
v_mul_f32_e32 v94, v89, v94 | |
v_cmp_gt_f32_e64 vcc, |v109|, v90 | |
v_mul_f32_e32 v95, v93, v94 | |
v_mad_f32 v94, v94, v93, s23 | |
v_cndmask_b32_e32 v110, 1.0, v91, vcc | |
v_mul_f32_e64 v109, v109, -v110 | |
v_mul_f32_e32 v84, v37, v84 | |
v_mul_f32_e32 v106, v106, v114 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v82, v70, v82 | |
v_mul_f32_e32 v94, 0xbe2aaaab, v94 | |
v_mul_f32_e32 v83, v71, v83 | |
v_mad_f32 v96, v95, v95, s26 | |
v_mad_f32 v101, -v101, v105, v106 | |
v_mul_f32_e32 v84, v85, v84 | |
v_mul_f32_e32 v85, v89, v93 | |
v_mul_f32_e32 v96, v96, v83 | |
v_mul_f32_e32 v94, v82, v94 | |
v_rcp_f32_e32 v109, v109 | |
v_mac_f32_e32 v94, 0x3daaaaaa, v96 | |
v_mac_f32_e32 v84, v86, v85 | |
v_subrev_f32_e32 v85, v104, v101 | |
v_mac_f32_e32 v8, v89, v94 | |
v_lshlrev_b32_e32 v94, 23, v99 | |
v_sub_f32_e32 v85, 1.0, v85 | |
v_mul_f32_e32 v106, v108, v107 | |
v_add_i32_e32 v85, vcc, v85, v94 | |
v_mov_b32_e32 v94, 0xc2aeac4f | |
v_mul_f32_e32 v106, v109, v106 | |
v_cmp_nlt_f32_e32 vcc, v98, v94 | |
v_mov_b32_e32 v96, 0x42b17218 | |
v_mul_f32_e32 v106, v106, v110 | |
v_cndmask_b32_e32 v85, 0, v85, vcc | |
v_cmp_lt_f32_e32 vcc, v98, v96 | |
v_mov_b32_e32 v99, 0x7f800000 | |
v_cndmask_b32_e32 v85, v99, v85, vcc | |
v_cmp_u_f32_e32 vcc, v98, v98 | |
v_mad_f32 v105, -v115, v105, v106 | |
v_cndmask_b32_e32 v85, v85, v98, vcc | |
v_subrev_f32_e32 v98, v103, v105 | |
v_sub_f32_e32 v98, 1.0, v98 | |
v_lshlrev_b32_e32 v100, 23, v100 | |
v_add_i32_e32 v98, vcc, v98, v100 | |
v_cmp_nlt_f32_e32 vcc, v97, v94 | |
v_cndmask_b32_e32 v94, 0, v98, vcc | |
v_cmp_lt_f32_e32 vcc, v97, v96 | |
v_cndmask_b32_e32 v94, v99, v94, vcc | |
v_cmp_u_f32_e32 vcc, v97, v97 | |
v_cndmask_b32_e32 v94, v94, v97, vcc | |
v_cmp_gt_f32_e64 vcc, |v87|, v90 | |
v_mov_b32_e32 v90, 0x40c00000 | |
v_cmp_gt_f32_e64 s[10:11], v90, v87 | |
v_mov_b32_e32 v90, 0x31800000 | |
v_cmp_gt_f32_e64 s[12:13], v90, v87 | |
v_cndmask_b32_e32 v90, 1.0, v91, vcc | |
v_mul_f32_e32 v87, v90, v87 | |
v_rcp_f32_e32 v87, v87 | |
v_mul_f32_e32 v85, v94, v85 | |
v_cmp_u_f32_e32 vcc, v79, v79 | |
v_mac_f32_e32 v51, v0, v67 | |
v_mul_f32_e32 v85, v87, v85 | |
v_mad_f32 v85, -v90, v85, 1.0 | |
v_madak_f32_e32 v87, v92, v88, 0x3f58560b | |
v_cndmask_b32_e64 v85, 1.0, v85, s[10:11] | |
v_cndmask_b32_e64 v85, v85, v87, s[4:5] | |
v_and_b32_e32 v87, s51, v79 | |
v_or_b32_e32 v85, v87, v85 | |
v_mad_f32 v87, v102, v79, v79 | |
v_cndmask_b32_e64 v85, v85, v87, s[8:9] | |
v_mul_f32_e32 v87, 0x3f8375d4, v79 | |
v_mac_f32_e32 v87, 0x41000000, v79 | |
v_mul_f32_e32 v87, 0x3e000000, v87 | |
v_cndmask_b32_e64 v85, v85, v87, s[12:13] | |
v_cndmask_b32_e32 v79, v85, v79, vcc | |
v_subrev_f32_e32 v79, v79, v89 | |
v_mul_f32_e64 v85, s19, -v89 | |
v_mac_f32_e32 v85, v79, v86 | |
v_mad_f32 v79, v83, v95, -v82 | |
v_mul_f32_e32 v82, v93, v95 | |
v_mul_f32_e32 v83, v79, v82 | |
v_mac_f32_e32 v83, v84, v76 | |
v_mac_f32_e32 v5, v85, v76 | |
v_mad_f32 v50, v78, v83, v50 | |
v_mad_f32 v49, v80, v83, v49 | |
v_mad_f32 v48, v81, v83, v48 | |
v_mul_f32_e64 v82, v83, -v78 | |
v_mul_f32_e64 v79, v83, -v80 | |
v_mul_f32_e64 v76, v83, -v81 | |
v_mul_f32_e64 v86, v67, -v0 | |
BB6_98: ; %Flow1231 | |
; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[38:39] | |
BB6_99: ; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[34:35] | |
v_lshrrev_b32_e32 v67, 17, v69 | |
v_and_b32_e32 v67, 1, v67 | |
v_cmp_eq_u32_e32 vcc, 1, v67 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[12:13], exec, s[4:5] | |
; mask branch BB6_103 | |
s_cbranch_execz BB6_103 | |
BB6_100: ; in Loop: Header=BB6_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[87:90], v55 offset0:16 offset1:17 | |
s_and_b64 s[4:5], exec, s[0:1] | |
v_cmp_ne_u32_e32 vcc, v64, v77 | |
s_xor_b64 s[4:5], s[4:5], -1 | |
s_or_b64 s[4:5], s[4:5], vcc | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v80, v73, v88 | |
s_and_b64 s[8:9], exec, s[2:3] | |
s_or_b64 s[4:5], s[8:9], s[4:5] | |
v_subrev_f32_e32 v78, v72, v87 | |
v_mul_f32_e32 v83, v80, v80 | |
v_cndmask_b32_e64 v84, 0, 1.0, s[4:5] | |
v_subrev_f32_e32 v81, v74, v89 | |
v_mac_f32_e32 v83, v78, v78 | |
v_mac_f32_e32 v83, v81, v81 | |
v_mul_f32_e32 v84, s22, v84 | |
v_subrev_f32_e32 v67, v75, v90 | |
v_cmp_lt_f32_e32 vcc, v83, v84 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[34:35], exec, s[4:5] | |
; mask branch BB6_102 | |
s_cbranch_execz BB6_102 | |
BB6_101: ; in Loop: Header=BB6_11 Depth=1 | |
v_max_f32_e32 v85, 0x34cd15ae, v83 | |
v_rsq_f32_e32 v89, v85 | |
v_lshrrev_b32_e32 v83, 17, v65 | |
v_and_b32_e32 v83, 1, v83 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e32 vcc, 1, v83 | |
ds_read_b64 v[83:84], v56 offset:64 | |
v_mul_f32_e32 v93, v89, v89 | |
v_cndmask_b32_e64 v88, 0, 1.0, vcc | |
v_mul_f32_e32 v94, v93, v93 | |
v_mul_f32_e32 v94, v88, v94 | |
v_mul_f32_e32 v95, v93, v94 | |
v_mad_f32 v94, v94, v93, s23 | |
v_mul_f32_e32 v87, v75, v90 | |
v_mul_f32_e32 v90, v9, v85 | |
v_mul_f32_e32 v91, v90, v90 | |
v_mov_b32_e32 v92, 0x3a92b707 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v83, v70, v83 | |
v_mul_f32_e32 v84, v71, v84 | |
v_mul_f32_e32 v94, 0xbe2aaaab, v94 | |
v_madak_f32_e32 v92, v92, v91, 0x3ded3cb2 | |
v_mad_f32 v96, v84, v95, -v83 | |
v_mul_f32_e32 v97, v93, v95 | |
v_mad_f32 v95, v95, v95, s26 | |
v_mul_f32_e32 v83, v83, v94 | |
v_mov_b32_e32 v94, 0x3c739487 | |
v_mul_f32_e32 v84, v95, v84 | |
v_madak_f32_e32 v94, v94, v91, 0x3f01e2bc | |
v_mad_f32 v92, v92, v91, 1.0 | |
v_mac_f32_e32 v83, 0x3daaaaaa, v84 | |
v_mac_f32_e32 v92, v90, v94 | |
v_mov_b32_e32 v94, 0xb2951928 | |
v_mac_f32_e32 v8, v88, v83 | |
v_rcp_f32_e32 v83, v92 | |
v_madak_f32_e32 v94, v94, v91, 0xb85ffb93 | |
v_mov_b32_e32 v95, 0x35c55945 | |
v_madak_f32_e32 v95, v95, v91, 0x3a83ca0c | |
v_madak_f32_e32 v94, v94, v91, 0xbc9ded90 | |
v_madak_f32_e32 v95, v95, v91, 0x3d8eaf3b | |
v_madak_f32_e32 v91, v94, v91, 0xbf409397 | |
v_mac_f32_e32 v91, v90, v95 | |
v_mul_f32_e32 v83, v37, v83 | |
v_mul_f32_e32 v84, v88, v93 | |
v_mul_f32_e32 v83, v91, v83 | |
v_mul_f32_e32 v96, v96, v97 | |
v_mac_f32_e32 v83, v89, v84 | |
v_mac_f32_e32 v96, v83, v87 | |
v_mul_f32_e32 v83, s18, v85 | |
v_mul_f32_e32 v83, v89, v83 | |
v_and_b32_e32 v84, s27, v83 | |
v_mov_b32_e32 v85, 0x3fa00000 | |
v_cmp_gt_f32_e64 s[4:5], v85, v84 | |
v_mul_f32_e32 v85, v84, v84 | |
v_rcp_f32_e32 v90, v85 | |
v_add_f32_e32 v91, -1.0, v84 | |
v_mov_b32_e32 v93, 0xbd777f97 | |
v_mov_b32_e32 v94, 0x4036db6e | |
v_cndmask_b32_e64 v90, v90, v91, s[4:5] | |
v_mov_b32_e32 v91, 0x3f580000 | |
v_cmp_gt_f32_e64 s[8:9], v91, v84 | |
v_cndmask_b32_e64 v85, v90, v85, s[8:9] | |
v_mov_b32_e32 v91, 0xc1b38712 | |
v_madak_f32_e32 v93, v93, v85, 0x40d23f7c | |
v_madak_f32_e32 v91, v91, v85, 0x43ed43a7 | |
v_madak_f32_e32 v93, v85, v93, 0x42d9451f | |
v_madak_f32_e32 v91, v85, v91, 0x451f90ce | |
v_madak_f32_e32 v93, v85, v93, 0x43d6810b | |
v_madak_f32_e32 v91, v85, v91, 0x4547fdbb | |
v_madak_f32_e32 v93, v85, v93, 0x442158c9 | |
v_madak_f32_e32 v91, v85, v91, 0x44c01759 | |
v_madak_f32_e32 v93, v85, v93, 0x43d9486f | |
v_madak_f32_e32 v91, v85, v91, 0x43a2e571 | |
v_madak_f32_e32 v93, v85, v93, 0x4309a863 | |
v_cmp_gt_f32_e32 vcc, v94, v84 | |
v_madak_f32_e32 v91, v85, v91, 0x41f2b459 | |
v_madak_f32_e32 v93, v85, v93, 0x419d35ce | |
v_cndmask_b32_e32 v91, v91, v93, vcc | |
v_mov_b32_e32 v93, 0x3c445aa3 | |
v_madak_f32_e32 v93, v93, v85, 0x3c5f6e13 | |
v_madak_f32_e32 v93, v85, v93, 0x3e013307 | |
v_madak_f32_e32 v93, v85, v93, 0x3d931ae7 | |
v_madak_f32_e32 v93, v85, v93, 0x3f0a5785 | |
v_mov_b32_e32 v92, 0xc11d077e | |
v_madak_f32_e32 v93, v85, v93, 0x3dd9f331 | |
v_mov_b32_e32 v90, 0xc3f1c275 | |
v_madak_f32_e32 v92, v92, v85, 0xc2a2932b | |
v_cndmask_b32_e64 v91, v91, v93, s[4:5] | |
v_mov_b32_e32 v93, 0xb684e21a | |
v_madak_f32_e32 v90, v90, v85, 0xc480230b | |
v_madak_f32_e32 v92, v85, v92, 0xc3389ae7 | |
v_madak_f32_e32 v93, v93, v85, 0x390aee49 | |
v_madak_f32_e32 v90, v85, v90, 0xc41f6441 | |
v_madak_f32_e32 v92, v85, v92, 0xc322658c | |
v_madak_f32_e32 v93, v85, v93, 0x3ba68116 | |
v_madak_f32_e32 v90, v85, v90, 0xc320a2ea | |
v_madak_f32_e32 v92, v85, v92, 0xc2798057 | |
v_madak_f32_e32 v93, v85, v93, 0x3d852a63 | |
v_madak_f32_e32 v90, v85, v90, 0xc18e104b | |
v_madak_f32_e32 v92, v85, v92, 0xc128f022 | |
v_madak_f32_e32 v93, v85, v93, 0x3ecbbbce | |
v_madak_f32_e32 v90, v85, v90, 0xbf4c9dd4 | |
v_madak_f32_e32 v92, v85, v92, 0xbf31a0b7 | |
v_cndmask_b32_e64 v91, v91, v93, s[8:9] | |
v_and_b32_e32 v93, s50, v83 | |
v_mov_b32_e32 v94, 0xbf100000 | |
v_madak_f32_e32 v90, v85, v90, 0xbc21a092 | |
v_madak_f32_e32 v92, v85, v92, 0xbc21a093 | |
v_mad_f32 v94, v93, -v93, v94 | |
v_cndmask_b32_e32 v90, v90, v92, vcc | |
v_cmp_gt_f32_e32 vcc, 0, v94 | |
v_cndmask_b32_e64 v95, 0.5, -0.5, vcc | |
v_mov_b32_e32 v97, 0x3fb8aa3b | |
v_mac_f32_e32 v95, v97, v94 | |
v_cvt_i32_f32_e32 v95, v95 | |
v_mov_b32_e32 v92, 0xbb0df9c0 | |
v_madak_f32_e32 v92, v92, v85, 0x3d1151b3 | |
v_madak_f32_e32 v92, v85, v92, 0xbde31cc2 | |
v_cvt_f32_i32_e32 v100, v95 | |
v_madak_f32_e32 v92, v85, v92, 0x3ea2fe54 | |
v_mov_b32_e32 v101, 0xbf317180 | |
v_madak_f32_e32 v92, v85, v92, 0xbebe9208 | |
v_mad_f32 v102, v101, v100, v94 | |
v_mov_b32_e32 v103, 0xb717f7d1 | |
v_madak_f32_e32 v92, v85, v92, 0x3ed46805 | |
v_mad_f32 v104, v103, v100, v102 | |
v_madak_f32_e32 v92, v85, v92, 0xbb1acdc6 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mov_b32_e32 v106, 0xb5ddea0e | |
v_mov_b32_e32 v107, 0x3331bb4c | |
v_cndmask_b32_e64 v90, v90, v92, s[4:5] | |
v_mov_b32_e32 v92, 0xb7c756b1 | |
v_mad_f32 v108, v107, v105, v106 | |
v_mov_b32_e32 v109, 0x388ab355 | |
v_madak_f32_e32 v92, v92, v85, 0xbbbd1489 | |
v_mad_f32 v108, v108, v105, v109 | |
v_mov_b32_e32 v110, 0xbb360b61 | |
v_madak_f32_e32 v92, v85, v92, 0xbce9528f | |
v_mad_f32 v108, v108, v105, v110 | |
v_mov_b32_e32 v111, 0x3e2aaaab | |
v_madak_f32_e32 v92, v85, v92, 0xbea66beb | |
v_mad_f32 v108, v108, v105, v111 | |
v_madak_f32_e32 v92, v85, v92, 0x3e0375d4 | |
v_mad_f32 v105, -v105, v108, v104 | |
v_mad_f32 v85, v85, v91, 1.0 | |
v_mov_b32_e32 v91, 0x6f800000 | |
v_cmp_gt_f32_e64 vcc, |v85|, v91 | |
v_mov_b32_e32 v98, 0x2f800000 | |
v_sub_f32_e32 v108, 2.0, v105 | |
v_cndmask_b32_e32 v99, 1.0, v98, vcc | |
v_cmp_gt_f32_e64 vcc, |v108|, v91 | |
v_mul_f32_e32 v85, v99, v85 | |
v_cndmask_b32_e32 v112, 1.0, v98, vcc | |
v_mul_f32_e64 v108, v108, -v112 | |
v_rcp_f32_e32 v85, v85 | |
v_rcp_f32_e32 v108, v108 | |
v_cndmask_b32_e64 v90, v90, v92, s[8:9] | |
v_lshlrev_b32_e32 v92, 23, v95 | |
v_mul_f32_e32 v85, v85, v90 | |
v_mul_f32_e32 v90, v105, v104 | |
v_mul_f32_e32 v90, v108, v90 | |
v_mul_f32_e32 v90, v90, v112 | |
v_mad_f32 v90, -v100, v103, v90 | |
v_subrev_f32_e32 v90, v102, v90 | |
v_sub_f32_e32 v90, 1.0, v90 | |
v_add_i32_e32 v90, vcc, v90, v92 | |
v_mov_b32_e32 v92, 0xc2aeac4f | |
v_cmp_nlt_f32_e32 vcc, v94, v92 | |
v_mov_b32_e32 v95, 0x42b17218 | |
v_cndmask_b32_e32 v90, 0, v90, vcc | |
v_cmp_lt_f32_e32 vcc, v94, v95 | |
v_mov_b32_e32 v100, 0x7f800000 | |
v_cndmask_b32_e32 v90, v100, v90, vcc | |
v_cmp_u_f32_e32 vcc, v94, v94 | |
v_cndmask_b32_e32 v90, v90, v94, vcc | |
v_subrev_f32_e32 v94, v84, v93 | |
v_mul_f32_e32 v102, v85, v99 | |
v_add_f32_e32 v93, v84, v93 | |
v_mad_f32 v93, v93, v94, v102 | |
v_cmp_gt_f32_e32 vcc, 0, v93 | |
v_cndmask_b32_e64 v94, 0.5, -0.5, vcc | |
v_mac_f32_e32 v94, v97, v93 | |
v_cvt_i32_f32_e32 v94, v94 | |
v_madak_f32_e32 v85, v99, v85, 0x3f58560b | |
v_mad_f32 v82, -v96, v81, v82 | |
v_mad_f32 v44, v81, v96, v44 | |
v_cvt_f32_i32_e32 v97, v94 | |
v_lshlrev_b32_e32 v94, 23, v94 | |
v_mad_f32 v79, -v96, v80, v79 | |
v_mad_f32 v43, v80, v96, v43 | |
v_mad_f32 v101, v101, v97, v93 | |
v_mad_f32 v104, v103, v97, v101 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mac_f32_e32 v106, v107, v105 | |
v_mac_f32_e32 v109, v106, v105 | |
v_mac_f32_e32 v110, v109, v105 | |
v_mac_f32_e32 v111, v110, v105 | |
v_mad_f32 v105, -v105, v111, v104 | |
v_mul_f32_e32 v104, v105, v104 | |
v_sub_f32_e32 v105, 2.0, v105 | |
v_cmp_gt_f32_e64 vcc, |v105|, v91 | |
v_cndmask_b32_e32 v106, 1.0, v98, vcc | |
v_mul_f32_e64 v105, v105, -v106 | |
v_rcp_f32_e32 v105, v105 | |
v_mad_f32 v76, -v96, v78, v76 | |
v_mad_f32 v42, v78, v96, v42 | |
v_mad_f32 v86, -v67, v0, v86 | |
v_mul_f32_e32 v104, v105, v104 | |
v_mul_f32_e32 v104, v104, v106 | |
v_mad_f32 v97, -v97, v103, v104 | |
v_subrev_f32_e32 v97, v101, v97 | |
v_sub_f32_e32 v97, 1.0, v97 | |
v_add_i32_e32 v94, vcc, v97, v94 | |
v_cmp_nlt_f32_e32 vcc, v93, v92 | |
v_cndmask_b32_e32 v92, 0, v94, vcc | |
v_cmp_lt_f32_e32 vcc, v93, v95 | |
v_cndmask_b32_e32 v92, v100, v92, vcc | |
v_cmp_gt_f32_e64 vcc, |v84|, v91 | |
v_cndmask_b32_e32 v91, 1.0, v98, vcc | |
v_mul_f32_e32 v94, v91, v84 | |
v_rcp_f32_e32 v94, v94 | |
v_cmp_u_f32_e32 vcc, v93, v93 | |
v_cndmask_b32_e32 v92, v92, v93, vcc | |
v_mul_f32_e32 v90, v92, v90 | |
v_mov_b32_e32 v92, 0x40c00000 | |
v_cmp_gt_f32_e32 vcc, v92, v84 | |
v_mov_b32_e32 v92, 0x31800000 | |
v_mul_f32_e32 v90, v94, v90 | |
v_cmp_gt_f32_e64 s[10:11], v92, v84 | |
v_mad_f32 v84, -v91, v90, 1.0 | |
v_cndmask_b32_e32 v84, 1.0, v84, vcc | |
v_cndmask_b32_e64 v84, v84, v85, s[4:5] | |
v_and_b32_e32 v85, s51, v83 | |
v_or_b32_e32 v84, v85, v84 | |
v_mad_f32 v85, v102, v83, v83 | |
v_cndmask_b32_e64 v84, v84, v85, s[8:9] | |
v_mul_f32_e32 v85, 0x3f8375d4, v83 | |
v_mac_f32_e32 v85, 0x41000000, v83 | |
v_mul_f32_e32 v85, 0x3e000000, v85 | |
v_cndmask_b32_e64 v84, v84, v85, s[10:11] | |
v_cmp_u_f32_e32 vcc, v83, v83 | |
v_cndmask_b32_e32 v83, v84, v83, vcc | |
v_subrev_f32_e32 v83, v83, v88 | |
v_mul_f32_e64 v84, s19, -v88 | |
v_mac_f32_e32 v84, v83, v89 | |
v_mac_f32_e32 v5, v84, v87 | |
v_mac_f32_e32 v45, v0, v67 | |
BB6_102: ; %Flow1230 | |
; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[34:35] | |
BB6_103: ; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[12:13] | |
v_lshrrev_b32_e32 v67, 18, v69 | |
v_and_b32_e32 v67, 1, v67 | |
v_cmp_eq_u32_e32 vcc, 1, v67 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[12:13], exec, s[4:5] | |
; mask branch BB6_107 | |
s_cbranch_execz BB6_107 | |
BB6_104: ; in Loop: Header=BB6_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[87:90], v55 offset0:32 offset1:33 | |
s_and_b64 s[4:5], exec, s[0:1] | |
v_cmp_ne_u32_e32 vcc, v63, v77 | |
s_xor_b64 s[4:5], s[4:5], -1 | |
s_or_b64 s[4:5], s[4:5], vcc | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v80, v73, v88 | |
s_and_b64 s[8:9], exec, s[2:3] | |
s_or_b64 s[4:5], s[8:9], s[4:5] | |
v_subrev_f32_e32 v78, v72, v87 | |
v_mul_f32_e32 v83, v80, v80 | |
v_cndmask_b32_e64 v84, 0, 1.0, s[4:5] | |
v_subrev_f32_e32 v81, v74, v89 | |
v_mac_f32_e32 v83, v78, v78 | |
v_mac_f32_e32 v83, v81, v81 | |
v_mul_f32_e32 v84, s22, v84 | |
v_subrev_f32_e32 v67, v75, v90 | |
v_cmp_lt_f32_e32 vcc, v83, v84 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[34:35], exec, s[4:5] | |
; mask branch BB6_106 | |
s_cbranch_execz BB6_106 | |
BB6_105: ; in Loop: Header=BB6_11 Depth=1 | |
v_max_f32_e32 v85, 0x34cd15ae, v83 | |
v_rsq_f32_e32 v89, v85 | |
v_lshrrev_b32_e32 v83, 18, v65 | |
v_and_b32_e32 v83, 1, v83 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e32 vcc, 1, v83 | |
ds_read_b64 v[83:84], v56 offset:128 | |
v_mul_f32_e32 v93, v89, v89 | |
v_cndmask_b32_e64 v88, 0, 1.0, vcc | |
v_mul_f32_e32 v94, v93, v93 | |
v_mul_f32_e32 v94, v88, v94 | |
v_mul_f32_e32 v95, v93, v94 | |
v_mad_f32 v94, v94, v93, s23 | |
v_mul_f32_e32 v87, v75, v90 | |
v_mul_f32_e32 v90, v9, v85 | |
v_mul_f32_e32 v91, v90, v90 | |
v_mov_b32_e32 v92, 0x3a92b707 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v83, v70, v83 | |
v_mul_f32_e32 v84, v71, v84 | |
v_mul_f32_e32 v94, 0xbe2aaaab, v94 | |
v_madak_f32_e32 v92, v92, v91, 0x3ded3cb2 | |
v_mad_f32 v96, v84, v95, -v83 | |
v_mul_f32_e32 v97, v93, v95 | |
v_mad_f32 v95, v95, v95, s26 | |
v_mul_f32_e32 v83, v83, v94 | |
v_mov_b32_e32 v94, 0x3c739487 | |
v_mul_f32_e32 v84, v95, v84 | |
v_madak_f32_e32 v94, v94, v91, 0x3f01e2bc | |
v_mad_f32 v92, v92, v91, 1.0 | |
v_mac_f32_e32 v83, 0x3daaaaaa, v84 | |
v_mac_f32_e32 v92, v90, v94 | |
v_mov_b32_e32 v94, 0xb2951928 | |
v_mac_f32_e32 v8, v88, v83 | |
v_rcp_f32_e32 v83, v92 | |
v_madak_f32_e32 v94, v94, v91, 0xb85ffb93 | |
v_mov_b32_e32 v95, 0x35c55945 | |
v_madak_f32_e32 v95, v95, v91, 0x3a83ca0c | |
v_madak_f32_e32 v94, v94, v91, 0xbc9ded90 | |
v_madak_f32_e32 v95, v95, v91, 0x3d8eaf3b | |
v_madak_f32_e32 v91, v94, v91, 0xbf409397 | |
v_mac_f32_e32 v91, v90, v95 | |
v_mul_f32_e32 v83, v37, v83 | |
v_mul_f32_e32 v84, v88, v93 | |
v_mul_f32_e32 v83, v91, v83 | |
v_mul_f32_e32 v96, v96, v97 | |
v_mac_f32_e32 v83, v89, v84 | |
v_mac_f32_e32 v96, v83, v87 | |
v_mul_f32_e32 v83, s18, v85 | |
v_mul_f32_e32 v83, v89, v83 | |
v_and_b32_e32 v84, s27, v83 | |
v_mov_b32_e32 v85, 0x3fa00000 | |
v_cmp_gt_f32_e64 s[4:5], v85, v84 | |
v_mul_f32_e32 v85, v84, v84 | |
v_rcp_f32_e32 v90, v85 | |
v_add_f32_e32 v91, -1.0, v84 | |
v_mov_b32_e32 v93, 0xbd777f97 | |
v_mov_b32_e32 v94, 0x4036db6e | |
v_cndmask_b32_e64 v90, v90, v91, s[4:5] | |
v_mov_b32_e32 v91, 0x3f580000 | |
v_cmp_gt_f32_e64 s[8:9], v91, v84 | |
v_cndmask_b32_e64 v85, v90, v85, s[8:9] | |
v_mov_b32_e32 v91, 0xc1b38712 | |
v_madak_f32_e32 v93, v93, v85, 0x40d23f7c | |
v_madak_f32_e32 v91, v91, v85, 0x43ed43a7 | |
v_madak_f32_e32 v93, v85, v93, 0x42d9451f | |
v_madak_f32_e32 v91, v85, v91, 0x451f90ce | |
v_madak_f32_e32 v93, v85, v93, 0x43d6810b | |
v_madak_f32_e32 v91, v85, v91, 0x4547fdbb | |
v_madak_f32_e32 v93, v85, v93, 0x442158c9 | |
v_madak_f32_e32 v91, v85, v91, 0x44c01759 | |
v_madak_f32_e32 v93, v85, v93, 0x43d9486f | |
v_madak_f32_e32 v91, v85, v91, 0x43a2e571 | |
v_madak_f32_e32 v93, v85, v93, 0x4309a863 | |
v_cmp_gt_f32_e32 vcc, v94, v84 | |
v_madak_f32_e32 v91, v85, v91, 0x41f2b459 | |
v_madak_f32_e32 v93, v85, v93, 0x419d35ce | |
v_cndmask_b32_e32 v91, v91, v93, vcc | |
v_mov_b32_e32 v93, 0x3c445aa3 | |
v_madak_f32_e32 v93, v93, v85, 0x3c5f6e13 | |
v_madak_f32_e32 v93, v85, v93, 0x3e013307 | |
v_madak_f32_e32 v93, v85, v93, 0x3d931ae7 | |
v_madak_f32_e32 v93, v85, v93, 0x3f0a5785 | |
v_mov_b32_e32 v92, 0xc11d077e | |
v_madak_f32_e32 v93, v85, v93, 0x3dd9f331 | |
v_mov_b32_e32 v90, 0xc3f1c275 | |
v_madak_f32_e32 v92, v92, v85, 0xc2a2932b | |
v_cndmask_b32_e64 v91, v91, v93, s[4:5] | |
v_mov_b32_e32 v93, 0xb684e21a | |
v_madak_f32_e32 v90, v90, v85, 0xc480230b | |
v_madak_f32_e32 v92, v85, v92, 0xc3389ae7 | |
v_madak_f32_e32 v93, v93, v85, 0x390aee49 | |
v_madak_f32_e32 v90, v85, v90, 0xc41f6441 | |
v_madak_f32_e32 v92, v85, v92, 0xc322658c | |
v_madak_f32_e32 v93, v85, v93, 0x3ba68116 | |
v_madak_f32_e32 v90, v85, v90, 0xc320a2ea | |
v_madak_f32_e32 v92, v85, v92, 0xc2798057 | |
v_madak_f32_e32 v93, v85, v93, 0x3d852a63 | |
v_madak_f32_e32 v90, v85, v90, 0xc18e104b | |
v_madak_f32_e32 v92, v85, v92, 0xc128f022 | |
v_madak_f32_e32 v93, v85, v93, 0x3ecbbbce | |
v_madak_f32_e32 v90, v85, v90, 0xbf4c9dd4 | |
v_madak_f32_e32 v92, v85, v92, 0xbf31a0b7 | |
v_cndmask_b32_e64 v91, v91, v93, s[8:9] | |
v_and_b32_e32 v93, s50, v83 | |
v_mov_b32_e32 v94, 0xbf100000 | |
v_madak_f32_e32 v90, v85, v90, 0xbc21a092 | |
v_madak_f32_e32 v92, v85, v92, 0xbc21a093 | |
v_mad_f32 v94, v93, -v93, v94 | |
v_cndmask_b32_e32 v90, v90, v92, vcc | |
v_cmp_gt_f32_e32 vcc, 0, v94 | |
v_cndmask_b32_e64 v95, 0.5, -0.5, vcc | |
v_mov_b32_e32 v97, 0x3fb8aa3b | |
v_mac_f32_e32 v95, v97, v94 | |
v_cvt_i32_f32_e32 v95, v95 | |
v_mov_b32_e32 v92, 0xbb0df9c0 | |
v_madak_f32_e32 v92, v92, v85, 0x3d1151b3 | |
v_madak_f32_e32 v92, v85, v92, 0xbde31cc2 | |
v_cvt_f32_i32_e32 v100, v95 | |
v_madak_f32_e32 v92, v85, v92, 0x3ea2fe54 | |
v_mov_b32_e32 v101, 0xbf317180 | |
v_madak_f32_e32 v92, v85, v92, 0xbebe9208 | |
v_mad_f32 v102, v101, v100, v94 | |
v_mov_b32_e32 v103, 0xb717f7d1 | |
v_madak_f32_e32 v92, v85, v92, 0x3ed46805 | |
v_mad_f32 v104, v103, v100, v102 | |
v_madak_f32_e32 v92, v85, v92, 0xbb1acdc6 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mov_b32_e32 v106, 0xb5ddea0e | |
v_mov_b32_e32 v107, 0x3331bb4c | |
v_cndmask_b32_e64 v90, v90, v92, s[4:5] | |
v_mov_b32_e32 v92, 0xb7c756b1 | |
v_mad_f32 v108, v107, v105, v106 | |
v_mov_b32_e32 v109, 0x388ab355 | |
v_madak_f32_e32 v92, v92, v85, 0xbbbd1489 | |
v_mad_f32 v108, v108, v105, v109 | |
v_mov_b32_e32 v110, 0xbb360b61 | |
v_madak_f32_e32 v92, v85, v92, 0xbce9528f | |
v_mad_f32 v108, v108, v105, v110 | |
v_mov_b32_e32 v111, 0x3e2aaaab | |
v_madak_f32_e32 v92, v85, v92, 0xbea66beb | |
v_mad_f32 v108, v108, v105, v111 | |
v_madak_f32_e32 v92, v85, v92, 0x3e0375d4 | |
v_mad_f32 v105, -v105, v108, v104 | |
v_mad_f32 v85, v85, v91, 1.0 | |
v_mov_b32_e32 v91, 0x6f800000 | |
v_cmp_gt_f32_e64 vcc, |v85|, v91 | |
v_mov_b32_e32 v98, 0x2f800000 | |
v_sub_f32_e32 v108, 2.0, v105 | |
v_cndmask_b32_e32 v99, 1.0, v98, vcc | |
v_cmp_gt_f32_e64 vcc, |v108|, v91 | |
v_mul_f32_e32 v85, v99, v85 | |
v_cndmask_b32_e32 v112, 1.0, v98, vcc | |
v_mul_f32_e64 v108, v108, -v112 | |
v_rcp_f32_e32 v85, v85 | |
v_rcp_f32_e32 v108, v108 | |
v_cndmask_b32_e64 v90, v90, v92, s[8:9] | |
v_lshlrev_b32_e32 v92, 23, v95 | |
v_mul_f32_e32 v85, v85, v90 | |
v_mul_f32_e32 v90, v105, v104 | |
v_mul_f32_e32 v90, v108, v90 | |
v_mul_f32_e32 v90, v90, v112 | |
v_mad_f32 v90, -v100, v103, v90 | |
v_subrev_f32_e32 v90, v102, v90 | |
v_sub_f32_e32 v90, 1.0, v90 | |
v_add_i32_e32 v90, vcc, v90, v92 | |
v_mov_b32_e32 v92, 0xc2aeac4f | |
v_cmp_nlt_f32_e32 vcc, v94, v92 | |
v_mov_b32_e32 v95, 0x42b17218 | |
v_cndmask_b32_e32 v90, 0, v90, vcc | |
v_cmp_lt_f32_e32 vcc, v94, v95 | |
v_mov_b32_e32 v100, 0x7f800000 | |
v_cndmask_b32_e32 v90, v100, v90, vcc | |
v_cmp_u_f32_e32 vcc, v94, v94 | |
v_cndmask_b32_e32 v90, v90, v94, vcc | |
v_subrev_f32_e32 v94, v84, v93 | |
v_mul_f32_e32 v102, v85, v99 | |
v_add_f32_e32 v93, v84, v93 | |
v_mad_f32 v93, v93, v94, v102 | |
v_cmp_gt_f32_e32 vcc, 0, v93 | |
v_cndmask_b32_e64 v94, 0.5, -0.5, vcc | |
v_mac_f32_e32 v94, v97, v93 | |
v_cvt_i32_f32_e32 v94, v94 | |
v_madak_f32_e32 v85, v99, v85, 0x3f58560b | |
v_mad_f32 v82, -v96, v81, v82 | |
v_mad_f32 v34, v81, v96, v34 | |
v_cvt_f32_i32_e32 v97, v94 | |
v_lshlrev_b32_e32 v94, 23, v94 | |
v_mad_f32 v79, -v96, v80, v79 | |
v_mad_f32 v33, v80, v96, v33 | |
v_mad_f32 v101, v101, v97, v93 | |
v_mad_f32 v104, v103, v97, v101 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mac_f32_e32 v106, v107, v105 | |
v_mac_f32_e32 v109, v106, v105 | |
v_mac_f32_e32 v110, v109, v105 | |
v_mac_f32_e32 v111, v110, v105 | |
v_mad_f32 v105, -v105, v111, v104 | |
v_mul_f32_e32 v104, v105, v104 | |
v_sub_f32_e32 v105, 2.0, v105 | |
v_cmp_gt_f32_e64 vcc, |v105|, v91 | |
v_cndmask_b32_e32 v106, 1.0, v98, vcc | |
v_mul_f32_e64 v105, v105, -v106 | |
v_rcp_f32_e32 v105, v105 | |
v_mad_f32 v76, -v96, v78, v76 | |
v_mad_f32 v32, v78, v96, v32 | |
v_mad_f32 v86, -v67, v0, v86 | |
v_mul_f32_e32 v104, v105, v104 | |
v_mul_f32_e32 v104, v104, v106 | |
v_mad_f32 v97, -v97, v103, v104 | |
v_subrev_f32_e32 v97, v101, v97 | |
v_sub_f32_e32 v97, 1.0, v97 | |
v_add_i32_e32 v94, vcc, v97, v94 | |
v_cmp_nlt_f32_e32 vcc, v93, v92 | |
v_cndmask_b32_e32 v92, 0, v94, vcc | |
v_cmp_lt_f32_e32 vcc, v93, v95 | |
v_cndmask_b32_e32 v92, v100, v92, vcc | |
v_cmp_gt_f32_e64 vcc, |v84|, v91 | |
v_cndmask_b32_e32 v91, 1.0, v98, vcc | |
v_mul_f32_e32 v94, v91, v84 | |
v_rcp_f32_e32 v94, v94 | |
v_cmp_u_f32_e32 vcc, v93, v93 | |
v_cndmask_b32_e32 v92, v92, v93, vcc | |
v_mul_f32_e32 v90, v92, v90 | |
v_mov_b32_e32 v92, 0x40c00000 | |
v_cmp_gt_f32_e32 vcc, v92, v84 | |
v_mov_b32_e32 v92, 0x31800000 | |
v_mul_f32_e32 v90, v94, v90 | |
v_cmp_gt_f32_e64 s[10:11], v92, v84 | |
v_mad_f32 v84, -v91, v90, 1.0 | |
v_cndmask_b32_e32 v84, 1.0, v84, vcc | |
v_cndmask_b32_e64 v84, v84, v85, s[4:5] | |
v_and_b32_e32 v85, s51, v83 | |
v_or_b32_e32 v84, v85, v84 | |
v_mad_f32 v85, v102, v83, v83 | |
v_cndmask_b32_e64 v84, v84, v85, s[8:9] | |
v_mul_f32_e32 v85, 0x3f8375d4, v83 | |
v_mac_f32_e32 v85, 0x41000000, v83 | |
v_mul_f32_e32 v85, 0x3e000000, v85 | |
v_cndmask_b32_e64 v84, v84, v85, s[10:11] | |
v_cmp_u_f32_e32 vcc, v83, v83 | |
v_cndmask_b32_e32 v83, v84, v83, vcc | |
v_subrev_f32_e32 v83, v83, v88 | |
v_mul_f32_e64 v84, s19, -v88 | |
v_mac_f32_e32 v84, v83, v89 | |
v_mac_f32_e32 v5, v84, v87 | |
v_mac_f32_e32 v35, v0, v67 | |
BB6_106: ; %Flow1229 | |
; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[34:35] | |
BB6_107: ; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[12:13] | |
v_lshrrev_b32_e32 v67, 19, v69 | |
v_and_b32_e32 v67, 1, v67 | |
v_cmp_eq_u32_e32 vcc, 1, v67 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[12:13], exec, s[4:5] | |
; mask branch BB6_111 | |
s_cbranch_execz BB6_111 | |
BB6_108: ; in Loop: Header=BB6_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[87:90], v55 offset0:48 offset1:49 | |
s_and_b64 s[4:5], exec, s[0:1] | |
v_cmp_ne_u32_e32 vcc, v62, v77 | |
s_xor_b64 s[4:5], s[4:5], -1 | |
s_or_b64 s[4:5], s[4:5], vcc | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v80, v73, v88 | |
s_and_b64 s[8:9], exec, s[2:3] | |
s_or_b64 s[4:5], s[8:9], s[4:5] | |
v_subrev_f32_e32 v78, v72, v87 | |
v_mul_f32_e32 v83, v80, v80 | |
v_cndmask_b32_e64 v84, 0, 1.0, s[4:5] | |
v_subrev_f32_e32 v81, v74, v89 | |
v_mac_f32_e32 v83, v78, v78 | |
v_mac_f32_e32 v83, v81, v81 | |
v_mul_f32_e32 v84, s22, v84 | |
v_subrev_f32_e32 v67, v75, v90 | |
v_cmp_lt_f32_e32 vcc, v83, v84 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[34:35], exec, s[4:5] | |
; mask branch BB6_110 | |
s_cbranch_execz BB6_110 | |
BB6_109: ; in Loop: Header=BB6_11 Depth=1 | |
v_max_f32_e32 v85, 0x34cd15ae, v83 | |
v_rsq_f32_e32 v89, v85 | |
v_lshrrev_b32_e32 v83, 19, v65 | |
v_and_b32_e32 v83, 1, v83 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e32 vcc, 1, v83 | |
ds_read_b64 v[83:84], v56 offset:192 | |
v_mul_f32_e32 v93, v89, v89 | |
v_cndmask_b32_e64 v88, 0, 1.0, vcc | |
v_mul_f32_e32 v94, v93, v93 | |
v_mul_f32_e32 v94, v88, v94 | |
v_mul_f32_e32 v95, v93, v94 | |
v_mad_f32 v94, v94, v93, s23 | |
v_mul_f32_e32 v87, v75, v90 | |
v_mul_f32_e32 v90, v9, v85 | |
v_mul_f32_e32 v91, v90, v90 | |
v_mov_b32_e32 v92, 0x3a92b707 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v83, v70, v83 | |
v_mul_f32_e32 v84, v71, v84 | |
v_mul_f32_e32 v94, 0xbe2aaaab, v94 | |
v_madak_f32_e32 v92, v92, v91, 0x3ded3cb2 | |
v_mad_f32 v96, v84, v95, -v83 | |
v_mul_f32_e32 v97, v93, v95 | |
v_mad_f32 v95, v95, v95, s26 | |
v_mul_f32_e32 v83, v83, v94 | |
v_mov_b32_e32 v94, 0x3c739487 | |
v_mul_f32_e32 v84, v95, v84 | |
v_madak_f32_e32 v94, v94, v91, 0x3f01e2bc | |
v_mad_f32 v92, v92, v91, 1.0 | |
v_mac_f32_e32 v83, 0x3daaaaaa, v84 | |
v_mac_f32_e32 v92, v90, v94 | |
v_mov_b32_e32 v94, 0xb2951928 | |
v_mac_f32_e32 v8, v88, v83 | |
v_rcp_f32_e32 v83, v92 | |
v_madak_f32_e32 v94, v94, v91, 0xb85ffb93 | |
v_mov_b32_e32 v95, 0x35c55945 | |
v_madak_f32_e32 v95, v95, v91, 0x3a83ca0c | |
v_madak_f32_e32 v94, v94, v91, 0xbc9ded90 | |
v_madak_f32_e32 v95, v95, v91, 0x3d8eaf3b | |
v_madak_f32_e32 v91, v94, v91, 0xbf409397 | |
v_mac_f32_e32 v91, v90, v95 | |
v_mul_f32_e32 v83, v37, v83 | |
v_mul_f32_e32 v84, v88, v93 | |
v_mul_f32_e32 v83, v91, v83 | |
v_mul_f32_e32 v96, v96, v97 | |
v_mac_f32_e32 v83, v89, v84 | |
v_mac_f32_e32 v96, v83, v87 | |
v_mul_f32_e32 v83, s18, v85 | |
v_mul_f32_e32 v83, v89, v83 | |
v_and_b32_e32 v84, s27, v83 | |
v_mov_b32_e32 v85, 0x3fa00000 | |
v_cmp_gt_f32_e64 s[4:5], v85, v84 | |
v_mul_f32_e32 v85, v84, v84 | |
v_rcp_f32_e32 v90, v85 | |
v_add_f32_e32 v91, -1.0, v84 | |
v_mov_b32_e32 v93, 0xbd777f97 | |
v_mov_b32_e32 v94, 0x4036db6e | |
v_cndmask_b32_e64 v90, v90, v91, s[4:5] | |
v_mov_b32_e32 v91, 0x3f580000 | |
v_cmp_gt_f32_e64 s[8:9], v91, v84 | |
v_cndmask_b32_e64 v85, v90, v85, s[8:9] | |
v_mov_b32_e32 v91, 0xc1b38712 | |
v_madak_f32_e32 v93, v93, v85, 0x40d23f7c | |
v_madak_f32_e32 v91, v91, v85, 0x43ed43a7 | |
v_madak_f32_e32 v93, v85, v93, 0x42d9451f | |
v_madak_f32_e32 v91, v85, v91, 0x451f90ce | |
v_madak_f32_e32 v93, v85, v93, 0x43d6810b | |
v_madak_f32_e32 v91, v85, v91, 0x4547fdbb | |
v_madak_f32_e32 v93, v85, v93, 0x442158c9 | |
v_madak_f32_e32 v91, v85, v91, 0x44c01759 | |
v_madak_f32_e32 v93, v85, v93, 0x43d9486f | |
v_madak_f32_e32 v91, v85, v91, 0x43a2e571 | |
v_madak_f32_e32 v93, v85, v93, 0x4309a863 | |
v_cmp_gt_f32_e32 vcc, v94, v84 | |
v_madak_f32_e32 v91, v85, v91, 0x41f2b459 | |
v_madak_f32_e32 v93, v85, v93, 0x419d35ce | |
v_cndmask_b32_e32 v91, v91, v93, vcc | |
v_mov_b32_e32 v93, 0x3c445aa3 | |
v_madak_f32_e32 v93, v93, v85, 0x3c5f6e13 | |
v_madak_f32_e32 v93, v85, v93, 0x3e013307 | |
v_madak_f32_e32 v93, v85, v93, 0x3d931ae7 | |
v_madak_f32_e32 v93, v85, v93, 0x3f0a5785 | |
v_mov_b32_e32 v92, 0xc11d077e | |
v_madak_f32_e32 v93, v85, v93, 0x3dd9f331 | |
v_mov_b32_e32 v90, 0xc3f1c275 | |
v_madak_f32_e32 v92, v92, v85, 0xc2a2932b | |
v_cndmask_b32_e64 v91, v91, v93, s[4:5] | |
v_mov_b32_e32 v93, 0xb684e21a | |
v_madak_f32_e32 v90, v90, v85, 0xc480230b | |
v_madak_f32_e32 v92, v85, v92, 0xc3389ae7 | |
v_madak_f32_e32 v93, v93, v85, 0x390aee49 | |
v_madak_f32_e32 v90, v85, v90, 0xc41f6441 | |
v_madak_f32_e32 v92, v85, v92, 0xc322658c | |
v_madak_f32_e32 v93, v85, v93, 0x3ba68116 | |
v_madak_f32_e32 v90, v85, v90, 0xc320a2ea | |
v_madak_f32_e32 v92, v85, v92, 0xc2798057 | |
v_madak_f32_e32 v93, v85, v93, 0x3d852a63 | |
v_madak_f32_e32 v90, v85, v90, 0xc18e104b | |
v_madak_f32_e32 v92, v85, v92, 0xc128f022 | |
v_madak_f32_e32 v93, v85, v93, 0x3ecbbbce | |
v_madak_f32_e32 v90, v85, v90, 0xbf4c9dd4 | |
v_madak_f32_e32 v92, v85, v92, 0xbf31a0b7 | |
v_cndmask_b32_e64 v91, v91, v93, s[8:9] | |
v_and_b32_e32 v93, s50, v83 | |
v_mov_b32_e32 v94, 0xbf100000 | |
v_madak_f32_e32 v90, v85, v90, 0xbc21a092 | |
v_madak_f32_e32 v92, v85, v92, 0xbc21a093 | |
v_mad_f32 v94, v93, -v93, v94 | |
v_cndmask_b32_e32 v90, v90, v92, vcc | |
v_cmp_gt_f32_e32 vcc, 0, v94 | |
v_cndmask_b32_e64 v95, 0.5, -0.5, vcc | |
v_mov_b32_e32 v97, 0x3fb8aa3b | |
v_mac_f32_e32 v95, v97, v94 | |
v_cvt_i32_f32_e32 v95, v95 | |
v_mov_b32_e32 v92, 0xbb0df9c0 | |
v_madak_f32_e32 v92, v92, v85, 0x3d1151b3 | |
v_madak_f32_e32 v92, v85, v92, 0xbde31cc2 | |
v_cvt_f32_i32_e32 v100, v95 | |
v_madak_f32_e32 v92, v85, v92, 0x3ea2fe54 | |
v_mov_b32_e32 v101, 0xbf317180 | |
v_madak_f32_e32 v92, v85, v92, 0xbebe9208 | |
v_mad_f32 v102, v101, v100, v94 | |
v_mov_b32_e32 v103, 0xb717f7d1 | |
v_madak_f32_e32 v92, v85, v92, 0x3ed46805 | |
v_mad_f32 v104, v103, v100, v102 | |
v_madak_f32_e32 v92, v85, v92, 0xbb1acdc6 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mov_b32_e32 v106, 0xb5ddea0e | |
v_mov_b32_e32 v107, 0x3331bb4c | |
v_cndmask_b32_e64 v90, v90, v92, s[4:5] | |
v_mov_b32_e32 v92, 0xb7c756b1 | |
v_mad_f32 v108, v107, v105, v106 | |
v_mov_b32_e32 v109, 0x388ab355 | |
v_madak_f32_e32 v92, v92, v85, 0xbbbd1489 | |
v_mad_f32 v108, v108, v105, v109 | |
v_mov_b32_e32 v110, 0xbb360b61 | |
v_madak_f32_e32 v92, v85, v92, 0xbce9528f | |
v_mad_f32 v108, v108, v105, v110 | |
v_mov_b32_e32 v111, 0x3e2aaaab | |
v_madak_f32_e32 v92, v85, v92, 0xbea66beb | |
v_mad_f32 v108, v108, v105, v111 | |
v_madak_f32_e32 v92, v85, v92, 0x3e0375d4 | |
v_mad_f32 v105, -v105, v108, v104 | |
v_mad_f32 v85, v85, v91, 1.0 | |
v_mov_b32_e32 v91, 0x6f800000 | |
v_cmp_gt_f32_e64 vcc, |v85|, v91 | |
v_mov_b32_e32 v98, 0x2f800000 | |
v_sub_f32_e32 v108, 2.0, v105 | |
v_cndmask_b32_e32 v99, 1.0, v98, vcc | |
v_cmp_gt_f32_e64 vcc, |v108|, v91 | |
v_mul_f32_e32 v85, v99, v85 | |
v_cndmask_b32_e32 v112, 1.0, v98, vcc | |
v_mul_f32_e64 v108, v108, -v112 | |
v_rcp_f32_e32 v85, v85 | |
v_rcp_f32_e32 v108, v108 | |
v_cndmask_b32_e64 v90, v90, v92, s[8:9] | |
v_lshlrev_b32_e32 v92, 23, v95 | |
v_mul_f32_e32 v85, v85, v90 | |
v_mul_f32_e32 v90, v105, v104 | |
v_mul_f32_e32 v90, v108, v90 | |
v_mul_f32_e32 v90, v90, v112 | |
v_mad_f32 v90, -v100, v103, v90 | |
v_subrev_f32_e32 v90, v102, v90 | |
v_sub_f32_e32 v90, 1.0, v90 | |
v_add_i32_e32 v90, vcc, v90, v92 | |
v_mov_b32_e32 v92, 0xc2aeac4f | |
v_cmp_nlt_f32_e32 vcc, v94, v92 | |
v_mov_b32_e32 v95, 0x42b17218 | |
v_cndmask_b32_e32 v90, 0, v90, vcc | |
v_cmp_lt_f32_e32 vcc, v94, v95 | |
v_mov_b32_e32 v100, 0x7f800000 | |
v_cndmask_b32_e32 v90, v100, v90, vcc | |
v_cmp_u_f32_e32 vcc, v94, v94 | |
v_cndmask_b32_e32 v90, v90, v94, vcc | |
v_subrev_f32_e32 v94, v84, v93 | |
v_mul_f32_e32 v102, v85, v99 | |
v_add_f32_e32 v93, v84, v93 | |
v_mad_f32 v93, v93, v94, v102 | |
v_cmp_gt_f32_e32 vcc, 0, v93 | |
v_cndmask_b32_e64 v94, 0.5, -0.5, vcc | |
v_mac_f32_e32 v94, v97, v93 | |
v_cvt_i32_f32_e32 v94, v94 | |
v_madak_f32_e32 v85, v99, v85, 0x3f58560b | |
v_mad_f32 v82, -v96, v81, v82 | |
v_mad_f32 v30, v81, v96, v30 | |
v_cvt_f32_i32_e32 v97, v94 | |
v_lshlrev_b32_e32 v94, 23, v94 | |
v_mad_f32 v79, -v96, v80, v79 | |
v_mad_f32 v29, v80, v96, v29 | |
v_mad_f32 v101, v101, v97, v93 | |
v_mad_f32 v104, v103, v97, v101 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mac_f32_e32 v106, v107, v105 | |
v_mac_f32_e32 v109, v106, v105 | |
v_mac_f32_e32 v110, v109, v105 | |
v_mac_f32_e32 v111, v110, v105 | |
v_mad_f32 v105, -v105, v111, v104 | |
v_mul_f32_e32 v104, v105, v104 | |
v_sub_f32_e32 v105, 2.0, v105 | |
v_cmp_gt_f32_e64 vcc, |v105|, v91 | |
v_cndmask_b32_e32 v106, 1.0, v98, vcc | |
v_mul_f32_e64 v105, v105, -v106 | |
v_rcp_f32_e32 v105, v105 | |
v_mad_f32 v76, -v96, v78, v76 | |
v_mad_f32 v28, v78, v96, v28 | |
v_mad_f32 v86, -v67, v0, v86 | |
v_mul_f32_e32 v104, v105, v104 | |
v_mul_f32_e32 v104, v104, v106 | |
v_mad_f32 v97, -v97, v103, v104 | |
v_subrev_f32_e32 v97, v101, v97 | |
v_sub_f32_e32 v97, 1.0, v97 | |
v_add_i32_e32 v94, vcc, v97, v94 | |
v_cmp_nlt_f32_e32 vcc, v93, v92 | |
v_cndmask_b32_e32 v92, 0, v94, vcc | |
v_cmp_lt_f32_e32 vcc, v93, v95 | |
v_cndmask_b32_e32 v92, v100, v92, vcc | |
v_cmp_gt_f32_e64 vcc, |v84|, v91 | |
v_cndmask_b32_e32 v91, 1.0, v98, vcc | |
v_mul_f32_e32 v94, v91, v84 | |
v_rcp_f32_e32 v94, v94 | |
v_cmp_u_f32_e32 vcc, v93, v93 | |
v_cndmask_b32_e32 v92, v92, v93, vcc | |
v_mul_f32_e32 v90, v92, v90 | |
v_mov_b32_e32 v92, 0x40c00000 | |
v_cmp_gt_f32_e32 vcc, v92, v84 | |
v_mov_b32_e32 v92, 0x31800000 | |
v_mul_f32_e32 v90, v94, v90 | |
v_cmp_gt_f32_e64 s[10:11], v92, v84 | |
v_mad_f32 v84, -v91, v90, 1.0 | |
v_cndmask_b32_e32 v84, 1.0, v84, vcc | |
v_cndmask_b32_e64 v84, v84, v85, s[4:5] | |
v_and_b32_e32 v85, s51, v83 | |
v_or_b32_e32 v84, v85, v84 | |
v_mad_f32 v85, v102, v83, v83 | |
v_cndmask_b32_e64 v84, v84, v85, s[8:9] | |
v_mul_f32_e32 v85, 0x3f8375d4, v83 | |
v_mac_f32_e32 v85, 0x41000000, v83 | |
v_mul_f32_e32 v85, 0x3e000000, v85 | |
v_cndmask_b32_e64 v84, v84, v85, s[10:11] | |
v_cmp_u_f32_e32 vcc, v83, v83 | |
v_cndmask_b32_e32 v83, v84, v83, vcc | |
v_subrev_f32_e32 v83, v83, v88 | |
v_mul_f32_e64 v84, s19, -v88 | |
v_mac_f32_e32 v84, v83, v89 | |
v_mac_f32_e32 v5, v84, v87 | |
v_mac_f32_e32 v31, v0, v67 | |
BB6_110: ; %Flow1228 | |
; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[34:35] | |
BB6_111: ; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[12:13] | |
v_lshrrev_b32_e32 v67, 20, v69 | |
v_and_b32_e32 v67, 1, v67 | |
v_cmp_eq_u32_e32 vcc, 1, v67 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[12:13], exec, s[4:5] | |
; mask branch BB6_115 | |
s_cbranch_execz BB6_115 | |
BB6_112: ; in Loop: Header=BB6_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[87:90], v55 offset0:64 offset1:65 | |
s_and_b64 s[4:5], exec, s[0:1] | |
v_cmp_ne_u32_e32 vcc, v61, v77 | |
s_xor_b64 s[4:5], s[4:5], -1 | |
s_or_b64 s[4:5], s[4:5], vcc | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v80, v73, v88 | |
s_and_b64 s[8:9], exec, s[2:3] | |
s_or_b64 s[4:5], s[8:9], s[4:5] | |
v_subrev_f32_e32 v78, v72, v87 | |
v_mul_f32_e32 v83, v80, v80 | |
v_cndmask_b32_e64 v84, 0, 1.0, s[4:5] | |
v_subrev_f32_e32 v81, v74, v89 | |
v_mac_f32_e32 v83, v78, v78 | |
v_mac_f32_e32 v83, v81, v81 | |
v_mul_f32_e32 v84, s22, v84 | |
v_subrev_f32_e32 v67, v75, v90 | |
v_cmp_lt_f32_e32 vcc, v83, v84 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[34:35], exec, s[4:5] | |
; mask branch BB6_114 | |
s_cbranch_execz BB6_114 | |
BB6_113: ; in Loop: Header=BB6_11 Depth=1 | |
v_max_f32_e32 v85, 0x34cd15ae, v83 | |
v_rsq_f32_e32 v89, v85 | |
v_lshrrev_b32_e32 v83, 20, v65 | |
v_and_b32_e32 v83, 1, v83 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e32 vcc, 1, v83 | |
ds_read_b64 v[83:84], v56 offset:256 | |
v_mul_f32_e32 v93, v89, v89 | |
v_cndmask_b32_e64 v88, 0, 1.0, vcc | |
v_mul_f32_e32 v94, v93, v93 | |
v_mul_f32_e32 v94, v88, v94 | |
v_mul_f32_e32 v95, v93, v94 | |
v_mad_f32 v94, v94, v93, s23 | |
v_mul_f32_e32 v87, v75, v90 | |
v_mul_f32_e32 v90, v9, v85 | |
v_mul_f32_e32 v91, v90, v90 | |
v_mov_b32_e32 v92, 0x3a92b707 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v83, v70, v83 | |
v_mul_f32_e32 v84, v71, v84 | |
v_mul_f32_e32 v94, 0xbe2aaaab, v94 | |
v_madak_f32_e32 v92, v92, v91, 0x3ded3cb2 | |
v_mad_f32 v96, v84, v95, -v83 | |
v_mul_f32_e32 v97, v93, v95 | |
v_mad_f32 v95, v95, v95, s26 | |
v_mul_f32_e32 v83, v83, v94 | |
v_mov_b32_e32 v94, 0x3c739487 | |
v_mul_f32_e32 v84, v95, v84 | |
v_madak_f32_e32 v94, v94, v91, 0x3f01e2bc | |
v_mad_f32 v92, v92, v91, 1.0 | |
v_mac_f32_e32 v83, 0x3daaaaaa, v84 | |
v_mac_f32_e32 v92, v90, v94 | |
v_mov_b32_e32 v94, 0xb2951928 | |
v_mac_f32_e32 v8, v88, v83 | |
v_rcp_f32_e32 v83, v92 | |
v_madak_f32_e32 v94, v94, v91, 0xb85ffb93 | |
v_mov_b32_e32 v95, 0x35c55945 | |
v_madak_f32_e32 v95, v95, v91, 0x3a83ca0c | |
v_madak_f32_e32 v94, v94, v91, 0xbc9ded90 | |
v_madak_f32_e32 v95, v95, v91, 0x3d8eaf3b | |
v_madak_f32_e32 v91, v94, v91, 0xbf409397 | |
v_mac_f32_e32 v91, v90, v95 | |
v_mul_f32_e32 v83, v37, v83 | |
v_mul_f32_e32 v84, v88, v93 | |
v_mul_f32_e32 v83, v91, v83 | |
v_mul_f32_e32 v96, v96, v97 | |
v_mac_f32_e32 v83, v89, v84 | |
v_mac_f32_e32 v96, v83, v87 | |
v_mul_f32_e32 v83, s18, v85 | |
v_mul_f32_e32 v83, v89, v83 | |
v_and_b32_e32 v84, s27, v83 | |
v_mov_b32_e32 v85, 0x3fa00000 | |
v_cmp_gt_f32_e64 s[4:5], v85, v84 | |
v_mul_f32_e32 v85, v84, v84 | |
v_rcp_f32_e32 v90, v85 | |
v_add_f32_e32 v91, -1.0, v84 | |
v_mov_b32_e32 v93, 0xbd777f97 | |
v_mov_b32_e32 v94, 0x4036db6e | |
v_cndmask_b32_e64 v90, v90, v91, s[4:5] | |
v_mov_b32_e32 v91, 0x3f580000 | |
v_cmp_gt_f32_e64 s[8:9], v91, v84 | |
v_cndmask_b32_e64 v85, v90, v85, s[8:9] | |
v_mov_b32_e32 v91, 0xc1b38712 | |
v_madak_f32_e32 v93, v93, v85, 0x40d23f7c | |
v_madak_f32_e32 v91, v91, v85, 0x43ed43a7 | |
v_madak_f32_e32 v93, v85, v93, 0x42d9451f | |
v_madak_f32_e32 v91, v85, v91, 0x451f90ce | |
v_madak_f32_e32 v93, v85, v93, 0x43d6810b | |
v_madak_f32_e32 v91, v85, v91, 0x4547fdbb | |
v_madak_f32_e32 v93, v85, v93, 0x442158c9 | |
v_madak_f32_e32 v91, v85, v91, 0x44c01759 | |
v_madak_f32_e32 v93, v85, v93, 0x43d9486f | |
v_madak_f32_e32 v91, v85, v91, 0x43a2e571 | |
v_madak_f32_e32 v93, v85, v93, 0x4309a863 | |
v_cmp_gt_f32_e32 vcc, v94, v84 | |
v_madak_f32_e32 v91, v85, v91, 0x41f2b459 | |
v_madak_f32_e32 v93, v85, v93, 0x419d35ce | |
v_cndmask_b32_e32 v91, v91, v93, vcc | |
v_mov_b32_e32 v93, 0x3c445aa3 | |
v_madak_f32_e32 v93, v93, v85, 0x3c5f6e13 | |
v_madak_f32_e32 v93, v85, v93, 0x3e013307 | |
v_madak_f32_e32 v93, v85, v93, 0x3d931ae7 | |
v_madak_f32_e32 v93, v85, v93, 0x3f0a5785 | |
v_mov_b32_e32 v92, 0xc11d077e | |
v_madak_f32_e32 v93, v85, v93, 0x3dd9f331 | |
v_mov_b32_e32 v90, 0xc3f1c275 | |
v_madak_f32_e32 v92, v92, v85, 0xc2a2932b | |
v_cndmask_b32_e64 v91, v91, v93, s[4:5] | |
v_mov_b32_e32 v93, 0xb684e21a | |
v_madak_f32_e32 v90, v90, v85, 0xc480230b | |
v_madak_f32_e32 v92, v85, v92, 0xc3389ae7 | |
v_madak_f32_e32 v93, v93, v85, 0x390aee49 | |
v_madak_f32_e32 v90, v85, v90, 0xc41f6441 | |
v_madak_f32_e32 v92, v85, v92, 0xc322658c | |
v_madak_f32_e32 v93, v85, v93, 0x3ba68116 | |
v_madak_f32_e32 v90, v85, v90, 0xc320a2ea | |
v_madak_f32_e32 v92, v85, v92, 0xc2798057 | |
v_madak_f32_e32 v93, v85, v93, 0x3d852a63 | |
v_madak_f32_e32 v90, v85, v90, 0xc18e104b | |
v_madak_f32_e32 v92, v85, v92, 0xc128f022 | |
v_madak_f32_e32 v93, v85, v93, 0x3ecbbbce | |
v_madak_f32_e32 v90, v85, v90, 0xbf4c9dd4 | |
v_madak_f32_e32 v92, v85, v92, 0xbf31a0b7 | |
v_cndmask_b32_e64 v91, v91, v93, s[8:9] | |
v_and_b32_e32 v93, s50, v83 | |
v_mov_b32_e32 v94, 0xbf100000 | |
v_madak_f32_e32 v90, v85, v90, 0xbc21a092 | |
v_madak_f32_e32 v92, v85, v92, 0xbc21a093 | |
v_mad_f32 v94, v93, -v93, v94 | |
v_cndmask_b32_e32 v90, v90, v92, vcc | |
v_cmp_gt_f32_e32 vcc, 0, v94 | |
v_cndmask_b32_e64 v95, 0.5, -0.5, vcc | |
v_mov_b32_e32 v97, 0x3fb8aa3b | |
v_mac_f32_e32 v95, v97, v94 | |
v_cvt_i32_f32_e32 v95, v95 | |
v_mov_b32_e32 v92, 0xbb0df9c0 | |
v_madak_f32_e32 v92, v92, v85, 0x3d1151b3 | |
v_madak_f32_e32 v92, v85, v92, 0xbde31cc2 | |
v_cvt_f32_i32_e32 v100, v95 | |
v_madak_f32_e32 v92, v85, v92, 0x3ea2fe54 | |
v_mov_b32_e32 v101, 0xbf317180 | |
v_madak_f32_e32 v92, v85, v92, 0xbebe9208 | |
v_mad_f32 v102, v101, v100, v94 | |
v_mov_b32_e32 v103, 0xb717f7d1 | |
v_madak_f32_e32 v92, v85, v92, 0x3ed46805 | |
v_mad_f32 v104, v103, v100, v102 | |
v_madak_f32_e32 v92, v85, v92, 0xbb1acdc6 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mov_b32_e32 v106, 0xb5ddea0e | |
v_mov_b32_e32 v107, 0x3331bb4c | |
v_cndmask_b32_e64 v90, v90, v92, s[4:5] | |
v_mov_b32_e32 v92, 0xb7c756b1 | |
v_mad_f32 v108, v107, v105, v106 | |
v_mov_b32_e32 v109, 0x388ab355 | |
v_madak_f32_e32 v92, v92, v85, 0xbbbd1489 | |
v_mad_f32 v108, v108, v105, v109 | |
v_mov_b32_e32 v110, 0xbb360b61 | |
v_madak_f32_e32 v92, v85, v92, 0xbce9528f | |
v_mad_f32 v108, v108, v105, v110 | |
v_mov_b32_e32 v111, 0x3e2aaaab | |
v_madak_f32_e32 v92, v85, v92, 0xbea66beb | |
v_mad_f32 v108, v108, v105, v111 | |
v_madak_f32_e32 v92, v85, v92, 0x3e0375d4 | |
v_mad_f32 v105, -v105, v108, v104 | |
v_mad_f32 v85, v85, v91, 1.0 | |
v_mov_b32_e32 v91, 0x6f800000 | |
v_cmp_gt_f32_e64 vcc, |v85|, v91 | |
v_mov_b32_e32 v98, 0x2f800000 | |
v_sub_f32_e32 v108, 2.0, v105 | |
v_cndmask_b32_e32 v99, 1.0, v98, vcc | |
v_cmp_gt_f32_e64 vcc, |v108|, v91 | |
v_mul_f32_e32 v85, v99, v85 | |
v_cndmask_b32_e32 v112, 1.0, v98, vcc | |
v_mul_f32_e64 v108, v108, -v112 | |
v_rcp_f32_e32 v85, v85 | |
v_rcp_f32_e32 v108, v108 | |
v_cndmask_b32_e64 v90, v90, v92, s[8:9] | |
v_lshlrev_b32_e32 v92, 23, v95 | |
v_mul_f32_e32 v85, v85, v90 | |
v_mul_f32_e32 v90, v105, v104 | |
v_mul_f32_e32 v90, v108, v90 | |
v_mul_f32_e32 v90, v90, v112 | |
v_mad_f32 v90, -v100, v103, v90 | |
v_subrev_f32_e32 v90, v102, v90 | |
v_sub_f32_e32 v90, 1.0, v90 | |
v_add_i32_e32 v90, vcc, v90, v92 | |
v_mov_b32_e32 v92, 0xc2aeac4f | |
v_cmp_nlt_f32_e32 vcc, v94, v92 | |
v_mov_b32_e32 v95, 0x42b17218 | |
v_cndmask_b32_e32 v90, 0, v90, vcc | |
v_cmp_lt_f32_e32 vcc, v94, v95 | |
v_mov_b32_e32 v100, 0x7f800000 | |
v_cndmask_b32_e32 v90, v100, v90, vcc | |
v_cmp_u_f32_e32 vcc, v94, v94 | |
v_cndmask_b32_e32 v90, v90, v94, vcc | |
v_subrev_f32_e32 v94, v84, v93 | |
v_mul_f32_e32 v102, v85, v99 | |
v_add_f32_e32 v93, v84, v93 | |
v_mad_f32 v93, v93, v94, v102 | |
v_cmp_gt_f32_e32 vcc, 0, v93 | |
v_cndmask_b32_e64 v94, 0.5, -0.5, vcc | |
v_mac_f32_e32 v94, v97, v93 | |
v_cvt_i32_f32_e32 v94, v94 | |
v_madak_f32_e32 v85, v99, v85, 0x3f58560b | |
v_mad_f32 v82, -v96, v81, v82 | |
v_mad_f32 v26, v81, v96, v26 | |
v_cvt_f32_i32_e32 v97, v94 | |
v_lshlrev_b32_e32 v94, 23, v94 | |
v_mad_f32 v79, -v96, v80, v79 | |
v_mad_f32 v25, v80, v96, v25 | |
v_mad_f32 v101, v101, v97, v93 | |
v_mad_f32 v104, v103, v97, v101 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mac_f32_e32 v106, v107, v105 | |
v_mac_f32_e32 v109, v106, v105 | |
v_mac_f32_e32 v110, v109, v105 | |
v_mac_f32_e32 v111, v110, v105 | |
v_mad_f32 v105, -v105, v111, v104 | |
v_mul_f32_e32 v104, v105, v104 | |
v_sub_f32_e32 v105, 2.0, v105 | |
v_cmp_gt_f32_e64 vcc, |v105|, v91 | |
v_cndmask_b32_e32 v106, 1.0, v98, vcc | |
v_mul_f32_e64 v105, v105, -v106 | |
v_rcp_f32_e32 v105, v105 | |
v_mad_f32 v76, -v96, v78, v76 | |
v_mad_f32 v24, v78, v96, v24 | |
v_mad_f32 v86, -v67, v0, v86 | |
v_mul_f32_e32 v104, v105, v104 | |
v_mul_f32_e32 v104, v104, v106 | |
v_mad_f32 v97, -v97, v103, v104 | |
v_subrev_f32_e32 v97, v101, v97 | |
v_sub_f32_e32 v97, 1.0, v97 | |
v_add_i32_e32 v94, vcc, v97, v94 | |
v_cmp_nlt_f32_e32 vcc, v93, v92 | |
v_cndmask_b32_e32 v92, 0, v94, vcc | |
v_cmp_lt_f32_e32 vcc, v93, v95 | |
v_cndmask_b32_e32 v92, v100, v92, vcc | |
v_cmp_gt_f32_e64 vcc, |v84|, v91 | |
v_cndmask_b32_e32 v91, 1.0, v98, vcc | |
v_mul_f32_e32 v94, v91, v84 | |
v_rcp_f32_e32 v94, v94 | |
v_cmp_u_f32_e32 vcc, v93, v93 | |
v_cndmask_b32_e32 v92, v92, v93, vcc | |
v_mul_f32_e32 v90, v92, v90 | |
v_mov_b32_e32 v92, 0x40c00000 | |
v_cmp_gt_f32_e32 vcc, v92, v84 | |
v_mov_b32_e32 v92, 0x31800000 | |
v_mul_f32_e32 v90, v94, v90 | |
v_cmp_gt_f32_e64 s[10:11], v92, v84 | |
v_mad_f32 v84, -v91, v90, 1.0 | |
v_cndmask_b32_e32 v84, 1.0, v84, vcc | |
v_cndmask_b32_e64 v84, v84, v85, s[4:5] | |
v_and_b32_e32 v85, s51, v83 | |
v_or_b32_e32 v84, v85, v84 | |
v_mad_f32 v85, v102, v83, v83 | |
v_cndmask_b32_e64 v84, v84, v85, s[8:9] | |
v_mul_f32_e32 v85, 0x3f8375d4, v83 | |
v_mac_f32_e32 v85, 0x41000000, v83 | |
v_mul_f32_e32 v85, 0x3e000000, v85 | |
v_cndmask_b32_e64 v84, v84, v85, s[10:11] | |
v_cmp_u_f32_e32 vcc, v83, v83 | |
v_cndmask_b32_e32 v83, v84, v83, vcc | |
v_subrev_f32_e32 v83, v83, v88 | |
v_mul_f32_e64 v84, s19, -v88 | |
v_mac_f32_e32 v84, v83, v89 | |
v_mac_f32_e32 v5, v84, v87 | |
v_mac_f32_e32 v27, v0, v67 | |
BB6_114: ; %Flow1227 | |
; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[34:35] | |
BB6_115: ; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[12:13] | |
v_lshrrev_b32_e32 v67, 21, v69 | |
v_and_b32_e32 v67, 1, v67 | |
v_cmp_eq_u32_e32 vcc, 1, v67 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[12:13], exec, s[4:5] | |
; mask branch BB6_119 | |
s_cbranch_execz BB6_119 | |
BB6_116: ; in Loop: Header=BB6_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[87:90], v55 offset0:80 offset1:81 | |
s_and_b64 s[4:5], exec, s[0:1] | |
v_cmp_ne_u32_e32 vcc, v60, v77 | |
s_xor_b64 s[4:5], s[4:5], -1 | |
s_or_b64 s[4:5], s[4:5], vcc | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v80, v73, v88 | |
s_and_b64 s[8:9], exec, s[2:3] | |
s_or_b64 s[4:5], s[8:9], s[4:5] | |
v_subrev_f32_e32 v78, v72, v87 | |
v_mul_f32_e32 v83, v80, v80 | |
v_cndmask_b32_e64 v84, 0, 1.0, s[4:5] | |
v_subrev_f32_e32 v81, v74, v89 | |
v_mac_f32_e32 v83, v78, v78 | |
v_mac_f32_e32 v83, v81, v81 | |
v_mul_f32_e32 v84, s22, v84 | |
v_subrev_f32_e32 v67, v75, v90 | |
v_cmp_lt_f32_e32 vcc, v83, v84 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[34:35], exec, s[4:5] | |
; mask branch BB6_118 | |
s_cbranch_execz BB6_118 | |
BB6_117: ; in Loop: Header=BB6_11 Depth=1 | |
v_max_f32_e32 v85, 0x34cd15ae, v83 | |
v_rsq_f32_e32 v89, v85 | |
v_lshrrev_b32_e32 v83, 21, v65 | |
v_and_b32_e32 v83, 1, v83 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e32 vcc, 1, v83 | |
ds_read_b64 v[83:84], v56 offset:320 | |
v_mul_f32_e32 v93, v89, v89 | |
v_cndmask_b32_e64 v88, 0, 1.0, vcc | |
v_mul_f32_e32 v94, v93, v93 | |
v_mul_f32_e32 v94, v88, v94 | |
v_mul_f32_e32 v95, v93, v94 | |
v_mad_f32 v94, v94, v93, s23 | |
v_mul_f32_e32 v87, v75, v90 | |
v_mul_f32_e32 v90, v9, v85 | |
v_mul_f32_e32 v91, v90, v90 | |
v_mov_b32_e32 v92, 0x3a92b707 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v83, v70, v83 | |
v_mul_f32_e32 v84, v71, v84 | |
v_mul_f32_e32 v94, 0xbe2aaaab, v94 | |
v_madak_f32_e32 v92, v92, v91, 0x3ded3cb2 | |
v_mad_f32 v96, v84, v95, -v83 | |
v_mul_f32_e32 v97, v93, v95 | |
v_mad_f32 v95, v95, v95, s26 | |
v_mul_f32_e32 v83, v83, v94 | |
v_mov_b32_e32 v94, 0x3c739487 | |
v_mul_f32_e32 v84, v95, v84 | |
v_madak_f32_e32 v94, v94, v91, 0x3f01e2bc | |
v_mad_f32 v92, v92, v91, 1.0 | |
v_mac_f32_e32 v83, 0x3daaaaaa, v84 | |
v_mac_f32_e32 v92, v90, v94 | |
v_mov_b32_e32 v94, 0xb2951928 | |
v_mac_f32_e32 v8, v88, v83 | |
v_rcp_f32_e32 v83, v92 | |
v_madak_f32_e32 v94, v94, v91, 0xb85ffb93 | |
v_mov_b32_e32 v95, 0x35c55945 | |
v_madak_f32_e32 v95, v95, v91, 0x3a83ca0c | |
v_madak_f32_e32 v94, v94, v91, 0xbc9ded90 | |
v_madak_f32_e32 v95, v95, v91, 0x3d8eaf3b | |
v_madak_f32_e32 v91, v94, v91, 0xbf409397 | |
v_mac_f32_e32 v91, v90, v95 | |
v_mul_f32_e32 v83, v37, v83 | |
v_mul_f32_e32 v84, v88, v93 | |
v_mul_f32_e32 v83, v91, v83 | |
v_mul_f32_e32 v96, v96, v97 | |
v_mac_f32_e32 v83, v89, v84 | |
v_mac_f32_e32 v96, v83, v87 | |
v_mul_f32_e32 v83, s18, v85 | |
v_mul_f32_e32 v83, v89, v83 | |
v_and_b32_e32 v84, s27, v83 | |
v_mov_b32_e32 v85, 0x3fa00000 | |
v_cmp_gt_f32_e64 s[4:5], v85, v84 | |
v_mul_f32_e32 v85, v84, v84 | |
v_rcp_f32_e32 v90, v85 | |
v_add_f32_e32 v91, -1.0, v84 | |
v_mov_b32_e32 v93, 0xbd777f97 | |
v_mov_b32_e32 v94, 0x4036db6e | |
v_cndmask_b32_e64 v90, v90, v91, s[4:5] | |
v_mov_b32_e32 v91, 0x3f580000 | |
v_cmp_gt_f32_e64 s[8:9], v91, v84 | |
v_cndmask_b32_e64 v85, v90, v85, s[8:9] | |
v_mov_b32_e32 v91, 0xc1b38712 | |
v_madak_f32_e32 v93, v93, v85, 0x40d23f7c | |
v_madak_f32_e32 v91, v91, v85, 0x43ed43a7 | |
v_madak_f32_e32 v93, v85, v93, 0x42d9451f | |
v_madak_f32_e32 v91, v85, v91, 0x451f90ce | |
v_madak_f32_e32 v93, v85, v93, 0x43d6810b | |
v_madak_f32_e32 v91, v85, v91, 0x4547fdbb | |
v_madak_f32_e32 v93, v85, v93, 0x442158c9 | |
v_madak_f32_e32 v91, v85, v91, 0x44c01759 | |
v_madak_f32_e32 v93, v85, v93, 0x43d9486f | |
v_madak_f32_e32 v91, v85, v91, 0x43a2e571 | |
v_madak_f32_e32 v93, v85, v93, 0x4309a863 | |
v_cmp_gt_f32_e32 vcc, v94, v84 | |
v_madak_f32_e32 v91, v85, v91, 0x41f2b459 | |
v_madak_f32_e32 v93, v85, v93, 0x419d35ce | |
v_cndmask_b32_e32 v91, v91, v93, vcc | |
v_mov_b32_e32 v93, 0x3c445aa3 | |
v_madak_f32_e32 v93, v93, v85, 0x3c5f6e13 | |
v_madak_f32_e32 v93, v85, v93, 0x3e013307 | |
v_madak_f32_e32 v93, v85, v93, 0x3d931ae7 | |
v_madak_f32_e32 v93, v85, v93, 0x3f0a5785 | |
v_mov_b32_e32 v92, 0xc11d077e | |
v_madak_f32_e32 v93, v85, v93, 0x3dd9f331 | |
v_mov_b32_e32 v90, 0xc3f1c275 | |
v_madak_f32_e32 v92, v92, v85, 0xc2a2932b | |
v_cndmask_b32_e64 v91, v91, v93, s[4:5] | |
v_mov_b32_e32 v93, 0xb684e21a | |
v_madak_f32_e32 v90, v90, v85, 0xc480230b | |
v_madak_f32_e32 v92, v85, v92, 0xc3389ae7 | |
v_madak_f32_e32 v93, v93, v85, 0x390aee49 | |
v_madak_f32_e32 v90, v85, v90, 0xc41f6441 | |
v_madak_f32_e32 v92, v85, v92, 0xc322658c | |
v_madak_f32_e32 v93, v85, v93, 0x3ba68116 | |
v_madak_f32_e32 v90, v85, v90, 0xc320a2ea | |
v_madak_f32_e32 v92, v85, v92, 0xc2798057 | |
v_madak_f32_e32 v93, v85, v93, 0x3d852a63 | |
v_madak_f32_e32 v90, v85, v90, 0xc18e104b | |
v_madak_f32_e32 v92, v85, v92, 0xc128f022 | |
v_madak_f32_e32 v93, v85, v93, 0x3ecbbbce | |
v_madak_f32_e32 v90, v85, v90, 0xbf4c9dd4 | |
v_madak_f32_e32 v92, v85, v92, 0xbf31a0b7 | |
v_cndmask_b32_e64 v91, v91, v93, s[8:9] | |
v_and_b32_e32 v93, s50, v83 | |
v_mov_b32_e32 v94, 0xbf100000 | |
v_madak_f32_e32 v90, v85, v90, 0xbc21a092 | |
v_madak_f32_e32 v92, v85, v92, 0xbc21a093 | |
v_mad_f32 v94, v93, -v93, v94 | |
v_cndmask_b32_e32 v90, v90, v92, vcc | |
v_cmp_gt_f32_e32 vcc, 0, v94 | |
v_cndmask_b32_e64 v95, 0.5, -0.5, vcc | |
v_mov_b32_e32 v97, 0x3fb8aa3b | |
v_mac_f32_e32 v95, v97, v94 | |
v_cvt_i32_f32_e32 v95, v95 | |
v_mov_b32_e32 v92, 0xbb0df9c0 | |
v_madak_f32_e32 v92, v92, v85, 0x3d1151b3 | |
v_madak_f32_e32 v92, v85, v92, 0xbde31cc2 | |
v_cvt_f32_i32_e32 v100, v95 | |
v_madak_f32_e32 v92, v85, v92, 0x3ea2fe54 | |
v_mov_b32_e32 v101, 0xbf317180 | |
v_madak_f32_e32 v92, v85, v92, 0xbebe9208 | |
v_mad_f32 v102, v101, v100, v94 | |
v_mov_b32_e32 v103, 0xb717f7d1 | |
v_madak_f32_e32 v92, v85, v92, 0x3ed46805 | |
v_mad_f32 v104, v103, v100, v102 | |
v_madak_f32_e32 v92, v85, v92, 0xbb1acdc6 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mov_b32_e32 v106, 0xb5ddea0e | |
v_mov_b32_e32 v107, 0x3331bb4c | |
v_cndmask_b32_e64 v90, v90, v92, s[4:5] | |
v_mov_b32_e32 v92, 0xb7c756b1 | |
v_mad_f32 v108, v107, v105, v106 | |
v_mov_b32_e32 v109, 0x388ab355 | |
v_madak_f32_e32 v92, v92, v85, 0xbbbd1489 | |
v_mad_f32 v108, v108, v105, v109 | |
v_mov_b32_e32 v110, 0xbb360b61 | |
v_madak_f32_e32 v92, v85, v92, 0xbce9528f | |
v_mad_f32 v108, v108, v105, v110 | |
v_mov_b32_e32 v111, 0x3e2aaaab | |
v_madak_f32_e32 v92, v85, v92, 0xbea66beb | |
v_mad_f32 v108, v108, v105, v111 | |
v_madak_f32_e32 v92, v85, v92, 0x3e0375d4 | |
v_mad_f32 v105, -v105, v108, v104 | |
v_mad_f32 v85, v85, v91, 1.0 | |
v_mov_b32_e32 v91, 0x6f800000 | |
v_cmp_gt_f32_e64 vcc, |v85|, v91 | |
v_mov_b32_e32 v98, 0x2f800000 | |
v_sub_f32_e32 v108, 2.0, v105 | |
v_cndmask_b32_e32 v99, 1.0, v98, vcc | |
v_cmp_gt_f32_e64 vcc, |v108|, v91 | |
v_mul_f32_e32 v85, v99, v85 | |
v_cndmask_b32_e32 v112, 1.0, v98, vcc | |
v_mul_f32_e64 v108, v108, -v112 | |
v_rcp_f32_e32 v85, v85 | |
v_rcp_f32_e32 v108, v108 | |
v_cndmask_b32_e64 v90, v90, v92, s[8:9] | |
v_lshlrev_b32_e32 v92, 23, v95 | |
v_mul_f32_e32 v85, v85, v90 | |
v_mul_f32_e32 v90, v105, v104 | |
v_mul_f32_e32 v90, v108, v90 | |
v_mul_f32_e32 v90, v90, v112 | |
v_mad_f32 v90, -v100, v103, v90 | |
v_subrev_f32_e32 v90, v102, v90 | |
v_sub_f32_e32 v90, 1.0, v90 | |
v_add_i32_e32 v90, vcc, v90, v92 | |
v_mov_b32_e32 v92, 0xc2aeac4f | |
v_cmp_nlt_f32_e32 vcc, v94, v92 | |
v_mov_b32_e32 v95, 0x42b17218 | |
v_cndmask_b32_e32 v90, 0, v90, vcc | |
v_cmp_lt_f32_e32 vcc, v94, v95 | |
v_mov_b32_e32 v100, 0x7f800000 | |
v_cndmask_b32_e32 v90, v100, v90, vcc | |
v_cmp_u_f32_e32 vcc, v94, v94 | |
v_cndmask_b32_e32 v90, v90, v94, vcc | |
v_subrev_f32_e32 v94, v84, v93 | |
v_mul_f32_e32 v102, v85, v99 | |
v_add_f32_e32 v93, v84, v93 | |
v_mad_f32 v93, v93, v94, v102 | |
v_cmp_gt_f32_e32 vcc, 0, v93 | |
v_cndmask_b32_e64 v94, 0.5, -0.5, vcc | |
v_mac_f32_e32 v94, v97, v93 | |
v_cvt_i32_f32_e32 v94, v94 | |
v_madak_f32_e32 v85, v99, v85, 0x3f58560b | |
v_mad_f32 v82, -v96, v81, v82 | |
v_mad_f32 v22, v81, v96, v22 | |
v_cvt_f32_i32_e32 v97, v94 | |
v_lshlrev_b32_e32 v94, 23, v94 | |
v_mad_f32 v79, -v96, v80, v79 | |
v_mad_f32 v21, v80, v96, v21 | |
v_mad_f32 v101, v101, v97, v93 | |
v_mad_f32 v104, v103, v97, v101 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mac_f32_e32 v106, v107, v105 | |
v_mac_f32_e32 v109, v106, v105 | |
v_mac_f32_e32 v110, v109, v105 | |
v_mac_f32_e32 v111, v110, v105 | |
v_mad_f32 v105, -v105, v111, v104 | |
v_mul_f32_e32 v104, v105, v104 | |
v_sub_f32_e32 v105, 2.0, v105 | |
v_cmp_gt_f32_e64 vcc, |v105|, v91 | |
v_cndmask_b32_e32 v106, 1.0, v98, vcc | |
v_mul_f32_e64 v105, v105, -v106 | |
v_rcp_f32_e32 v105, v105 | |
v_mad_f32 v76, -v96, v78, v76 | |
v_mad_f32 v20, v78, v96, v20 | |
v_mad_f32 v86, -v67, v0, v86 | |
v_mul_f32_e32 v104, v105, v104 | |
v_mul_f32_e32 v104, v104, v106 | |
v_mad_f32 v97, -v97, v103, v104 | |
v_subrev_f32_e32 v97, v101, v97 | |
v_sub_f32_e32 v97, 1.0, v97 | |
v_add_i32_e32 v94, vcc, v97, v94 | |
v_cmp_nlt_f32_e32 vcc, v93, v92 | |
v_cndmask_b32_e32 v92, 0, v94, vcc | |
v_cmp_lt_f32_e32 vcc, v93, v95 | |
v_cndmask_b32_e32 v92, v100, v92, vcc | |
v_cmp_gt_f32_e64 vcc, |v84|, v91 | |
v_cndmask_b32_e32 v91, 1.0, v98, vcc | |
v_mul_f32_e32 v94, v91, v84 | |
v_rcp_f32_e32 v94, v94 | |
v_cmp_u_f32_e32 vcc, v93, v93 | |
v_cndmask_b32_e32 v92, v92, v93, vcc | |
v_mul_f32_e32 v90, v92, v90 | |
v_mov_b32_e32 v92, 0x40c00000 | |
v_cmp_gt_f32_e32 vcc, v92, v84 | |
v_mov_b32_e32 v92, 0x31800000 | |
v_mul_f32_e32 v90, v94, v90 | |
v_cmp_gt_f32_e64 s[10:11], v92, v84 | |
v_mad_f32 v84, -v91, v90, 1.0 | |
v_cndmask_b32_e32 v84, 1.0, v84, vcc | |
v_cndmask_b32_e64 v84, v84, v85, s[4:5] | |
v_and_b32_e32 v85, s51, v83 | |
v_or_b32_e32 v84, v85, v84 | |
v_mad_f32 v85, v102, v83, v83 | |
v_cndmask_b32_e64 v84, v84, v85, s[8:9] | |
v_mul_f32_e32 v85, 0x3f8375d4, v83 | |
v_mac_f32_e32 v85, 0x41000000, v83 | |
v_mul_f32_e32 v85, 0x3e000000, v85 | |
v_cndmask_b32_e64 v84, v84, v85, s[10:11] | |
v_cmp_u_f32_e32 vcc, v83, v83 | |
v_cndmask_b32_e32 v83, v84, v83, vcc | |
v_subrev_f32_e32 v83, v83, v88 | |
v_mul_f32_e64 v84, s19, -v88 | |
v_mac_f32_e32 v84, v83, v89 | |
v_mac_f32_e32 v5, v84, v87 | |
v_mac_f32_e32 v23, v0, v67 | |
BB6_118: ; %Flow1226 | |
; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[34:35] | |
BB6_119: ; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[12:13] | |
v_lshrrev_b32_e32 v67, 22, v69 | |
v_and_b32_e32 v67, 1, v67 | |
v_cmp_eq_u32_e32 vcc, 1, v67 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[12:13], exec, s[4:5] | |
; mask branch BB6_123 | |
s_cbranch_execz BB6_123 | |
BB6_120: ; in Loop: Header=BB6_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[87:90], v55 offset0:96 offset1:97 | |
s_and_b64 s[4:5], exec, s[0:1] | |
v_cmp_ne_u32_e32 vcc, v59, v77 | |
s_xor_b64 s[4:5], s[4:5], -1 | |
s_or_b64 s[4:5], s[4:5], vcc | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v80, v73, v88 | |
s_and_b64 s[8:9], exec, s[2:3] | |
s_or_b64 s[4:5], s[8:9], s[4:5] | |
v_subrev_f32_e32 v78, v72, v87 | |
v_mul_f32_e32 v83, v80, v80 | |
v_cndmask_b32_e64 v84, 0, 1.0, s[4:5] | |
v_subrev_f32_e32 v81, v74, v89 | |
v_mac_f32_e32 v83, v78, v78 | |
v_mac_f32_e32 v83, v81, v81 | |
v_mul_f32_e32 v84, s22, v84 | |
v_subrev_f32_e32 v67, v75, v90 | |
v_cmp_lt_f32_e32 vcc, v83, v84 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[34:35], exec, s[4:5] | |
; mask branch BB6_122 | |
s_cbranch_execz BB6_122 | |
BB6_121: ; in Loop: Header=BB6_11 Depth=1 | |
v_max_f32_e32 v85, 0x34cd15ae, v83 | |
v_rsq_f32_e32 v89, v85 | |
v_lshrrev_b32_e32 v83, 22, v65 | |
v_and_b32_e32 v83, 1, v83 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e32 vcc, 1, v83 | |
ds_read_b64 v[83:84], v56 offset:384 | |
v_mul_f32_e32 v93, v89, v89 | |
v_cndmask_b32_e64 v88, 0, 1.0, vcc | |
v_mul_f32_e32 v94, v93, v93 | |
v_mul_f32_e32 v94, v88, v94 | |
v_mul_f32_e32 v95, v93, v94 | |
v_mad_f32 v94, v94, v93, s23 | |
v_mul_f32_e32 v87, v75, v90 | |
v_mul_f32_e32 v90, v9, v85 | |
v_mul_f32_e32 v91, v90, v90 | |
v_mov_b32_e32 v92, 0x3a92b707 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v83, v70, v83 | |
v_mul_f32_e32 v84, v71, v84 | |
v_mul_f32_e32 v94, 0xbe2aaaab, v94 | |
v_madak_f32_e32 v92, v92, v91, 0x3ded3cb2 | |
v_mad_f32 v96, v84, v95, -v83 | |
v_mul_f32_e32 v97, v93, v95 | |
v_mad_f32 v95, v95, v95, s26 | |
v_mul_f32_e32 v83, v83, v94 | |
v_mov_b32_e32 v94, 0x3c739487 | |
v_mul_f32_e32 v84, v95, v84 | |
v_madak_f32_e32 v94, v94, v91, 0x3f01e2bc | |
v_mad_f32 v92, v92, v91, 1.0 | |
v_mac_f32_e32 v83, 0x3daaaaaa, v84 | |
v_mac_f32_e32 v92, v90, v94 | |
v_mov_b32_e32 v94, 0xb2951928 | |
v_mac_f32_e32 v8, v88, v83 | |
v_rcp_f32_e32 v83, v92 | |
v_madak_f32_e32 v94, v94, v91, 0xb85ffb93 | |
v_mov_b32_e32 v95, 0x35c55945 | |
v_madak_f32_e32 v95, v95, v91, 0x3a83ca0c | |
v_madak_f32_e32 v94, v94, v91, 0xbc9ded90 | |
v_madak_f32_e32 v95, v95, v91, 0x3d8eaf3b | |
v_madak_f32_e32 v91, v94, v91, 0xbf409397 | |
v_mac_f32_e32 v91, v90, v95 | |
v_mul_f32_e32 v83, v37, v83 | |
v_mul_f32_e32 v84, v88, v93 | |
v_mul_f32_e32 v83, v91, v83 | |
v_mul_f32_e32 v96, v96, v97 | |
v_mac_f32_e32 v83, v89, v84 | |
v_mac_f32_e32 v96, v83, v87 | |
v_mul_f32_e32 v83, s18, v85 | |
v_mul_f32_e32 v83, v89, v83 | |
v_and_b32_e32 v84, s27, v83 | |
v_mov_b32_e32 v85, 0x3fa00000 | |
v_cmp_gt_f32_e64 s[4:5], v85, v84 | |
v_mul_f32_e32 v85, v84, v84 | |
v_rcp_f32_e32 v90, v85 | |
v_add_f32_e32 v91, -1.0, v84 | |
v_mov_b32_e32 v93, 0xbd777f97 | |
v_mov_b32_e32 v94, 0x4036db6e | |
v_cndmask_b32_e64 v90, v90, v91, s[4:5] | |
v_mov_b32_e32 v91, 0x3f580000 | |
v_cmp_gt_f32_e64 s[8:9], v91, v84 | |
v_cndmask_b32_e64 v85, v90, v85, s[8:9] | |
v_mov_b32_e32 v91, 0xc1b38712 | |
v_madak_f32_e32 v93, v93, v85, 0x40d23f7c | |
v_madak_f32_e32 v91, v91, v85, 0x43ed43a7 | |
v_madak_f32_e32 v93, v85, v93, 0x42d9451f | |
v_madak_f32_e32 v91, v85, v91, 0x451f90ce | |
v_madak_f32_e32 v93, v85, v93, 0x43d6810b | |
v_madak_f32_e32 v91, v85, v91, 0x4547fdbb | |
v_madak_f32_e32 v93, v85, v93, 0x442158c9 | |
v_madak_f32_e32 v91, v85, v91, 0x44c01759 | |
v_madak_f32_e32 v93, v85, v93, 0x43d9486f | |
v_madak_f32_e32 v91, v85, v91, 0x43a2e571 | |
v_madak_f32_e32 v93, v85, v93, 0x4309a863 | |
v_cmp_gt_f32_e32 vcc, v94, v84 | |
v_madak_f32_e32 v91, v85, v91, 0x41f2b459 | |
v_madak_f32_e32 v93, v85, v93, 0x419d35ce | |
v_cndmask_b32_e32 v91, v91, v93, vcc | |
v_mov_b32_e32 v93, 0x3c445aa3 | |
v_madak_f32_e32 v93, v93, v85, 0x3c5f6e13 | |
v_madak_f32_e32 v93, v85, v93, 0x3e013307 | |
v_madak_f32_e32 v93, v85, v93, 0x3d931ae7 | |
v_madak_f32_e32 v93, v85, v93, 0x3f0a5785 | |
v_mov_b32_e32 v92, 0xc11d077e | |
v_madak_f32_e32 v93, v85, v93, 0x3dd9f331 | |
v_mov_b32_e32 v90, 0xc3f1c275 | |
v_madak_f32_e32 v92, v92, v85, 0xc2a2932b | |
v_cndmask_b32_e64 v91, v91, v93, s[4:5] | |
v_mov_b32_e32 v93, 0xb684e21a | |
v_madak_f32_e32 v90, v90, v85, 0xc480230b | |
v_madak_f32_e32 v92, v85, v92, 0xc3389ae7 | |
v_madak_f32_e32 v93, v93, v85, 0x390aee49 | |
v_madak_f32_e32 v90, v85, v90, 0xc41f6441 | |
v_madak_f32_e32 v92, v85, v92, 0xc322658c | |
v_madak_f32_e32 v93, v85, v93, 0x3ba68116 | |
v_madak_f32_e32 v90, v85, v90, 0xc320a2ea | |
v_madak_f32_e32 v92, v85, v92, 0xc2798057 | |
v_madak_f32_e32 v93, v85, v93, 0x3d852a63 | |
v_madak_f32_e32 v90, v85, v90, 0xc18e104b | |
v_madak_f32_e32 v92, v85, v92, 0xc128f022 | |
v_madak_f32_e32 v93, v85, v93, 0x3ecbbbce | |
v_madak_f32_e32 v90, v85, v90, 0xbf4c9dd4 | |
v_madak_f32_e32 v92, v85, v92, 0xbf31a0b7 | |
v_cndmask_b32_e64 v91, v91, v93, s[8:9] | |
v_and_b32_e32 v93, s50, v83 | |
v_mov_b32_e32 v94, 0xbf100000 | |
v_madak_f32_e32 v90, v85, v90, 0xbc21a092 | |
v_madak_f32_e32 v92, v85, v92, 0xbc21a093 | |
v_mad_f32 v94, v93, -v93, v94 | |
v_cndmask_b32_e32 v90, v90, v92, vcc | |
v_cmp_gt_f32_e32 vcc, 0, v94 | |
v_cndmask_b32_e64 v95, 0.5, -0.5, vcc | |
v_mov_b32_e32 v97, 0x3fb8aa3b | |
v_mac_f32_e32 v95, v97, v94 | |
v_cvt_i32_f32_e32 v95, v95 | |
v_mov_b32_e32 v92, 0xbb0df9c0 | |
v_madak_f32_e32 v92, v92, v85, 0x3d1151b3 | |
v_madak_f32_e32 v92, v85, v92, 0xbde31cc2 | |
v_cvt_f32_i32_e32 v100, v95 | |
v_madak_f32_e32 v92, v85, v92, 0x3ea2fe54 | |
v_mov_b32_e32 v101, 0xbf317180 | |
v_madak_f32_e32 v92, v85, v92, 0xbebe9208 | |
v_mad_f32 v102, v101, v100, v94 | |
v_mov_b32_e32 v103, 0xb717f7d1 | |
v_madak_f32_e32 v92, v85, v92, 0x3ed46805 | |
v_mad_f32 v104, v103, v100, v102 | |
v_madak_f32_e32 v92, v85, v92, 0xbb1acdc6 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mov_b32_e32 v106, 0xb5ddea0e | |
v_mov_b32_e32 v107, 0x3331bb4c | |
v_cndmask_b32_e64 v90, v90, v92, s[4:5] | |
v_mov_b32_e32 v92, 0xb7c756b1 | |
v_mad_f32 v108, v107, v105, v106 | |
v_mov_b32_e32 v109, 0x388ab355 | |
v_madak_f32_e32 v92, v92, v85, 0xbbbd1489 | |
v_mad_f32 v108, v108, v105, v109 | |
v_mov_b32_e32 v110, 0xbb360b61 | |
v_madak_f32_e32 v92, v85, v92, 0xbce9528f | |
v_mad_f32 v108, v108, v105, v110 | |
v_mov_b32_e32 v111, 0x3e2aaaab | |
v_madak_f32_e32 v92, v85, v92, 0xbea66beb | |
v_mad_f32 v108, v108, v105, v111 | |
v_madak_f32_e32 v92, v85, v92, 0x3e0375d4 | |
v_mad_f32 v105, -v105, v108, v104 | |
v_mad_f32 v85, v85, v91, 1.0 | |
v_mov_b32_e32 v91, 0x6f800000 | |
v_cmp_gt_f32_e64 vcc, |v85|, v91 | |
v_mov_b32_e32 v98, 0x2f800000 | |
v_sub_f32_e32 v108, 2.0, v105 | |
v_cndmask_b32_e32 v99, 1.0, v98, vcc | |
v_cmp_gt_f32_e64 vcc, |v108|, v91 | |
v_mul_f32_e32 v85, v99, v85 | |
v_cndmask_b32_e32 v112, 1.0, v98, vcc | |
v_mul_f32_e64 v108, v108, -v112 | |
v_rcp_f32_e32 v85, v85 | |
v_rcp_f32_e32 v108, v108 | |
v_cndmask_b32_e64 v90, v90, v92, s[8:9] | |
v_lshlrev_b32_e32 v92, 23, v95 | |
v_mul_f32_e32 v85, v85, v90 | |
v_mul_f32_e32 v90, v105, v104 | |
v_mul_f32_e32 v90, v108, v90 | |
v_mul_f32_e32 v90, v90, v112 | |
v_mad_f32 v90, -v100, v103, v90 | |
v_subrev_f32_e32 v90, v102, v90 | |
v_sub_f32_e32 v90, 1.0, v90 | |
v_add_i32_e32 v90, vcc, v90, v92 | |
v_mov_b32_e32 v92, 0xc2aeac4f | |
v_cmp_nlt_f32_e32 vcc, v94, v92 | |
v_mov_b32_e32 v95, 0x42b17218 | |
v_cndmask_b32_e32 v90, 0, v90, vcc | |
v_cmp_lt_f32_e32 vcc, v94, v95 | |
v_mov_b32_e32 v100, 0x7f800000 | |
v_cndmask_b32_e32 v90, v100, v90, vcc | |
v_cmp_u_f32_e32 vcc, v94, v94 | |
v_cndmask_b32_e32 v90, v90, v94, vcc | |
v_subrev_f32_e32 v94, v84, v93 | |
v_mul_f32_e32 v102, v85, v99 | |
v_add_f32_e32 v93, v84, v93 | |
v_mad_f32 v93, v93, v94, v102 | |
v_cmp_gt_f32_e32 vcc, 0, v93 | |
v_cndmask_b32_e64 v94, 0.5, -0.5, vcc | |
v_mac_f32_e32 v94, v97, v93 | |
v_cvt_i32_f32_e32 v94, v94 | |
v_madak_f32_e32 v85, v99, v85, 0x3f58560b | |
v_mad_f32 v82, -v96, v81, v82 | |
v_mad_f32 v18, v81, v96, v18 | |
v_cvt_f32_i32_e32 v97, v94 | |
v_lshlrev_b32_e32 v94, 23, v94 | |
v_mad_f32 v79, -v96, v80, v79 | |
v_mad_f32 v17, v80, v96, v17 | |
v_mad_f32 v101, v101, v97, v93 | |
v_mad_f32 v104, v103, v97, v101 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mac_f32_e32 v106, v107, v105 | |
v_mac_f32_e32 v109, v106, v105 | |
v_mac_f32_e32 v110, v109, v105 | |
v_mac_f32_e32 v111, v110, v105 | |
v_mad_f32 v105, -v105, v111, v104 | |
v_mul_f32_e32 v104, v105, v104 | |
v_sub_f32_e32 v105, 2.0, v105 | |
v_cmp_gt_f32_e64 vcc, |v105|, v91 | |
v_cndmask_b32_e32 v106, 1.0, v98, vcc | |
v_mul_f32_e64 v105, v105, -v106 | |
v_rcp_f32_e32 v105, v105 | |
v_mad_f32 v76, -v96, v78, v76 | |
v_mad_f32 v16, v78, v96, v16 | |
v_mad_f32 v86, -v67, v0, v86 | |
v_mul_f32_e32 v104, v105, v104 | |
v_mul_f32_e32 v104, v104, v106 | |
v_mad_f32 v97, -v97, v103, v104 | |
v_subrev_f32_e32 v97, v101, v97 | |
v_sub_f32_e32 v97, 1.0, v97 | |
v_add_i32_e32 v94, vcc, v97, v94 | |
v_cmp_nlt_f32_e32 vcc, v93, v92 | |
v_cndmask_b32_e32 v92, 0, v94, vcc | |
v_cmp_lt_f32_e32 vcc, v93, v95 | |
v_cndmask_b32_e32 v92, v100, v92, vcc | |
v_cmp_gt_f32_e64 vcc, |v84|, v91 | |
v_cndmask_b32_e32 v91, 1.0, v98, vcc | |
v_mul_f32_e32 v94, v91, v84 | |
v_rcp_f32_e32 v94, v94 | |
v_cmp_u_f32_e32 vcc, v93, v93 | |
v_cndmask_b32_e32 v92, v92, v93, vcc | |
v_mul_f32_e32 v90, v92, v90 | |
v_mov_b32_e32 v92, 0x40c00000 | |
v_cmp_gt_f32_e32 vcc, v92, v84 | |
v_mov_b32_e32 v92, 0x31800000 | |
v_mul_f32_e32 v90, v94, v90 | |
v_cmp_gt_f32_e64 s[10:11], v92, v84 | |
v_mad_f32 v84, -v91, v90, 1.0 | |
v_cndmask_b32_e32 v84, 1.0, v84, vcc | |
v_cndmask_b32_e64 v84, v84, v85, s[4:5] | |
v_and_b32_e32 v85, s51, v83 | |
v_or_b32_e32 v84, v85, v84 | |
v_mad_f32 v85, v102, v83, v83 | |
v_cndmask_b32_e64 v84, v84, v85, s[8:9] | |
v_mul_f32_e32 v85, 0x3f8375d4, v83 | |
v_mac_f32_e32 v85, 0x41000000, v83 | |
v_mul_f32_e32 v85, 0x3e000000, v85 | |
v_cndmask_b32_e64 v84, v84, v85, s[10:11] | |
v_cmp_u_f32_e32 vcc, v83, v83 | |
v_cndmask_b32_e32 v83, v84, v83, vcc | |
v_subrev_f32_e32 v83, v83, v88 | |
v_mul_f32_e64 v84, s19, -v88 | |
v_mac_f32_e32 v84, v83, v89 | |
v_mac_f32_e32 v5, v84, v87 | |
v_mac_f32_e32 v19, v0, v67 | |
BB6_122: ; %Flow1225 | |
; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[34:35] | |
BB6_123: ; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[12:13] | |
v_lshrrev_b32_e32 v67, 23, v69 | |
v_and_b32_e32 v67, 1, v67 | |
v_cmp_eq_u32_e32 vcc, 1, v67 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[34:35], exec, s[4:5] | |
; mask branch BB6_127 | |
s_cbranch_execz BB6_127 | |
BB6_124: ; in Loop: Header=BB6_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[87:90], v55 offset0:112 offset1:113 | |
s_and_b64 s[4:5], exec, s[0:1] | |
v_cmp_ne_u32_e32 vcc, v38, v77 | |
s_xor_b64 s[4:5], s[4:5], -1 | |
s_or_b64 s[4:5], s[4:5], vcc | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v73, v73, v88 | |
s_and_b64 s[8:9], exec, s[2:3] | |
s_or_b64 s[4:5], s[8:9], s[4:5] | |
v_subrev_f32_e32 v72, v72, v87 | |
v_mul_f32_e32 v77, v73, v73 | |
v_cndmask_b32_e64 v78, 0, 1.0, s[4:5] | |
v_subrev_f32_e32 v74, v74, v89 | |
v_mac_f32_e32 v77, v72, v72 | |
v_mac_f32_e32 v77, v74, v74 | |
v_mul_f32_e32 v78, s22, v78 | |
v_subrev_f32_e32 v67, v75, v90 | |
v_cmp_lt_f32_e32 vcc, v77, v78 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[38:39], exec, s[4:5] | |
; mask branch BB6_126 | |
s_cbranch_execz BB6_126 | |
BB6_125: ; in Loop: Header=BB6_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read_b64 v[80:81], v56 offset:448 | |
v_max_f32_e32 v77, 0x34cd15ae, v77 | |
v_mul_f32_e32 v84, v9, v77 | |
v_mad_f32 v83, -v67, v0, v86 | |
v_mov_b32_e32 v85, 0x3c739487 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v70, v70, v80 | |
v_mul_f32_e32 v71, v71, v81 | |
v_mul_f32_e32 v80, v84, v84 | |
v_mov_b32_e32 v81, 0x3a92b707 | |
v_madak_f32_e32 v81, v81, v80, 0x3ded3cb2 | |
v_rsq_f32_e32 v83, v77 | |
v_madak_f32_e32 v85, v85, v80, 0x3f01e2bc | |
v_mad_f32 v81, v81, v80, 1.0 | |
v_mac_f32_e32 v81, v84, v85 | |
v_mov_b32_e32 v85, 0xb2951928 | |
v_lshrrev_b32_e32 v78, 23, v65 | |
v_madak_f32_e32 v85, v85, v80, 0xb85ffb93 | |
v_mov_b32_e32 v86, 0x35c55945 | |
v_madak_f32_e32 v86, v86, v80, 0x3a83ca0c | |
v_and_b32_e32 v78, 1, v78 | |
v_madak_f32_e32 v85, v85, v80, 0xbc9ded90 | |
v_cmp_eq_u32_e32 vcc, 1, v78 | |
v_madak_f32_e32 v86, v86, v80, 0x3d8eaf3b | |
v_madak_f32_e32 v80, v85, v80, 0xbf409397 | |
v_mul_f32_e32 v85, v83, v83 | |
v_mac_f32_e32 v80, v84, v86 | |
v_cndmask_b32_e64 v78, 0, 1.0, vcc | |
v_mul_f32_e32 v84, v85, v85 | |
v_rcp_f32_e32 v81, v81 | |
v_mul_f32_e32 v84, v78, v84 | |
v_mul_f32_e32 v86, v85, v84 | |
v_mad_f32 v84, v84, v85, s23 | |
v_mad_f32 v87, v86, v86, s26 | |
v_mul_f32_e32 v84, 0xbe2aaaab, v84 | |
v_mul_f32_e32 v84, v70, v84 | |
v_mul_f32_e32 v87, v87, v71 | |
v_mul_f32_e32 v77, s18, v77 | |
v_mac_f32_e32 v84, 0x3daaaaaa, v87 | |
v_mul_f32_e32 v81, v37, v81 | |
v_mac_f32_e32 v8, v78, v84 | |
v_mul_f32_e32 v77, v83, v77 | |
v_mul_f32_e32 v80, v80, v81 | |
v_mul_f32_e32 v84, v78, v85 | |
v_mac_f32_e32 v80, v83, v84 | |
v_and_b32_e32 v81, s27, v77 | |
v_mov_b32_e32 v84, 0x3fa00000 | |
v_cmp_gt_f32_e64 s[4:5], v84, v81 | |
v_mul_f32_e32 v84, v81, v81 | |
v_rcp_f32_e32 v87, v84 | |
v_add_f32_e32 v88, -1.0, v81 | |
v_mov_b32_e32 v89, 0xbd777f97 | |
v_mul_f32_e32 v75, v75, v90 | |
v_cndmask_b32_e64 v87, v87, v88, s[4:5] | |
v_mov_b32_e32 v88, 0x3f580000 | |
v_cmp_gt_f32_e64 s[8:9], v88, v81 | |
v_cndmask_b32_e64 v84, v87, v84, s[8:9] | |
v_mov_b32_e32 v88, 0xc11d077e | |
v_mov_b32_e32 v87, 0x4036db6e | |
v_madak_f32_e32 v88, v88, v84, 0xc2a2932b | |
v_cmp_gt_f32_e32 vcc, v87, v81 | |
v_mov_b32_e32 v87, 0xc3f1c275 | |
v_madak_f32_e32 v87, v87, v84, 0xc480230b | |
v_madak_f32_e32 v88, v84, v88, 0xc3389ae7 | |
v_madak_f32_e32 v87, v84, v87, 0xc41f6441 | |
v_madak_f32_e32 v88, v84, v88, 0xc322658c | |
v_madak_f32_e32 v87, v84, v87, 0xc320a2ea | |
v_madak_f32_e32 v88, v84, v88, 0xc2798057 | |
v_madak_f32_e32 v87, v84, v87, 0xc18e104b | |
v_madak_f32_e32 v88, v84, v88, 0xc128f022 | |
v_madak_f32_e32 v87, v84, v87, 0xbf4c9dd4 | |
v_madak_f32_e32 v88, v84, v88, 0xbf31a0b7 | |
v_madak_f32_e32 v87, v84, v87, 0xbc21a092 | |
v_madak_f32_e32 v88, v84, v88, 0xbc21a093 | |
v_madak_f32_e32 v89, v89, v84, 0x40d23f7c | |
v_cndmask_b32_e32 v87, v87, v88, vcc | |
v_mov_b32_e32 v88, 0xc1b38712 | |
v_madak_f32_e32 v88, v88, v84, 0x43ed43a7 | |
v_madak_f32_e32 v89, v84, v89, 0x42d9451f | |
v_madak_f32_e32 v88, v84, v88, 0x451f90ce | |
v_madak_f32_e32 v89, v84, v89, 0x43d6810b | |
v_madak_f32_e32 v88, v84, v88, 0x4547fdbb | |
v_madak_f32_e32 v89, v84, v89, 0x442158c9 | |
v_madak_f32_e32 v88, v84, v88, 0x44c01759 | |
v_madak_f32_e32 v89, v84, v89, 0x43d9486f | |
v_madak_f32_e32 v88, v84, v88, 0x43a2e571 | |
v_madak_f32_e32 v89, v84, v89, 0x4309a863 | |
v_madak_f32_e32 v88, v84, v88, 0x41f2b459 | |
v_madak_f32_e32 v89, v84, v89, 0x419d35ce | |
v_cndmask_b32_e32 v88, v88, v89, vcc | |
v_mov_b32_e32 v89, 0xbb0df9c0 | |
v_madak_f32_e32 v89, v89, v84, 0x3d1151b3 | |
v_madak_f32_e32 v89, v84, v89, 0xbde31cc2 | |
v_madak_f32_e32 v89, v84, v89, 0x3ea2fe54 | |
v_madak_f32_e32 v89, v84, v89, 0xbebe9208 | |
v_madak_f32_e32 v89, v84, v89, 0x3ed46805 | |
v_madak_f32_e32 v89, v84, v89, 0xbb1acdc6 | |
v_cndmask_b32_e64 v87, v87, v89, s[4:5] | |
v_mov_b32_e32 v89, 0x3c445aa3 | |
v_madak_f32_e32 v89, v89, v84, 0x3c5f6e13 | |
v_madak_f32_e32 v89, v84, v89, 0x3e013307 | |
v_madak_f32_e32 v89, v84, v89, 0x3d931ae7 | |
v_madak_f32_e32 v89, v84, v89, 0x3f0a5785 | |
v_madak_f32_e32 v89, v84, v89, 0x3dd9f331 | |
v_cndmask_b32_e64 v88, v88, v89, s[4:5] | |
v_mov_b32_e32 v89, 0xb684e21a | |
v_madak_f32_e32 v89, v89, v84, 0x390aee49 | |
v_madak_f32_e32 v89, v84, v89, 0x3ba68116 | |
v_madak_f32_e32 v89, v84, v89, 0x3d852a63 | |
v_madak_f32_e32 v89, v84, v89, 0x3ecbbbce | |
v_cndmask_b32_e64 v88, v88, v89, s[8:9] | |
v_mad_f32 v88, v84, v88, 1.0 | |
v_mov_b32_e32 v90, 0x6f800000 | |
v_and_b32_e32 v93, s50, v77 | |
v_mov_b32_e32 v94, 0xbf100000 | |
v_cmp_gt_f32_e64 vcc, |v88|, v90 | |
v_mov_b32_e32 v91, 0x2f800000 | |
v_mad_f32 v94, v93, -v93, v94 | |
v_cndmask_b32_e32 v92, 1.0, v91, vcc | |
v_cmp_gt_f32_e32 vcc, 0, v94 | |
v_cndmask_b32_e64 v95, 0.5, -0.5, vcc | |
v_mov_b32_e32 v96, 0x3fb8aa3b | |
v_mac_f32_e32 v95, v96, v94 | |
v_mov_b32_e32 v89, 0xb7c756b1 | |
v_cvt_i32_f32_e32 v95, v95 | |
v_madak_f32_e32 v89, v89, v84, 0xbbbd1489 | |
v_madak_f32_e32 v89, v84, v89, 0xbce9528f | |
v_madak_f32_e32 v89, v84, v89, 0xbea66beb | |
v_mul_f32_e32 v88, v92, v88 | |
v_madak_f32_e32 v84, v84, v89, 0x3e0375d4 | |
v_rcp_f32_e32 v88, v88 | |
v_cvt_f32_i32_e32 v89, v95 | |
v_cndmask_b32_e64 v84, v87, v84, s[8:9] | |
v_mov_b32_e32 v87, 0xbf317180 | |
v_mul_f32_e32 v84, v88, v84 | |
v_mad_f32 v88, v87, v89, v94 | |
v_mov_b32_e32 v97, 0xb717f7d1 | |
v_mad_f32 v98, v97, v89, v88 | |
v_mul_f32_e32 v99, v98, v98 | |
v_mov_b32_e32 v100, 0xb5ddea0e | |
v_mov_b32_e32 v101, 0x3331bb4c | |
v_mad_f32 v102, v101, v99, v100 | |
v_mov_b32_e32 v103, 0x388ab355 | |
v_mad_f32 v102, v102, v99, v103 | |
v_mov_b32_e32 v104, 0xbb360b61 | |
v_mad_f32 v102, v102, v99, v104 | |
v_mov_b32_e32 v105, 0x3e2aaaab | |
v_mad_f32 v102, v102, v99, v105 | |
v_mad_f32 v99, -v99, v102, v98 | |
v_mul_f32_e32 v98, v99, v98 | |
v_sub_f32_e32 v99, 2.0, v99 | |
v_cmp_gt_f32_e64 vcc, |v99|, v90 | |
v_cndmask_b32_e32 v102, 1.0, v91, vcc | |
v_mul_f32_e64 v99, v99, -v102 | |
v_rcp_f32_e32 v99, v99 | |
v_mad_f32 v70, v71, v86, -v70 | |
v_mul_f32_e32 v71, v85, v86 | |
v_mul_f32_e32 v70, v70, v71 | |
v_mul_f32_e32 v98, v99, v98 | |
v_mul_f32_e32 v98, v98, v102 | |
v_mad_f32 v89, -v89, v97, v98 | |
v_subrev_f32_e32 v88, v88, v89 | |
v_lshlrev_b32_e32 v89, 23, v95 | |
v_sub_f32_e32 v88, 1.0, v88 | |
v_add_i32_e32 v88, vcc, v88, v89 | |
v_mov_b32_e32 v89, 0xc2aeac4f | |
v_cmp_nlt_f32_e32 vcc, v94, v89 | |
v_mov_b32_e32 v95, 0x42b17218 | |
v_cndmask_b32_e32 v88, 0, v88, vcc | |
v_cmp_lt_f32_e32 vcc, v94, v95 | |
v_mov_b32_e32 v98, 0x7f800000 | |
v_cndmask_b32_e32 v88, v98, v88, vcc | |
v_cmp_u_f32_e32 vcc, v94, v94 | |
v_cndmask_b32_e32 v88, v88, v94, vcc | |
v_subrev_f32_e32 v94, v81, v93 | |
v_mul_f32_e32 v99, v84, v92 | |
v_add_f32_e32 v93, v81, v93 | |
v_mad_f32 v93, v93, v94, v99 | |
v_cmp_gt_f32_e32 vcc, 0, v93 | |
v_cndmask_b32_e64 v94, 0.5, -0.5, vcc | |
v_mac_f32_e32 v94, v96, v93 | |
v_cvt_i32_f32_e32 v94, v94 | |
v_madak_f32_e32 v84, v92, v84, 0x3f58560b | |
v_mac_f32_e32 v70, v80, v75 | |
v_mad_f32 v82, -v70, v74, v82 | |
v_cvt_f32_i32_e32 v96, v94 | |
v_lshlrev_b32_e32 v94, 23, v94 | |
v_mad_f32 v15, v74, v70, v15 | |
v_mad_f32 v79, -v70, v73, v79 | |
v_mad_f32 v87, v87, v96, v93 | |
v_mad_f32 v102, v97, v96, v87 | |
v_mul_f32_e32 v106, v102, v102 | |
v_mac_f32_e32 v100, v101, v106 | |
v_mac_f32_e32 v103, v100, v106 | |
v_mac_f32_e32 v104, v103, v106 | |
v_mac_f32_e32 v105, v104, v106 | |
v_mad_f32 v100, -v106, v105, v102 | |
v_mul_f32_e32 v101, v100, v102 | |
v_sub_f32_e32 v100, 2.0, v100 | |
v_cmp_gt_f32_e64 vcc, |v100|, v90 | |
v_cndmask_b32_e32 v102, 1.0, v91, vcc | |
v_mul_f32_e64 v100, v100, -v102 | |
v_rcp_f32_e32 v100, v100 | |
v_mad_f32 v14, v73, v70, v14 | |
v_mad_f32 v76, -v70, v72, v76 | |
v_mad_f32 v13, v72, v70, v13 | |
v_mul_f32_e32 v100, v100, v101 | |
v_mul_f32_e32 v100, v100, v102 | |
v_mad_f32 v96, -v96, v97, v100 | |
v_subrev_f32_e32 v87, v87, v96 | |
v_sub_f32_e32 v87, 1.0, v87 | |
v_add_i32_e32 v87, vcc, v87, v94 | |
v_cmp_nlt_f32_e32 vcc, v93, v89 | |
v_cndmask_b32_e32 v87, 0, v87, vcc | |
v_cmp_lt_f32_e32 vcc, v93, v95 | |
v_cndmask_b32_e32 v87, v98, v87, vcc | |
v_cmp_u_f32_e32 vcc, v93, v93 | |
v_cndmask_b32_e32 v87, v87, v93, vcc | |
v_mul_f32_e32 v87, v87, v88 | |
v_mov_b32_e32 v88, 0x40c00000 | |
v_cmp_gt_f32_e64 s[10:11], v88, v81 | |
v_mov_b32_e32 v88, 0x31800000 | |
v_cmp_gt_f32_e64 vcc, |v81|, v90 | |
v_cmp_gt_f32_e64 s[12:13], v88, v81 | |
v_cndmask_b32_e32 v88, 1.0, v91, vcc | |
v_mul_f32_e32 v81, v88, v81 | |
v_rcp_f32_e32 v81, v81 | |
v_cmp_u_f32_e32 vcc, v77, v77 | |
v_mac_f32_e32 v68, v0, v67 | |
v_mul_f32_e32 v81, v81, v87 | |
v_mad_f32 v81, -v88, v81, 1.0 | |
v_cndmask_b32_e64 v81, 1.0, v81, s[10:11] | |
v_cndmask_b32_e64 v81, v81, v84, s[4:5] | |
v_and_b32_e32 v84, s51, v77 | |
v_or_b32_e32 v81, v84, v81 | |
v_mad_f32 v84, v99, v77, v77 | |
v_cndmask_b32_e64 v81, v81, v84, s[8:9] | |
v_mul_f32_e32 v84, 0x3f8375d4, v77 | |
v_mac_f32_e32 v84, 0x41000000, v77 | |
v_mul_f32_e32 v84, 0x3e000000, v84 | |
v_cndmask_b32_e64 v81, v81, v84, s[12:13] | |
v_cndmask_b32_e32 v77, v81, v77, vcc | |
v_subrev_f32_e32 v77, v77, v78 | |
v_mul_f32_e64 v78, s19, -v78 | |
v_mac_f32_e32 v78, v77, v83 | |
v_mac_f32_e32 v5, v78, v75 | |
BB6_126: ; %Flow1224 | |
; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[38:39] | |
BB6_127: ; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[34:35] | |
s_mov_b32 m0, -1 | |
v_cmp_gt_i32_e32 vcc, 3, v2 | |
ds_write_b32 v6, v76 | |
ds_write_b32 v7, v79 | |
ds_write_b32 v12, v82 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[4:5], exec, s[4:5] | |
s_waitcnt lgkmcnt(0) | |
; mask branch BB6_133 | |
s_cbranch_execz BB6_133 | |
BB6_128: ; in Loop: Header=BB6_11 Depth=1 | |
v_lshlrev_b32_e32 v70, 6, v2 | |
v_add_i32_e32 v67, vcc, v11, v70 | |
v_lshlrev_b32_e32 v67, 2, v67 | |
v_add_i32_e32 v71, vcc, s15, v67 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v67, v71 | |
v_add_i32_e32 v72, vcc, 8, v11 | |
v_or_b32_e32 v73, 1, v11 | |
v_cmp_lt_i32_e32 vcc, v73, v72 | |
s_and_saveexec_b64 s[8:9], vcc | |
s_xor_b64 s[8:9], exec, s[8:9] | |
s_waitcnt lgkmcnt(0) | |
; mask branch BB6_130 | |
s_cbranch_execz BB6_130 | |
BB6_129: ; in Loop: Header=BB6_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b32 v[72:73], v71 offset0:1 offset1:2 | |
v_or_b32_e32 v76, 3, v11 | |
v_add_i32_e32 v70, vcc, v76, v70 | |
v_lshlrev_b32_e32 v70, 2, v70 | |
ds_read2_b32 v[74:75], v71 offset0:3 offset1:4 | |
v_add_i32_e32 v70, vcc, s15, v70 | |
ds_read_b32 v77, v71 offset:28 | |
ds_read2_b32 v[70:71], v70 offset0:2 offset1:3 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v67, v67, v72 | |
v_add_f32_e32 v67, v73, v67 | |
v_add_f32_e32 v67, v74, v67 | |
v_add_f32_e32 v67, v75, v67 | |
v_add_f32_e32 v67, v70, v67 | |
v_add_f32_e32 v67, v71, v67 | |
v_add_f32_e32 v67, v77, v67 | |
BB6_130: ; %._crit_edge.i26 | |
; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[8:9] | |
v_mul_lo_i32 v66, v66, 3 | |
v_mov_b32_e32 v74, s29 | |
s_mov_b64 s[8:9], s[28:29] | |
s_mov_b64 s[10:11], s[46:47] | |
v_add_i32_e32 v70, vcc, v66, v2 | |
v_ashrrev_i32_e32 v71, 31, v70 | |
v_lshl_b64 v[72:73], v[70:71], 2 | |
v_add_i32_e32 v70, vcc, s28, v72 | |
v_addc_u32_e32 v71, vcc, v73, v74, vcc | |
buffer_load_dword v73, v[72:73], s[8:11], 0 addr64 | |
s_mov_b64 s[8:9], 0 | |
s_waitcnt vmcnt(0) | |
BB6_131: ; Parent Loop BB6_11 Depth=1 | |
; => This Inner Loop Header: Depth=2 | |
v_add_f32_e32 v72, v67, v73 | |
v_mov_b32_e32 v75, v73 | |
v_mov_b32_e32 v74, v72 | |
buffer_atomic_cmpswap v[74:75], v[70:71], s[44:47], 0 addr64 glc | |
v_mov_b32_e32 v66, -1 | |
v_mov_b32_e32 v66, 0xf000 | |
s_waitcnt vmcnt(0) expcnt(0) | |
v_cmp_eq_u32_e32 vcc, v74, v73 | |
s_or_b64 s[8:9], vcc, s[8:9] | |
v_mov_b32_e32 v73, v74 | |
s_andn2_b64 exec, exec, s[8:9] | |
s_cbranch_execnz BB6_131 | |
; BB#132: ; %Flow1222 | |
; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[8:9] | |
BB6_133: ; %Flow1223 | |
; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[4:5] | |
BB6_134: ; %Flow1232 | |
; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[30:31] | |
v_mov_b32_e32 v66, 0xffffff | |
v_cmp_lt_u32_e32 vcc, v66, v69 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[30:31], exec, s[4:5] | |
; mask branch BB6_174 | |
s_cbranch_execz BB6_174 | |
BB6_135: ; in Loop: Header=BB6_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v76, v54 offset:12 | |
s_mov_b64 s[8:9], s[32:33] | |
s_mov_b64 s[10:11], s[46:47] | |
s_waitcnt lgkmcnt(0) | |
v_lshlrev_b32_e32 v66, 3, v76 | |
v_add_i32_e32 v66, vcc, v66, v1 | |
v_ashrrev_i32_e32 v67, 31, v66 | |
v_lshl_b64 v[70:71], v[66:67], 4 | |
v_lshl_b64 v[77:78], v[66:67], 3 | |
buffer_load_dwordx4 v[72:75], v[70:71], s[8:11], 0 addr64 | |
s_mov_b64 s[8:9], s[36:37] | |
buffer_load_dwordx2 v[70:71], v[77:78], s[8:11], 0 addr64 | |
v_lshrrev_b32_e32 v67, 24, v69 | |
v_mov_b32_e32 v77, 0 | |
v_and_b32_e32 v67, 1, v67 | |
v_cmp_eq_u32_e32 vcc, 1, v67 | |
v_mov_b32_e32 v79, v77 | |
v_mov_b32_e32 v82, v77 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[34:35], exec, s[4:5] | |
; implicit-def: %VGPR83_VGPR84_VGPR85_VGPR86 | |
s_waitcnt vmcnt(0) | |
; mask branch BB6_139 | |
s_cbranch_execz BB6_139 | |
BB6_136: ; in Loop: Header=BB6_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[87:90], v55 offset1:1 | |
s_and_b64 s[4:5], exec, s[0:1] | |
v_cmp_ne_u32_e32 vcc, v41, v76 | |
s_xor_b64 s[4:5], s[4:5], -1 | |
s_or_b64 s[4:5], s[4:5], vcc | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v80, v73, v88 | |
s_and_b64 s[8:9], exec, s[2:3] | |
s_or_b64 s[4:5], s[8:9], s[4:5] | |
v_subrev_f32_e32 v81, v72, v87 | |
v_mul_f32_e32 v83, v80, v80 | |
v_cndmask_b32_e64 v77, 0, 1.0, s[4:5] | |
v_subrev_f32_e32 v78, v74, v89 | |
v_mac_f32_e32 v83, v81, v81 | |
v_mac_f32_e32 v83, v78, v78 | |
v_mul_f32_e32 v77, s22, v77 | |
v_cmp_lt_f32_e32 vcc, v83, v77 | |
v_mov_b32_e32 v77, 0 | |
v_subrev_f32_e32 v67, v75, v90 | |
v_mov_b32_e32 v79, v77 | |
v_mov_b32_e32 v82, v77 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[38:39], exec, s[4:5] | |
; implicit-def: %VGPR83_VGPR84_VGPR85_VGPR86 | |
; mask branch BB6_138 | |
s_cbranch_execz BB6_138 | |
BB6_137: ; in Loop: Header=BB6_11 Depth=1 | |
v_max_f32_e32 v79, 0x34cd15ae, v83 | |
v_mul_f32_e32 v82, v9, v79 | |
v_mul_f32_e32 v83, v82, v82 | |
v_mov_b32_e32 v84, 0x3a92b707 | |
v_madak_f32_e32 v84, v84, v83, 0x3ded3cb2 | |
v_mov_b32_e32 v85, 0x3c739487 | |
v_madak_f32_e32 v85, v85, v83, 0x3f01e2bc | |
v_mad_f32 v84, v84, v83, 1.0 | |
v_mac_f32_e32 v84, v82, v85 | |
v_mov_b32_e32 v85, 0xb2951928 | |
v_madak_f32_e32 v85, v85, v83, 0xb85ffb93 | |
v_mov_b32_e32 v86, 0x35c55945 | |
v_madak_f32_e32 v86, v86, v83, 0x3a83ca0c | |
v_madak_f32_e32 v85, v85, v83, 0xbc9ded90 | |
v_madak_f32_e32 v86, v86, v83, 0x3d8eaf3b | |
v_madak_f32_e32 v85, v85, v83, 0xbf409397 | |
v_mac_f32_e32 v85, v82, v86 | |
v_rsq_f32_e32 v86, v79 | |
v_mul_f32_e32 v79, s18, v79 | |
v_mov_b32_e32 v88, 0x3fa00000 | |
v_mul_f32_e32 v77, v75, v90 | |
v_mul_f32_e32 v79, v86, v79 | |
v_and_b32_e32 v87, s27, v79 | |
v_cmp_gt_f32_e64 s[4:5], v88, v87 | |
v_mul_f32_e32 v88, v87, v87 | |
v_rcp_f32_e32 v89, v88 | |
v_add_f32_e32 v90, -1.0, v87 | |
v_mov_b32_e32 v91, 0xbd777f97 | |
v_lshrrev_b32_e32 v82, 24, v65 | |
v_cndmask_b32_e64 v89, v89, v90, s[4:5] | |
v_mov_b32_e32 v90, 0x3f580000 | |
v_cmp_gt_f32_e64 s[8:9], v90, v87 | |
v_cndmask_b32_e64 v88, v89, v88, s[8:9] | |
v_mov_b32_e32 v90, 0xc11d077e | |
v_mov_b32_e32 v89, 0x4036db6e | |
v_madak_f32_e32 v90, v90, v88, 0xc2a2932b | |
v_cmp_gt_f32_e64 s[10:11], v89, v87 | |
v_mov_b32_e32 v89, 0xc3f1c275 | |
v_madak_f32_e32 v89, v89, v88, 0xc480230b | |
v_madak_f32_e32 v90, v88, v90, 0xc3389ae7 | |
v_madak_f32_e32 v89, v88, v89, 0xc41f6441 | |
v_madak_f32_e32 v90, v88, v90, 0xc322658c | |
v_madak_f32_e32 v89, v88, v89, 0xc320a2ea | |
v_madak_f32_e32 v90, v88, v90, 0xc2798057 | |
v_madak_f32_e32 v89, v88, v89, 0xc18e104b | |
v_madak_f32_e32 v90, v88, v90, 0xc128f022 | |
v_madak_f32_e32 v89, v88, v89, 0xbf4c9dd4 | |
v_madak_f32_e32 v90, v88, v90, 0xbf31a0b7 | |
v_madak_f32_e32 v89, v88, v89, 0xbc21a092 | |
v_madak_f32_e32 v90, v88, v90, 0xbc21a093 | |
v_madak_f32_e32 v91, v91, v88, 0x40d23f7c | |
v_cndmask_b32_e64 v89, v89, v90, s[10:11] | |
v_mov_b32_e32 v90, 0xc1b38712 | |
v_madak_f32_e32 v90, v90, v88, 0x43ed43a7 | |
v_madak_f32_e32 v91, v88, v91, 0x42d9451f | |
v_madak_f32_e32 v90, v88, v90, 0x451f90ce | |
v_madak_f32_e32 v91, v88, v91, 0x43d6810b | |
v_madak_f32_e32 v90, v88, v90, 0x4547fdbb | |
v_madak_f32_e32 v91, v88, v91, 0x442158c9 | |
v_madak_f32_e32 v90, v88, v90, 0x44c01759 | |
v_madak_f32_e32 v91, v88, v91, 0x43d9486f | |
v_madak_f32_e32 v90, v88, v90, 0x43a2e571 | |
v_madak_f32_e32 v91, v88, v91, 0x4309a863 | |
v_madak_f32_e32 v90, v88, v90, 0x41f2b459 | |
v_madak_f32_e32 v91, v88, v91, 0x419d35ce | |
v_cndmask_b32_e64 v90, v90, v91, s[10:11] | |
v_mov_b32_e32 v91, 0xbb0df9c0 | |
v_madak_f32_e32 v91, v91, v88, 0x3d1151b3 | |
v_madak_f32_e32 v91, v88, v91, 0xbde31cc2 | |
v_madak_f32_e32 v91, v88, v91, 0x3ea2fe54 | |
v_madak_f32_e32 v91, v88, v91, 0xbebe9208 | |
v_madak_f32_e32 v91, v88, v91, 0x3ed46805 | |
v_madak_f32_e32 v91, v88, v91, 0xbb1acdc6 | |
v_cndmask_b32_e64 v89, v89, v91, s[4:5] | |
v_mov_b32_e32 v91, 0x3c445aa3 | |
v_madak_f32_e32 v91, v91, v88, 0x3c5f6e13 | |
v_madak_f32_e32 v91, v88, v91, 0x3e013307 | |
v_madak_f32_e32 v91, v88, v91, 0x3d931ae7 | |
v_madak_f32_e32 v91, v88, v91, 0x3f0a5785 | |
v_madak_f32_e32 v91, v88, v91, 0x3dd9f331 | |
v_cndmask_b32_e64 v90, v90, v91, s[4:5] | |
v_mov_b32_e32 v91, 0xb7c756b1 | |
v_madak_f32_e32 v91, v91, v88, 0xbbbd1489 | |
v_madak_f32_e32 v91, v88, v91, 0xbce9528f | |
v_madak_f32_e32 v91, v88, v91, 0xbea66beb | |
v_madak_f32_e32 v91, v88, v91, 0x3e0375d4 | |
v_cndmask_b32_e64 v89, v89, v91, s[8:9] | |
v_mov_b32_e32 v91, 0xb684e21a | |
v_madak_f32_e32 v91, v91, v88, 0x390aee49 | |
v_madak_f32_e32 v91, v88, v91, 0x3ba68116 | |
v_madak_f32_e32 v91, v88, v91, 0x3d852a63 | |
v_madak_f32_e32 v91, v88, v91, 0x3ecbbbce | |
v_cndmask_b32_e64 v90, v90, v91, s[8:9] | |
v_mad_f32 v88, v88, v90, 1.0 | |
v_mov_b32_e32 v90, 0x6f800000 | |
v_cmp_gt_f32_e64 s[10:11], |v88|, v90 | |
v_mov_b32_e32 v91, 0x2f800000 | |
v_cndmask_b32_e64 v92, 1.0, v91, s[10:11] | |
v_mul_f32_e32 v88, v92, v88 | |
v_rcp_f32_e32 v88, v88 | |
v_and_b32_e32 v82, 1, v82 | |
v_and_b32_e32 v97, s50, v79 | |
v_mov_b32_e32 v98, 0xbf100000 | |
v_cmp_eq_u32_e32 vcc, 1, v82 | |
v_mad_f32 v98, v97, -v97, v98 | |
v_mul_f32_e32 v88, v88, v89 | |
v_cndmask_b32_e64 v89, 0, 1.0, vcc | |
v_cmp_gt_f32_e32 vcc, 0, v98 | |
v_cndmask_b32_e64 v99, 0.5, -0.5, vcc | |
v_mov_b32_e32 v100, 0x3fb8aa3b | |
v_mac_f32_e32 v99, v100, v98 | |
v_cvt_i32_f32_e32 v99, v99 | |
v_mov_b32_e32 v103, 0xbf317180 | |
v_mov_b32_e32 v105, 0xb717f7d1 | |
v_subrev_f32_e32 v114, v87, v97 | |
v_cvt_f32_i32_e32 v101, v99 | |
v_mul_f32_e32 v102, v88, v92 | |
v_add_f32_e32 v97, v87, v97 | |
v_mad_f32 v97, v97, v114, v102 | |
v_mad_f32 v104, v103, v101, v98 | |
v_mad_f32 v106, v105, v101, v104 | |
v_mul_f32_e32 v107, v106, v106 | |
v_mov_b32_e32 v108, 0xb5ddea0e | |
v_mov_b32_e32 v109, 0x3331bb4c | |
v_cmp_gt_f32_e64 s[10:11], 0, v97 | |
v_mad_f32 v110, v109, v107, v108 | |
v_mov_b32_e32 v111, 0x388ab355 | |
v_cndmask_b32_e64 v114, 0.5, -0.5, s[10:11] | |
v_mad_f32 v110, v110, v107, v111 | |
v_mov_b32_e32 v112, 0xbb360b61 | |
v_mac_f32_e32 v114, v100, v97 | |
v_mad_f32 v110, v110, v107, v112 | |
v_mov_b32_e32 v113, 0x3e2aaaab | |
v_mad_f32 v110, v110, v107, v113 | |
v_cvt_i32_f32_e32 v100, v114 | |
v_mad_f32 v107, -v107, v110, v106 | |
v_sub_f32_e32 v110, 2.0, v107 | |
v_cmp_gt_f32_e64 vcc, |v110|, v90 | |
v_cvt_f32_i32_e32 v115, v100 | |
v_cndmask_b32_e32 v114, 1.0, v91, vcc | |
v_mul_f32_e64 v110, v110, -v114 | |
v_rcp_f32_e32 v110, v110 | |
v_mad_f32 v103, v103, v115, v97 | |
v_mul_f32_e32 v106, v107, v106 | |
v_mad_f32 v107, v105, v115, v103 | |
v_mul_f32_e32 v106, v110, v106 | |
v_mul_f32_e32 v110, v107, v107 | |
v_mac_f32_e32 v108, v109, v110 | |
v_mac_f32_e32 v111, v108, v110 | |
s_mov_b32 m0, -1 | |
v_mac_f32_e32 v112, v111, v110 | |
ds_read_b64 v[82:83], v56 | |
v_mac_f32_e32 v113, v112, v110 | |
v_rcp_f32_e32 v84, v84 | |
v_mul_f32_e32 v93, v86, v86 | |
v_mad_f32 v108, -v110, v113, v107 | |
v_mul_f32_e32 v94, v93, v93 | |
v_sub_f32_e32 v109, 2.0, v108 | |
v_mul_f32_e32 v94, v89, v94 | |
v_cmp_gt_f32_e64 vcc, |v109|, v90 | |
v_mul_f32_e32 v95, v93, v94 | |
v_mad_f32 v94, v94, v93, s23 | |
v_cndmask_b32_e32 v110, 1.0, v91, vcc | |
v_mul_f32_e64 v109, v109, -v110 | |
v_mul_f32_e32 v84, v37, v84 | |
v_mul_f32_e32 v106, v106, v114 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v82, v70, v82 | |
v_mul_f32_e32 v94, 0xbe2aaaab, v94 | |
v_mul_f32_e32 v83, v71, v83 | |
v_mad_f32 v96, v95, v95, s26 | |
v_mad_f32 v101, -v101, v105, v106 | |
v_mul_f32_e32 v84, v85, v84 | |
v_mul_f32_e32 v85, v89, v93 | |
v_mul_f32_e32 v96, v96, v83 | |
v_mul_f32_e32 v94, v82, v94 | |
v_rcp_f32_e32 v109, v109 | |
v_mac_f32_e32 v94, 0x3daaaaaa, v96 | |
v_mac_f32_e32 v84, v86, v85 | |
v_subrev_f32_e32 v85, v104, v101 | |
v_mac_f32_e32 v8, v89, v94 | |
v_lshlrev_b32_e32 v94, 23, v99 | |
v_sub_f32_e32 v85, 1.0, v85 | |
v_mul_f32_e32 v106, v108, v107 | |
v_add_i32_e32 v85, vcc, v85, v94 | |
v_mov_b32_e32 v94, 0xc2aeac4f | |
v_mul_f32_e32 v106, v109, v106 | |
v_cmp_nlt_f32_e32 vcc, v98, v94 | |
v_mov_b32_e32 v96, 0x42b17218 | |
v_mul_f32_e32 v106, v106, v110 | |
v_cndmask_b32_e32 v85, 0, v85, vcc | |
v_cmp_lt_f32_e32 vcc, v98, v96 | |
v_mov_b32_e32 v99, 0x7f800000 | |
v_cndmask_b32_e32 v85, v99, v85, vcc | |
v_cmp_u_f32_e32 vcc, v98, v98 | |
v_mad_f32 v105, -v115, v105, v106 | |
v_cndmask_b32_e32 v85, v85, v98, vcc | |
v_subrev_f32_e32 v98, v103, v105 | |
v_sub_f32_e32 v98, 1.0, v98 | |
v_lshlrev_b32_e32 v100, 23, v100 | |
v_add_i32_e32 v98, vcc, v98, v100 | |
v_cmp_nlt_f32_e32 vcc, v97, v94 | |
v_cndmask_b32_e32 v94, 0, v98, vcc | |
v_cmp_lt_f32_e32 vcc, v97, v96 | |
v_cndmask_b32_e32 v94, v99, v94, vcc | |
v_cmp_u_f32_e32 vcc, v97, v97 | |
v_cndmask_b32_e32 v94, v94, v97, vcc | |
v_cmp_gt_f32_e64 vcc, |v87|, v90 | |
v_mov_b32_e32 v90, 0x40c00000 | |
v_cmp_gt_f32_e64 s[10:11], v90, v87 | |
v_mov_b32_e32 v90, 0x31800000 | |
v_cmp_gt_f32_e64 s[12:13], v90, v87 | |
v_cndmask_b32_e32 v90, 1.0, v91, vcc | |
v_mul_f32_e32 v87, v90, v87 | |
v_rcp_f32_e32 v87, v87 | |
v_mul_f32_e32 v85, v94, v85 | |
v_cmp_u_f32_e32 vcc, v79, v79 | |
v_mac_f32_e32 v51, v0, v67 | |
v_mul_f32_e32 v85, v87, v85 | |
v_mad_f32 v85, -v90, v85, 1.0 | |
v_madak_f32_e32 v87, v92, v88, 0x3f58560b | |
v_cndmask_b32_e64 v85, 1.0, v85, s[10:11] | |
v_cndmask_b32_e64 v85, v85, v87, s[4:5] | |
v_and_b32_e32 v87, s51, v79 | |
v_or_b32_e32 v85, v87, v85 | |
v_mad_f32 v87, v102, v79, v79 | |
v_cndmask_b32_e64 v85, v85, v87, s[8:9] | |
v_mul_f32_e32 v87, 0x3f8375d4, v79 | |
v_mac_f32_e32 v87, 0x41000000, v79 | |
v_mul_f32_e32 v87, 0x3e000000, v87 | |
v_cndmask_b32_e64 v85, v85, v87, s[12:13] | |
v_cndmask_b32_e32 v79, v85, v79, vcc | |
v_subrev_f32_e32 v79, v79, v89 | |
v_mul_f32_e64 v85, s19, -v89 | |
v_mac_f32_e32 v85, v79, v86 | |
v_mad_f32 v79, v83, v95, -v82 | |
v_mul_f32_e32 v82, v93, v95 | |
v_mul_f32_e32 v83, v79, v82 | |
v_mac_f32_e32 v83, v84, v77 | |
v_mac_f32_e32 v5, v85, v77 | |
v_mad_f32 v50, v78, v83, v50 | |
v_mad_f32 v49, v80, v83, v49 | |
v_mad_f32 v48, v81, v83, v48 | |
v_mul_f32_e64 v82, v83, -v78 | |
v_mul_f32_e64 v79, v83, -v80 | |
v_mul_f32_e64 v77, v83, -v81 | |
v_mul_f32_e64 v86, v67, -v0 | |
BB6_138: ; %Flow1220 | |
; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[38:39] | |
BB6_139: ; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[34:35] | |
v_lshrrev_b32_e32 v67, 25, v69 | |
v_and_b32_e32 v67, 1, v67 | |
v_cmp_eq_u32_e32 vcc, 1, v67 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[12:13], exec, s[4:5] | |
; mask branch BB6_143 | |
s_cbranch_execz BB6_143 | |
BB6_140: ; in Loop: Header=BB6_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[87:90], v55 offset0:16 offset1:17 | |
s_and_b64 s[4:5], exec, s[0:1] | |
v_cmp_ne_u32_e32 vcc, v64, v76 | |
s_xor_b64 s[4:5], s[4:5], -1 | |
s_or_b64 s[4:5], s[4:5], vcc | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v80, v73, v88 | |
s_and_b64 s[8:9], exec, s[2:3] | |
s_or_b64 s[4:5], s[8:9], s[4:5] | |
v_subrev_f32_e32 v78, v72, v87 | |
v_mul_f32_e32 v83, v80, v80 | |
v_cndmask_b32_e64 v84, 0, 1.0, s[4:5] | |
v_subrev_f32_e32 v81, v74, v89 | |
v_mac_f32_e32 v83, v78, v78 | |
v_mac_f32_e32 v83, v81, v81 | |
v_mul_f32_e32 v84, s22, v84 | |
v_subrev_f32_e32 v67, v75, v90 | |
v_cmp_lt_f32_e32 vcc, v83, v84 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[34:35], exec, s[4:5] | |
; mask branch BB6_142 | |
s_cbranch_execz BB6_142 | |
BB6_141: ; in Loop: Header=BB6_11 Depth=1 | |
v_max_f32_e32 v85, 0x34cd15ae, v83 | |
v_rsq_f32_e32 v89, v85 | |
v_lshrrev_b32_e32 v83, 25, v65 | |
v_and_b32_e32 v83, 1, v83 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e32 vcc, 1, v83 | |
ds_read_b64 v[83:84], v56 offset:64 | |
v_mul_f32_e32 v93, v89, v89 | |
v_cndmask_b32_e64 v88, 0, 1.0, vcc | |
v_mul_f32_e32 v94, v93, v93 | |
v_mul_f32_e32 v94, v88, v94 | |
v_mul_f32_e32 v95, v93, v94 | |
v_mad_f32 v94, v94, v93, s23 | |
v_mul_f32_e32 v87, v75, v90 | |
v_mul_f32_e32 v90, v9, v85 | |
v_mul_f32_e32 v91, v90, v90 | |
v_mov_b32_e32 v92, 0x3a92b707 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v83, v70, v83 | |
v_mul_f32_e32 v84, v71, v84 | |
v_mul_f32_e32 v94, 0xbe2aaaab, v94 | |
v_madak_f32_e32 v92, v92, v91, 0x3ded3cb2 | |
v_mad_f32 v96, v84, v95, -v83 | |
v_mul_f32_e32 v97, v93, v95 | |
v_mad_f32 v95, v95, v95, s26 | |
v_mul_f32_e32 v83, v83, v94 | |
v_mov_b32_e32 v94, 0x3c739487 | |
v_mul_f32_e32 v84, v95, v84 | |
v_madak_f32_e32 v94, v94, v91, 0x3f01e2bc | |
v_mad_f32 v92, v92, v91, 1.0 | |
v_mac_f32_e32 v83, 0x3daaaaaa, v84 | |
v_mac_f32_e32 v92, v90, v94 | |
v_mov_b32_e32 v94, 0xb2951928 | |
v_mac_f32_e32 v8, v88, v83 | |
v_rcp_f32_e32 v83, v92 | |
v_madak_f32_e32 v94, v94, v91, 0xb85ffb93 | |
v_mov_b32_e32 v95, 0x35c55945 | |
v_madak_f32_e32 v95, v95, v91, 0x3a83ca0c | |
v_madak_f32_e32 v94, v94, v91, 0xbc9ded90 | |
v_madak_f32_e32 v95, v95, v91, 0x3d8eaf3b | |
v_madak_f32_e32 v91, v94, v91, 0xbf409397 | |
v_mac_f32_e32 v91, v90, v95 | |
v_mul_f32_e32 v83, v37, v83 | |
v_mul_f32_e32 v84, v88, v93 | |
v_mul_f32_e32 v83, v91, v83 | |
v_mul_f32_e32 v96, v96, v97 | |
v_mac_f32_e32 v83, v89, v84 | |
v_mac_f32_e32 v96, v83, v87 | |
v_mul_f32_e32 v83, s18, v85 | |
v_mul_f32_e32 v83, v89, v83 | |
v_and_b32_e32 v84, s27, v83 | |
v_mov_b32_e32 v85, 0x3fa00000 | |
v_cmp_gt_f32_e64 s[4:5], v85, v84 | |
v_mul_f32_e32 v85, v84, v84 | |
v_rcp_f32_e32 v90, v85 | |
v_add_f32_e32 v91, -1.0, v84 | |
v_mov_b32_e32 v93, 0xbd777f97 | |
v_mov_b32_e32 v94, 0x4036db6e | |
v_cndmask_b32_e64 v90, v90, v91, s[4:5] | |
v_mov_b32_e32 v91, 0x3f580000 | |
v_cmp_gt_f32_e64 s[8:9], v91, v84 | |
v_cndmask_b32_e64 v85, v90, v85, s[8:9] | |
v_mov_b32_e32 v91, 0xc1b38712 | |
v_madak_f32_e32 v93, v93, v85, 0x40d23f7c | |
v_madak_f32_e32 v91, v91, v85, 0x43ed43a7 | |
v_madak_f32_e32 v93, v85, v93, 0x42d9451f | |
v_madak_f32_e32 v91, v85, v91, 0x451f90ce | |
v_madak_f32_e32 v93, v85, v93, 0x43d6810b | |
v_madak_f32_e32 v91, v85, v91, 0x4547fdbb | |
v_madak_f32_e32 v93, v85, v93, 0x442158c9 | |
v_madak_f32_e32 v91, v85, v91, 0x44c01759 | |
v_madak_f32_e32 v93, v85, v93, 0x43d9486f | |
v_madak_f32_e32 v91, v85, v91, 0x43a2e571 | |
v_madak_f32_e32 v93, v85, v93, 0x4309a863 | |
v_cmp_gt_f32_e32 vcc, v94, v84 | |
v_madak_f32_e32 v91, v85, v91, 0x41f2b459 | |
v_madak_f32_e32 v93, v85, v93, 0x419d35ce | |
v_cndmask_b32_e32 v91, v91, v93, vcc | |
v_mov_b32_e32 v93, 0x3c445aa3 | |
v_madak_f32_e32 v93, v93, v85, 0x3c5f6e13 | |
v_madak_f32_e32 v93, v85, v93, 0x3e013307 | |
v_madak_f32_e32 v93, v85, v93, 0x3d931ae7 | |
v_madak_f32_e32 v93, v85, v93, 0x3f0a5785 | |
v_mov_b32_e32 v92, 0xc11d077e | |
v_madak_f32_e32 v93, v85, v93, 0x3dd9f331 | |
v_mov_b32_e32 v90, 0xc3f1c275 | |
v_madak_f32_e32 v92, v92, v85, 0xc2a2932b | |
v_cndmask_b32_e64 v91, v91, v93, s[4:5] | |
v_mov_b32_e32 v93, 0xb684e21a | |
v_madak_f32_e32 v90, v90, v85, 0xc480230b | |
v_madak_f32_e32 v92, v85, v92, 0xc3389ae7 | |
v_madak_f32_e32 v93, v93, v85, 0x390aee49 | |
v_madak_f32_e32 v90, v85, v90, 0xc41f6441 | |
v_madak_f32_e32 v92, v85, v92, 0xc322658c | |
v_madak_f32_e32 v93, v85, v93, 0x3ba68116 | |
v_madak_f32_e32 v90, v85, v90, 0xc320a2ea | |
v_madak_f32_e32 v92, v85, v92, 0xc2798057 | |
v_madak_f32_e32 v93, v85, v93, 0x3d852a63 | |
v_madak_f32_e32 v90, v85, v90, 0xc18e104b | |
v_madak_f32_e32 v92, v85, v92, 0xc128f022 | |
v_madak_f32_e32 v93, v85, v93, 0x3ecbbbce | |
v_madak_f32_e32 v90, v85, v90, 0xbf4c9dd4 | |
v_madak_f32_e32 v92, v85, v92, 0xbf31a0b7 | |
v_cndmask_b32_e64 v91, v91, v93, s[8:9] | |
v_and_b32_e32 v93, s50, v83 | |
v_mov_b32_e32 v94, 0xbf100000 | |
v_madak_f32_e32 v90, v85, v90, 0xbc21a092 | |
v_madak_f32_e32 v92, v85, v92, 0xbc21a093 | |
v_mad_f32 v94, v93, -v93, v94 | |
v_cndmask_b32_e32 v90, v90, v92, vcc | |
v_cmp_gt_f32_e32 vcc, 0, v94 | |
v_cndmask_b32_e64 v95, 0.5, -0.5, vcc | |
v_mov_b32_e32 v97, 0x3fb8aa3b | |
v_mac_f32_e32 v95, v97, v94 | |
v_cvt_i32_f32_e32 v95, v95 | |
v_mov_b32_e32 v92, 0xbb0df9c0 | |
v_madak_f32_e32 v92, v92, v85, 0x3d1151b3 | |
v_madak_f32_e32 v92, v85, v92, 0xbde31cc2 | |
v_cvt_f32_i32_e32 v100, v95 | |
v_madak_f32_e32 v92, v85, v92, 0x3ea2fe54 | |
v_mov_b32_e32 v101, 0xbf317180 | |
v_madak_f32_e32 v92, v85, v92, 0xbebe9208 | |
v_mad_f32 v102, v101, v100, v94 | |
v_mov_b32_e32 v103, 0xb717f7d1 | |
v_madak_f32_e32 v92, v85, v92, 0x3ed46805 | |
v_mad_f32 v104, v103, v100, v102 | |
v_madak_f32_e32 v92, v85, v92, 0xbb1acdc6 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mov_b32_e32 v106, 0xb5ddea0e | |
v_mov_b32_e32 v107, 0x3331bb4c | |
v_cndmask_b32_e64 v90, v90, v92, s[4:5] | |
v_mov_b32_e32 v92, 0xb7c756b1 | |
v_mad_f32 v108, v107, v105, v106 | |
v_mov_b32_e32 v109, 0x388ab355 | |
v_madak_f32_e32 v92, v92, v85, 0xbbbd1489 | |
v_mad_f32 v108, v108, v105, v109 | |
v_mov_b32_e32 v110, 0xbb360b61 | |
v_madak_f32_e32 v92, v85, v92, 0xbce9528f | |
v_mad_f32 v108, v108, v105, v110 | |
v_mov_b32_e32 v111, 0x3e2aaaab | |
v_madak_f32_e32 v92, v85, v92, 0xbea66beb | |
v_mad_f32 v108, v108, v105, v111 | |
v_madak_f32_e32 v92, v85, v92, 0x3e0375d4 | |
v_mad_f32 v105, -v105, v108, v104 | |
v_mad_f32 v85, v85, v91, 1.0 | |
v_mov_b32_e32 v91, 0x6f800000 | |
v_cmp_gt_f32_e64 vcc, |v85|, v91 | |
v_mov_b32_e32 v98, 0x2f800000 | |
v_sub_f32_e32 v108, 2.0, v105 | |
v_cndmask_b32_e32 v99, 1.0, v98, vcc | |
v_cmp_gt_f32_e64 vcc, |v108|, v91 | |
v_mul_f32_e32 v85, v99, v85 | |
v_cndmask_b32_e32 v112, 1.0, v98, vcc | |
v_mul_f32_e64 v108, v108, -v112 | |
v_rcp_f32_e32 v85, v85 | |
v_rcp_f32_e32 v108, v108 | |
v_cndmask_b32_e64 v90, v90, v92, s[8:9] | |
v_lshlrev_b32_e32 v92, 23, v95 | |
v_mul_f32_e32 v85, v85, v90 | |
v_mul_f32_e32 v90, v105, v104 | |
v_mul_f32_e32 v90, v108, v90 | |
v_mul_f32_e32 v90, v90, v112 | |
v_mad_f32 v90, -v100, v103, v90 | |
v_subrev_f32_e32 v90, v102, v90 | |
v_sub_f32_e32 v90, 1.0, v90 | |
v_add_i32_e32 v90, vcc, v90, v92 | |
v_mov_b32_e32 v92, 0xc2aeac4f | |
v_cmp_nlt_f32_e32 vcc, v94, v92 | |
v_mov_b32_e32 v95, 0x42b17218 | |
v_cndmask_b32_e32 v90, 0, v90, vcc | |
v_cmp_lt_f32_e32 vcc, v94, v95 | |
v_mov_b32_e32 v100, 0x7f800000 | |
v_cndmask_b32_e32 v90, v100, v90, vcc | |
v_cmp_u_f32_e32 vcc, v94, v94 | |
v_cndmask_b32_e32 v90, v90, v94, vcc | |
v_subrev_f32_e32 v94, v84, v93 | |
v_mul_f32_e32 v102, v85, v99 | |
v_add_f32_e32 v93, v84, v93 | |
v_mad_f32 v93, v93, v94, v102 | |
v_cmp_gt_f32_e32 vcc, 0, v93 | |
v_cndmask_b32_e64 v94, 0.5, -0.5, vcc | |
v_mac_f32_e32 v94, v97, v93 | |
v_cvt_i32_f32_e32 v94, v94 | |
v_madak_f32_e32 v85, v99, v85, 0x3f58560b | |
v_mad_f32 v82, -v96, v81, v82 | |
v_mad_f32 v44, v81, v96, v44 | |
v_cvt_f32_i32_e32 v97, v94 | |
v_lshlrev_b32_e32 v94, 23, v94 | |
v_mad_f32 v79, -v96, v80, v79 | |
v_mad_f32 v43, v80, v96, v43 | |
v_mad_f32 v101, v101, v97, v93 | |
v_mad_f32 v104, v103, v97, v101 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mac_f32_e32 v106, v107, v105 | |
v_mac_f32_e32 v109, v106, v105 | |
v_mac_f32_e32 v110, v109, v105 | |
v_mac_f32_e32 v111, v110, v105 | |
v_mad_f32 v105, -v105, v111, v104 | |
v_mul_f32_e32 v104, v105, v104 | |
v_sub_f32_e32 v105, 2.0, v105 | |
v_cmp_gt_f32_e64 vcc, |v105|, v91 | |
v_cndmask_b32_e32 v106, 1.0, v98, vcc | |
v_mul_f32_e64 v105, v105, -v106 | |
v_rcp_f32_e32 v105, v105 | |
v_mad_f32 v77, -v96, v78, v77 | |
v_mad_f32 v42, v78, v96, v42 | |
v_mad_f32 v86, -v67, v0, v86 | |
v_mul_f32_e32 v104, v105, v104 | |
v_mul_f32_e32 v104, v104, v106 | |
v_mad_f32 v97, -v97, v103, v104 | |
v_subrev_f32_e32 v97, v101, v97 | |
v_sub_f32_e32 v97, 1.0, v97 | |
v_add_i32_e32 v94, vcc, v97, v94 | |
v_cmp_nlt_f32_e32 vcc, v93, v92 | |
v_cndmask_b32_e32 v92, 0, v94, vcc | |
v_cmp_lt_f32_e32 vcc, v93, v95 | |
v_cndmask_b32_e32 v92, v100, v92, vcc | |
v_cmp_gt_f32_e64 vcc, |v84|, v91 | |
v_cndmask_b32_e32 v91, 1.0, v98, vcc | |
v_mul_f32_e32 v94, v91, v84 | |
v_rcp_f32_e32 v94, v94 | |
v_cmp_u_f32_e32 vcc, v93, v93 | |
v_cndmask_b32_e32 v92, v92, v93, vcc | |
v_mul_f32_e32 v90, v92, v90 | |
v_mov_b32_e32 v92, 0x40c00000 | |
v_cmp_gt_f32_e32 vcc, v92, v84 | |
v_mov_b32_e32 v92, 0x31800000 | |
v_mul_f32_e32 v90, v94, v90 | |
v_cmp_gt_f32_e64 s[10:11], v92, v84 | |
v_mad_f32 v84, -v91, v90, 1.0 | |
v_cndmask_b32_e32 v84, 1.0, v84, vcc | |
v_cndmask_b32_e64 v84, v84, v85, s[4:5] | |
v_and_b32_e32 v85, s51, v83 | |
v_or_b32_e32 v84, v85, v84 | |
v_mad_f32 v85, v102, v83, v83 | |
v_cndmask_b32_e64 v84, v84, v85, s[8:9] | |
v_mul_f32_e32 v85, 0x3f8375d4, v83 | |
v_mac_f32_e32 v85, 0x41000000, v83 | |
v_mul_f32_e32 v85, 0x3e000000, v85 | |
v_cndmask_b32_e64 v84, v84, v85, s[10:11] | |
v_cmp_u_f32_e32 vcc, v83, v83 | |
v_cndmask_b32_e32 v83, v84, v83, vcc | |
v_subrev_f32_e32 v83, v83, v88 | |
v_mul_f32_e64 v84, s19, -v88 | |
v_mac_f32_e32 v84, v83, v89 | |
v_mac_f32_e32 v5, v84, v87 | |
v_mac_f32_e32 v45, v0, v67 | |
BB6_142: ; %Flow1219 | |
; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[34:35] | |
BB6_143: ; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[12:13] | |
v_lshrrev_b32_e32 v67, 26, v69 | |
v_and_b32_e32 v67, 1, v67 | |
v_cmp_eq_u32_e32 vcc, 1, v67 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[12:13], exec, s[4:5] | |
; mask branch BB6_147 | |
s_cbranch_execz BB6_147 | |
BB6_144: ; in Loop: Header=BB6_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[87:90], v55 offset0:32 offset1:33 | |
s_and_b64 s[4:5], exec, s[0:1] | |
v_cmp_ne_u32_e32 vcc, v63, v76 | |
s_xor_b64 s[4:5], s[4:5], -1 | |
s_or_b64 s[4:5], s[4:5], vcc | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v80, v73, v88 | |
s_and_b64 s[8:9], exec, s[2:3] | |
s_or_b64 s[4:5], s[8:9], s[4:5] | |
v_subrev_f32_e32 v78, v72, v87 | |
v_mul_f32_e32 v83, v80, v80 | |
v_cndmask_b32_e64 v84, 0, 1.0, s[4:5] | |
v_subrev_f32_e32 v81, v74, v89 | |
v_mac_f32_e32 v83, v78, v78 | |
v_mac_f32_e32 v83, v81, v81 | |
v_mul_f32_e32 v84, s22, v84 | |
v_subrev_f32_e32 v67, v75, v90 | |
v_cmp_lt_f32_e32 vcc, v83, v84 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[34:35], exec, s[4:5] | |
; mask branch BB6_146 | |
s_cbranch_execz BB6_146 | |
BB6_145: ; in Loop: Header=BB6_11 Depth=1 | |
v_max_f32_e32 v85, 0x34cd15ae, v83 | |
v_rsq_f32_e32 v89, v85 | |
v_lshrrev_b32_e32 v83, 26, v65 | |
v_and_b32_e32 v83, 1, v83 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e32 vcc, 1, v83 | |
ds_read_b64 v[83:84], v56 offset:128 | |
v_mul_f32_e32 v93, v89, v89 | |
v_cndmask_b32_e64 v88, 0, 1.0, vcc | |
v_mul_f32_e32 v94, v93, v93 | |
v_mul_f32_e32 v94, v88, v94 | |
v_mul_f32_e32 v95, v93, v94 | |
v_mad_f32 v94, v94, v93, s23 | |
v_mul_f32_e32 v87, v75, v90 | |
v_mul_f32_e32 v90, v9, v85 | |
v_mul_f32_e32 v91, v90, v90 | |
v_mov_b32_e32 v92, 0x3a92b707 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v83, v70, v83 | |
v_mul_f32_e32 v84, v71, v84 | |
v_mul_f32_e32 v94, 0xbe2aaaab, v94 | |
v_madak_f32_e32 v92, v92, v91, 0x3ded3cb2 | |
v_mad_f32 v96, v84, v95, -v83 | |
v_mul_f32_e32 v97, v93, v95 | |
v_mad_f32 v95, v95, v95, s26 | |
v_mul_f32_e32 v83, v83, v94 | |
v_mov_b32_e32 v94, 0x3c739487 | |
v_mul_f32_e32 v84, v95, v84 | |
v_madak_f32_e32 v94, v94, v91, 0x3f01e2bc | |
v_mad_f32 v92, v92, v91, 1.0 | |
v_mac_f32_e32 v83, 0x3daaaaaa, v84 | |
v_mac_f32_e32 v92, v90, v94 | |
v_mov_b32_e32 v94, 0xb2951928 | |
v_mac_f32_e32 v8, v88, v83 | |
v_rcp_f32_e32 v83, v92 | |
v_madak_f32_e32 v94, v94, v91, 0xb85ffb93 | |
v_mov_b32_e32 v95, 0x35c55945 | |
v_madak_f32_e32 v95, v95, v91, 0x3a83ca0c | |
v_madak_f32_e32 v94, v94, v91, 0xbc9ded90 | |
v_madak_f32_e32 v95, v95, v91, 0x3d8eaf3b | |
v_madak_f32_e32 v91, v94, v91, 0xbf409397 | |
v_mac_f32_e32 v91, v90, v95 | |
v_mul_f32_e32 v83, v37, v83 | |
v_mul_f32_e32 v84, v88, v93 | |
v_mul_f32_e32 v83, v91, v83 | |
v_mul_f32_e32 v96, v96, v97 | |
v_mac_f32_e32 v83, v89, v84 | |
v_mac_f32_e32 v96, v83, v87 | |
v_mul_f32_e32 v83, s18, v85 | |
v_mul_f32_e32 v83, v89, v83 | |
v_and_b32_e32 v84, s27, v83 | |
v_mov_b32_e32 v85, 0x3fa00000 | |
v_cmp_gt_f32_e64 s[4:5], v85, v84 | |
v_mul_f32_e32 v85, v84, v84 | |
v_rcp_f32_e32 v90, v85 | |
v_add_f32_e32 v91, -1.0, v84 | |
v_mov_b32_e32 v93, 0xbd777f97 | |
v_mov_b32_e32 v94, 0x4036db6e | |
v_cndmask_b32_e64 v90, v90, v91, s[4:5] | |
v_mov_b32_e32 v91, 0x3f580000 | |
v_cmp_gt_f32_e64 s[8:9], v91, v84 | |
v_cndmask_b32_e64 v85, v90, v85, s[8:9] | |
v_mov_b32_e32 v91, 0xc1b38712 | |
v_madak_f32_e32 v93, v93, v85, 0x40d23f7c | |
v_madak_f32_e32 v91, v91, v85, 0x43ed43a7 | |
v_madak_f32_e32 v93, v85, v93, 0x42d9451f | |
v_madak_f32_e32 v91, v85, v91, 0x451f90ce | |
v_madak_f32_e32 v93, v85, v93, 0x43d6810b | |
v_madak_f32_e32 v91, v85, v91, 0x4547fdbb | |
v_madak_f32_e32 v93, v85, v93, 0x442158c9 | |
v_madak_f32_e32 v91, v85, v91, 0x44c01759 | |
v_madak_f32_e32 v93, v85, v93, 0x43d9486f | |
v_madak_f32_e32 v91, v85, v91, 0x43a2e571 | |
v_madak_f32_e32 v93, v85, v93, 0x4309a863 | |
v_cmp_gt_f32_e32 vcc, v94, v84 | |
v_madak_f32_e32 v91, v85, v91, 0x41f2b459 | |
v_madak_f32_e32 v93, v85, v93, 0x419d35ce | |
v_cndmask_b32_e32 v91, v91, v93, vcc | |
v_mov_b32_e32 v93, 0x3c445aa3 | |
v_madak_f32_e32 v93, v93, v85, 0x3c5f6e13 | |
v_madak_f32_e32 v93, v85, v93, 0x3e013307 | |
v_madak_f32_e32 v93, v85, v93, 0x3d931ae7 | |
v_madak_f32_e32 v93, v85, v93, 0x3f0a5785 | |
v_mov_b32_e32 v92, 0xc11d077e | |
v_madak_f32_e32 v93, v85, v93, 0x3dd9f331 | |
v_mov_b32_e32 v90, 0xc3f1c275 | |
v_madak_f32_e32 v92, v92, v85, 0xc2a2932b | |
v_cndmask_b32_e64 v91, v91, v93, s[4:5] | |
v_mov_b32_e32 v93, 0xb684e21a | |
v_madak_f32_e32 v90, v90, v85, 0xc480230b | |
v_madak_f32_e32 v92, v85, v92, 0xc3389ae7 | |
v_madak_f32_e32 v93, v93, v85, 0x390aee49 | |
v_madak_f32_e32 v90, v85, v90, 0xc41f6441 | |
v_madak_f32_e32 v92, v85, v92, 0xc322658c | |
v_madak_f32_e32 v93, v85, v93, 0x3ba68116 | |
v_madak_f32_e32 v90, v85, v90, 0xc320a2ea | |
v_madak_f32_e32 v92, v85, v92, 0xc2798057 | |
v_madak_f32_e32 v93, v85, v93, 0x3d852a63 | |
v_madak_f32_e32 v90, v85, v90, 0xc18e104b | |
v_madak_f32_e32 v92, v85, v92, 0xc128f022 | |
v_madak_f32_e32 v93, v85, v93, 0x3ecbbbce | |
v_madak_f32_e32 v90, v85, v90, 0xbf4c9dd4 | |
v_madak_f32_e32 v92, v85, v92, 0xbf31a0b7 | |
v_cndmask_b32_e64 v91, v91, v93, s[8:9] | |
v_and_b32_e32 v93, s50, v83 | |
v_mov_b32_e32 v94, 0xbf100000 | |
v_madak_f32_e32 v90, v85, v90, 0xbc21a092 | |
v_madak_f32_e32 v92, v85, v92, 0xbc21a093 | |
v_mad_f32 v94, v93, -v93, v94 | |
v_cndmask_b32_e32 v90, v90, v92, vcc | |
v_cmp_gt_f32_e32 vcc, 0, v94 | |
v_cndmask_b32_e64 v95, 0.5, -0.5, vcc | |
v_mov_b32_e32 v97, 0x3fb8aa3b | |
v_mac_f32_e32 v95, v97, v94 | |
v_cvt_i32_f32_e32 v95, v95 | |
v_mov_b32_e32 v92, 0xbb0df9c0 | |
v_madak_f32_e32 v92, v92, v85, 0x3d1151b3 | |
v_madak_f32_e32 v92, v85, v92, 0xbde31cc2 | |
v_cvt_f32_i32_e32 v100, v95 | |
v_madak_f32_e32 v92, v85, v92, 0x3ea2fe54 | |
v_mov_b32_e32 v101, 0xbf317180 | |
v_madak_f32_e32 v92, v85, v92, 0xbebe9208 | |
v_mad_f32 v102, v101, v100, v94 | |
v_mov_b32_e32 v103, 0xb717f7d1 | |
v_madak_f32_e32 v92, v85, v92, 0x3ed46805 | |
v_mad_f32 v104, v103, v100, v102 | |
v_madak_f32_e32 v92, v85, v92, 0xbb1acdc6 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mov_b32_e32 v106, 0xb5ddea0e | |
v_mov_b32_e32 v107, 0x3331bb4c | |
v_cndmask_b32_e64 v90, v90, v92, s[4:5] | |
v_mov_b32_e32 v92, 0xb7c756b1 | |
v_mad_f32 v108, v107, v105, v106 | |
v_mov_b32_e32 v109, 0x388ab355 | |
v_madak_f32_e32 v92, v92, v85, 0xbbbd1489 | |
v_mad_f32 v108, v108, v105, v109 | |
v_mov_b32_e32 v110, 0xbb360b61 | |
v_madak_f32_e32 v92, v85, v92, 0xbce9528f | |
v_mad_f32 v108, v108, v105, v110 | |
v_mov_b32_e32 v111, 0x3e2aaaab | |
v_madak_f32_e32 v92, v85, v92, 0xbea66beb | |
v_mad_f32 v108, v108, v105, v111 | |
v_madak_f32_e32 v92, v85, v92, 0x3e0375d4 | |
v_mad_f32 v105, -v105, v108, v104 | |
v_mad_f32 v85, v85, v91, 1.0 | |
v_mov_b32_e32 v91, 0x6f800000 | |
v_cmp_gt_f32_e64 vcc, |v85|, v91 | |
v_mov_b32_e32 v98, 0x2f800000 | |
v_sub_f32_e32 v108, 2.0, v105 | |
v_cndmask_b32_e32 v99, 1.0, v98, vcc | |
v_cmp_gt_f32_e64 vcc, |v108|, v91 | |
v_mul_f32_e32 v85, v99, v85 | |
v_cndmask_b32_e32 v112, 1.0, v98, vcc | |
v_mul_f32_e64 v108, v108, -v112 | |
v_rcp_f32_e32 v85, v85 | |
v_rcp_f32_e32 v108, v108 | |
v_cndmask_b32_e64 v90, v90, v92, s[8:9] | |
v_lshlrev_b32_e32 v92, 23, v95 | |
v_mul_f32_e32 v85, v85, v90 | |
v_mul_f32_e32 v90, v105, v104 | |
v_mul_f32_e32 v90, v108, v90 | |
v_mul_f32_e32 v90, v90, v112 | |
v_mad_f32 v90, -v100, v103, v90 | |
v_subrev_f32_e32 v90, v102, v90 | |
v_sub_f32_e32 v90, 1.0, v90 | |
v_add_i32_e32 v90, vcc, v90, v92 | |
v_mov_b32_e32 v92, 0xc2aeac4f | |
v_cmp_nlt_f32_e32 vcc, v94, v92 | |
v_mov_b32_e32 v95, 0x42b17218 | |
v_cndmask_b32_e32 v90, 0, v90, vcc | |
v_cmp_lt_f32_e32 vcc, v94, v95 | |
v_mov_b32_e32 v100, 0x7f800000 | |
v_cndmask_b32_e32 v90, v100, v90, vcc | |
v_cmp_u_f32_e32 vcc, v94, v94 | |
v_cndmask_b32_e32 v90, v90, v94, vcc | |
v_subrev_f32_e32 v94, v84, v93 | |
v_mul_f32_e32 v102, v85, v99 | |
v_add_f32_e32 v93, v84, v93 | |
v_mad_f32 v93, v93, v94, v102 | |
v_cmp_gt_f32_e32 vcc, 0, v93 | |
v_cndmask_b32_e64 v94, 0.5, -0.5, vcc | |
v_mac_f32_e32 v94, v97, v93 | |
v_cvt_i32_f32_e32 v94, v94 | |
v_madak_f32_e32 v85, v99, v85, 0x3f58560b | |
v_mad_f32 v82, -v96, v81, v82 | |
v_mad_f32 v34, v81, v96, v34 | |
v_cvt_f32_i32_e32 v97, v94 | |
v_lshlrev_b32_e32 v94, 23, v94 | |
v_mad_f32 v79, -v96, v80, v79 | |
v_mad_f32 v33, v80, v96, v33 | |
v_mad_f32 v101, v101, v97, v93 | |
v_mad_f32 v104, v103, v97, v101 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mac_f32_e32 v106, v107, v105 | |
v_mac_f32_e32 v109, v106, v105 | |
v_mac_f32_e32 v110, v109, v105 | |
v_mac_f32_e32 v111, v110, v105 | |
v_mad_f32 v105, -v105, v111, v104 | |
v_mul_f32_e32 v104, v105, v104 | |
v_sub_f32_e32 v105, 2.0, v105 | |
v_cmp_gt_f32_e64 vcc, |v105|, v91 | |
v_cndmask_b32_e32 v106, 1.0, v98, vcc | |
v_mul_f32_e64 v105, v105, -v106 | |
v_rcp_f32_e32 v105, v105 | |
v_mad_f32 v77, -v96, v78, v77 | |
v_mad_f32 v32, v78, v96, v32 | |
v_mad_f32 v86, -v67, v0, v86 | |
v_mul_f32_e32 v104, v105, v104 | |
v_mul_f32_e32 v104, v104, v106 | |
v_mad_f32 v97, -v97, v103, v104 | |
v_subrev_f32_e32 v97, v101, v97 | |
v_sub_f32_e32 v97, 1.0, v97 | |
v_add_i32_e32 v94, vcc, v97, v94 | |
v_cmp_nlt_f32_e32 vcc, v93, v92 | |
v_cndmask_b32_e32 v92, 0, v94, vcc | |
v_cmp_lt_f32_e32 vcc, v93, v95 | |
v_cndmask_b32_e32 v92, v100, v92, vcc | |
v_cmp_gt_f32_e64 vcc, |v84|, v91 | |
v_cndmask_b32_e32 v91, 1.0, v98, vcc | |
v_mul_f32_e32 v94, v91, v84 | |
v_rcp_f32_e32 v94, v94 | |
v_cmp_u_f32_e32 vcc, v93, v93 | |
v_cndmask_b32_e32 v92, v92, v93, vcc | |
v_mul_f32_e32 v90, v92, v90 | |
v_mov_b32_e32 v92, 0x40c00000 | |
v_cmp_gt_f32_e32 vcc, v92, v84 | |
v_mov_b32_e32 v92, 0x31800000 | |
v_mul_f32_e32 v90, v94, v90 | |
v_cmp_gt_f32_e64 s[10:11], v92, v84 | |
v_mad_f32 v84, -v91, v90, 1.0 | |
v_cndmask_b32_e32 v84, 1.0, v84, vcc | |
v_cndmask_b32_e64 v84, v84, v85, s[4:5] | |
v_and_b32_e32 v85, s51, v83 | |
v_or_b32_e32 v84, v85, v84 | |
v_mad_f32 v85, v102, v83, v83 | |
v_cndmask_b32_e64 v84, v84, v85, s[8:9] | |
v_mul_f32_e32 v85, 0x3f8375d4, v83 | |
v_mac_f32_e32 v85, 0x41000000, v83 | |
v_mul_f32_e32 v85, 0x3e000000, v85 | |
v_cndmask_b32_e64 v84, v84, v85, s[10:11] | |
v_cmp_u_f32_e32 vcc, v83, v83 | |
v_cndmask_b32_e32 v83, v84, v83, vcc | |
v_subrev_f32_e32 v83, v83, v88 | |
v_mul_f32_e64 v84, s19, -v88 | |
v_mac_f32_e32 v84, v83, v89 | |
v_mac_f32_e32 v5, v84, v87 | |
v_mac_f32_e32 v35, v0, v67 | |
BB6_146: ; %Flow1218 | |
; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[34:35] | |
BB6_147: ; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[12:13] | |
v_lshrrev_b32_e32 v67, 27, v69 | |
v_and_b32_e32 v67, 1, v67 | |
v_cmp_eq_u32_e32 vcc, 1, v67 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[12:13], exec, s[4:5] | |
; mask branch BB6_151 | |
s_cbranch_execz BB6_151 | |
BB6_148: ; in Loop: Header=BB6_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[87:90], v55 offset0:48 offset1:49 | |
s_and_b64 s[4:5], exec, s[0:1] | |
v_cmp_ne_u32_e32 vcc, v62, v76 | |
s_xor_b64 s[4:5], s[4:5], -1 | |
s_or_b64 s[4:5], s[4:5], vcc | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v80, v73, v88 | |
s_and_b64 s[8:9], exec, s[2:3] | |
s_or_b64 s[4:5], s[8:9], s[4:5] | |
v_subrev_f32_e32 v78, v72, v87 | |
v_mul_f32_e32 v83, v80, v80 | |
v_cndmask_b32_e64 v84, 0, 1.0, s[4:5] | |
v_subrev_f32_e32 v81, v74, v89 | |
v_mac_f32_e32 v83, v78, v78 | |
v_mac_f32_e32 v83, v81, v81 | |
v_mul_f32_e32 v84, s22, v84 | |
v_subrev_f32_e32 v67, v75, v90 | |
v_cmp_lt_f32_e32 vcc, v83, v84 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[34:35], exec, s[4:5] | |
; mask branch BB6_150 | |
s_cbranch_execz BB6_150 | |
BB6_149: ; in Loop: Header=BB6_11 Depth=1 | |
v_max_f32_e32 v85, 0x34cd15ae, v83 | |
v_rsq_f32_e32 v89, v85 | |
v_lshrrev_b32_e32 v83, 27, v65 | |
v_and_b32_e32 v83, 1, v83 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e32 vcc, 1, v83 | |
ds_read_b64 v[83:84], v56 offset:192 | |
v_mul_f32_e32 v93, v89, v89 | |
v_cndmask_b32_e64 v88, 0, 1.0, vcc | |
v_mul_f32_e32 v94, v93, v93 | |
v_mul_f32_e32 v94, v88, v94 | |
v_mul_f32_e32 v95, v93, v94 | |
v_mad_f32 v94, v94, v93, s23 | |
v_mul_f32_e32 v87, v75, v90 | |
v_mul_f32_e32 v90, v9, v85 | |
v_mul_f32_e32 v91, v90, v90 | |
v_mov_b32_e32 v92, 0x3a92b707 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v83, v70, v83 | |
v_mul_f32_e32 v84, v71, v84 | |
v_mul_f32_e32 v94, 0xbe2aaaab, v94 | |
v_madak_f32_e32 v92, v92, v91, 0x3ded3cb2 | |
v_mad_f32 v96, v84, v95, -v83 | |
v_mul_f32_e32 v97, v93, v95 | |
v_mad_f32 v95, v95, v95, s26 | |
v_mul_f32_e32 v83, v83, v94 | |
v_mov_b32_e32 v94, 0x3c739487 | |
v_mul_f32_e32 v84, v95, v84 | |
v_madak_f32_e32 v94, v94, v91, 0x3f01e2bc | |
v_mad_f32 v92, v92, v91, 1.0 | |
v_mac_f32_e32 v83, 0x3daaaaaa, v84 | |
v_mac_f32_e32 v92, v90, v94 | |
v_mov_b32_e32 v94, 0xb2951928 | |
v_mac_f32_e32 v8, v88, v83 | |
v_rcp_f32_e32 v83, v92 | |
v_madak_f32_e32 v94, v94, v91, 0xb85ffb93 | |
v_mov_b32_e32 v95, 0x35c55945 | |
v_madak_f32_e32 v95, v95, v91, 0x3a83ca0c | |
v_madak_f32_e32 v94, v94, v91, 0xbc9ded90 | |
v_madak_f32_e32 v95, v95, v91, 0x3d8eaf3b | |
v_madak_f32_e32 v91, v94, v91, 0xbf409397 | |
v_mac_f32_e32 v91, v90, v95 | |
v_mul_f32_e32 v83, v37, v83 | |
v_mul_f32_e32 v84, v88, v93 | |
v_mul_f32_e32 v83, v91, v83 | |
v_mul_f32_e32 v96, v96, v97 | |
v_mac_f32_e32 v83, v89, v84 | |
v_mac_f32_e32 v96, v83, v87 | |
v_mul_f32_e32 v83, s18, v85 | |
v_mul_f32_e32 v83, v89, v83 | |
v_and_b32_e32 v84, s27, v83 | |
v_mov_b32_e32 v85, 0x3fa00000 | |
v_cmp_gt_f32_e64 s[4:5], v85, v84 | |
v_mul_f32_e32 v85, v84, v84 | |
v_rcp_f32_e32 v90, v85 | |
v_add_f32_e32 v91, -1.0, v84 | |
v_mov_b32_e32 v93, 0xbd777f97 | |
v_mov_b32_e32 v94, 0x4036db6e | |
v_cndmask_b32_e64 v90, v90, v91, s[4:5] | |
v_mov_b32_e32 v91, 0x3f580000 | |
v_cmp_gt_f32_e64 s[8:9], v91, v84 | |
v_cndmask_b32_e64 v85, v90, v85, s[8:9] | |
v_mov_b32_e32 v91, 0xc1b38712 | |
v_madak_f32_e32 v93, v93, v85, 0x40d23f7c | |
v_madak_f32_e32 v91, v91, v85, 0x43ed43a7 | |
v_madak_f32_e32 v93, v85, v93, 0x42d9451f | |
v_madak_f32_e32 v91, v85, v91, 0x451f90ce | |
v_madak_f32_e32 v93, v85, v93, 0x43d6810b | |
v_madak_f32_e32 v91, v85, v91, 0x4547fdbb | |
v_madak_f32_e32 v93, v85, v93, 0x442158c9 | |
v_madak_f32_e32 v91, v85, v91, 0x44c01759 | |
v_madak_f32_e32 v93, v85, v93, 0x43d9486f | |
v_madak_f32_e32 v91, v85, v91, 0x43a2e571 | |
v_madak_f32_e32 v93, v85, v93, 0x4309a863 | |
v_cmp_gt_f32_e32 vcc, v94, v84 | |
v_madak_f32_e32 v91, v85, v91, 0x41f2b459 | |
v_madak_f32_e32 v93, v85, v93, 0x419d35ce | |
v_cndmask_b32_e32 v91, v91, v93, vcc | |
v_mov_b32_e32 v93, 0x3c445aa3 | |
v_madak_f32_e32 v93, v93, v85, 0x3c5f6e13 | |
v_madak_f32_e32 v93, v85, v93, 0x3e013307 | |
v_madak_f32_e32 v93, v85, v93, 0x3d931ae7 | |
v_madak_f32_e32 v93, v85, v93, 0x3f0a5785 | |
v_mov_b32_e32 v92, 0xc11d077e | |
v_madak_f32_e32 v93, v85, v93, 0x3dd9f331 | |
v_mov_b32_e32 v90, 0xc3f1c275 | |
v_madak_f32_e32 v92, v92, v85, 0xc2a2932b | |
v_cndmask_b32_e64 v91, v91, v93, s[4:5] | |
v_mov_b32_e32 v93, 0xb684e21a | |
v_madak_f32_e32 v90, v90, v85, 0xc480230b | |
v_madak_f32_e32 v92, v85, v92, 0xc3389ae7 | |
v_madak_f32_e32 v93, v93, v85, 0x390aee49 | |
v_madak_f32_e32 v90, v85, v90, 0xc41f6441 | |
v_madak_f32_e32 v92, v85, v92, 0xc322658c | |
v_madak_f32_e32 v93, v85, v93, 0x3ba68116 | |
v_madak_f32_e32 v90, v85, v90, 0xc320a2ea | |
v_madak_f32_e32 v92, v85, v92, 0xc2798057 | |
v_madak_f32_e32 v93, v85, v93, 0x3d852a63 | |
v_madak_f32_e32 v90, v85, v90, 0xc18e104b | |
v_madak_f32_e32 v92, v85, v92, 0xc128f022 | |
v_madak_f32_e32 v93, v85, v93, 0x3ecbbbce | |
v_madak_f32_e32 v90, v85, v90, 0xbf4c9dd4 | |
v_madak_f32_e32 v92, v85, v92, 0xbf31a0b7 | |
v_cndmask_b32_e64 v91, v91, v93, s[8:9] | |
v_and_b32_e32 v93, s50, v83 | |
v_mov_b32_e32 v94, 0xbf100000 | |
v_madak_f32_e32 v90, v85, v90, 0xbc21a092 | |
v_madak_f32_e32 v92, v85, v92, 0xbc21a093 | |
v_mad_f32 v94, v93, -v93, v94 | |
v_cndmask_b32_e32 v90, v90, v92, vcc | |
v_cmp_gt_f32_e32 vcc, 0, v94 | |
v_cndmask_b32_e64 v95, 0.5, -0.5, vcc | |
v_mov_b32_e32 v97, 0x3fb8aa3b | |
v_mac_f32_e32 v95, v97, v94 | |
v_cvt_i32_f32_e32 v95, v95 | |
v_mov_b32_e32 v92, 0xbb0df9c0 | |
v_madak_f32_e32 v92, v92, v85, 0x3d1151b3 | |
v_madak_f32_e32 v92, v85, v92, 0xbde31cc2 | |
v_cvt_f32_i32_e32 v100, v95 | |
v_madak_f32_e32 v92, v85, v92, 0x3ea2fe54 | |
v_mov_b32_e32 v101, 0xbf317180 | |
v_madak_f32_e32 v92, v85, v92, 0xbebe9208 | |
v_mad_f32 v102, v101, v100, v94 | |
v_mov_b32_e32 v103, 0xb717f7d1 | |
v_madak_f32_e32 v92, v85, v92, 0x3ed46805 | |
v_mad_f32 v104, v103, v100, v102 | |
v_madak_f32_e32 v92, v85, v92, 0xbb1acdc6 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mov_b32_e32 v106, 0xb5ddea0e | |
v_mov_b32_e32 v107, 0x3331bb4c | |
v_cndmask_b32_e64 v90, v90, v92, s[4:5] | |
v_mov_b32_e32 v92, 0xb7c756b1 | |
v_mad_f32 v108, v107, v105, v106 | |
v_mov_b32_e32 v109, 0x388ab355 | |
v_madak_f32_e32 v92, v92, v85, 0xbbbd1489 | |
v_mad_f32 v108, v108, v105, v109 | |
v_mov_b32_e32 v110, 0xbb360b61 | |
v_madak_f32_e32 v92, v85, v92, 0xbce9528f | |
v_mad_f32 v108, v108, v105, v110 | |
v_mov_b32_e32 v111, 0x3e2aaaab | |
v_madak_f32_e32 v92, v85, v92, 0xbea66beb | |
v_mad_f32 v108, v108, v105, v111 | |
v_madak_f32_e32 v92, v85, v92, 0x3e0375d4 | |
v_mad_f32 v105, -v105, v108, v104 | |
v_mad_f32 v85, v85, v91, 1.0 | |
v_mov_b32_e32 v91, 0x6f800000 | |
v_cmp_gt_f32_e64 vcc, |v85|, v91 | |
v_mov_b32_e32 v98, 0x2f800000 | |
v_sub_f32_e32 v108, 2.0, v105 | |
v_cndmask_b32_e32 v99, 1.0, v98, vcc | |
v_cmp_gt_f32_e64 vcc, |v108|, v91 | |
v_mul_f32_e32 v85, v99, v85 | |
v_cndmask_b32_e32 v112, 1.0, v98, vcc | |
v_mul_f32_e64 v108, v108, -v112 | |
v_rcp_f32_e32 v85, v85 | |
v_rcp_f32_e32 v108, v108 | |
v_cndmask_b32_e64 v90, v90, v92, s[8:9] | |
v_lshlrev_b32_e32 v92, 23, v95 | |
v_mul_f32_e32 v85, v85, v90 | |
v_mul_f32_e32 v90, v105, v104 | |
v_mul_f32_e32 v90, v108, v90 | |
v_mul_f32_e32 v90, v90, v112 | |
v_mad_f32 v90, -v100, v103, v90 | |
v_subrev_f32_e32 v90, v102, v90 | |
v_sub_f32_e32 v90, 1.0, v90 | |
v_add_i32_e32 v90, vcc, v90, v92 | |
v_mov_b32_e32 v92, 0xc2aeac4f | |
v_cmp_nlt_f32_e32 vcc, v94, v92 | |
v_mov_b32_e32 v95, 0x42b17218 | |
v_cndmask_b32_e32 v90, 0, v90, vcc | |
v_cmp_lt_f32_e32 vcc, v94, v95 | |
v_mov_b32_e32 v100, 0x7f800000 | |
v_cndmask_b32_e32 v90, v100, v90, vcc | |
v_cmp_u_f32_e32 vcc, v94, v94 | |
v_cndmask_b32_e32 v90, v90, v94, vcc | |
v_subrev_f32_e32 v94, v84, v93 | |
v_mul_f32_e32 v102, v85, v99 | |
v_add_f32_e32 v93, v84, v93 | |
v_mad_f32 v93, v93, v94, v102 | |
v_cmp_gt_f32_e32 vcc, 0, v93 | |
v_cndmask_b32_e64 v94, 0.5, -0.5, vcc | |
v_mac_f32_e32 v94, v97, v93 | |
v_cvt_i32_f32_e32 v94, v94 | |
v_madak_f32_e32 v85, v99, v85, 0x3f58560b | |
v_mad_f32 v82, -v96, v81, v82 | |
v_mad_f32 v30, v81, v96, v30 | |
v_cvt_f32_i32_e32 v97, v94 | |
v_lshlrev_b32_e32 v94, 23, v94 | |
v_mad_f32 v79, -v96, v80, v79 | |
v_mad_f32 v29, v80, v96, v29 | |
v_mad_f32 v101, v101, v97, v93 | |
v_mad_f32 v104, v103, v97, v101 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mac_f32_e32 v106, v107, v105 | |
v_mac_f32_e32 v109, v106, v105 | |
v_mac_f32_e32 v110, v109, v105 | |
v_mac_f32_e32 v111, v110, v105 | |
v_mad_f32 v105, -v105, v111, v104 | |
v_mul_f32_e32 v104, v105, v104 | |
v_sub_f32_e32 v105, 2.0, v105 | |
v_cmp_gt_f32_e64 vcc, |v105|, v91 | |
v_cndmask_b32_e32 v106, 1.0, v98, vcc | |
v_mul_f32_e64 v105, v105, -v106 | |
v_rcp_f32_e32 v105, v105 | |
v_mad_f32 v77, -v96, v78, v77 | |
v_mad_f32 v28, v78, v96, v28 | |
v_mad_f32 v86, -v67, v0, v86 | |
v_mul_f32_e32 v104, v105, v104 | |
v_mul_f32_e32 v104, v104, v106 | |
v_mad_f32 v97, -v97, v103, v104 | |
v_subrev_f32_e32 v97, v101, v97 | |
v_sub_f32_e32 v97, 1.0, v97 | |
v_add_i32_e32 v94, vcc, v97, v94 | |
v_cmp_nlt_f32_e32 vcc, v93, v92 | |
v_cndmask_b32_e32 v92, 0, v94, vcc | |
v_cmp_lt_f32_e32 vcc, v93, v95 | |
v_cndmask_b32_e32 v92, v100, v92, vcc | |
v_cmp_gt_f32_e64 vcc, |v84|, v91 | |
v_cndmask_b32_e32 v91, 1.0, v98, vcc | |
v_mul_f32_e32 v94, v91, v84 | |
v_rcp_f32_e32 v94, v94 | |
v_cmp_u_f32_e32 vcc, v93, v93 | |
v_cndmask_b32_e32 v92, v92, v93, vcc | |
v_mul_f32_e32 v90, v92, v90 | |
v_mov_b32_e32 v92, 0x40c00000 | |
v_cmp_gt_f32_e32 vcc, v92, v84 | |
v_mov_b32_e32 v92, 0x31800000 | |
v_mul_f32_e32 v90, v94, v90 | |
v_cmp_gt_f32_e64 s[10:11], v92, v84 | |
v_mad_f32 v84, -v91, v90, 1.0 | |
v_cndmask_b32_e32 v84, 1.0, v84, vcc | |
v_cndmask_b32_e64 v84, v84, v85, s[4:5] | |
v_and_b32_e32 v85, s51, v83 | |
v_or_b32_e32 v84, v85, v84 | |
v_mad_f32 v85, v102, v83, v83 | |
v_cndmask_b32_e64 v84, v84, v85, s[8:9] | |
v_mul_f32_e32 v85, 0x3f8375d4, v83 | |
v_mac_f32_e32 v85, 0x41000000, v83 | |
v_mul_f32_e32 v85, 0x3e000000, v85 | |
v_cndmask_b32_e64 v84, v84, v85, s[10:11] | |
v_cmp_u_f32_e32 vcc, v83, v83 | |
v_cndmask_b32_e32 v83, v84, v83, vcc | |
v_subrev_f32_e32 v83, v83, v88 | |
v_mul_f32_e64 v84, s19, -v88 | |
v_mac_f32_e32 v84, v83, v89 | |
v_mac_f32_e32 v5, v84, v87 | |
v_mac_f32_e32 v31, v0, v67 | |
BB6_150: ; %Flow1217 | |
; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[34:35] | |
BB6_151: ; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[12:13] | |
v_lshrrev_b32_e32 v67, 28, v69 | |
v_and_b32_e32 v67, 1, v67 | |
v_cmp_eq_u32_e32 vcc, 1, v67 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[12:13], exec, s[4:5] | |
; mask branch BB6_155 | |
s_cbranch_execz BB6_155 | |
BB6_152: ; in Loop: Header=BB6_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[87:90], v55 offset0:64 offset1:65 | |
s_and_b64 s[4:5], exec, s[0:1] | |
v_cmp_ne_u32_e32 vcc, v61, v76 | |
s_xor_b64 s[4:5], s[4:5], -1 | |
s_or_b64 s[4:5], s[4:5], vcc | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v80, v73, v88 | |
s_and_b64 s[8:9], exec, s[2:3] | |
s_or_b64 s[4:5], s[8:9], s[4:5] | |
v_subrev_f32_e32 v78, v72, v87 | |
v_mul_f32_e32 v83, v80, v80 | |
v_cndmask_b32_e64 v84, 0, 1.0, s[4:5] | |
v_subrev_f32_e32 v81, v74, v89 | |
v_mac_f32_e32 v83, v78, v78 | |
v_mac_f32_e32 v83, v81, v81 | |
v_mul_f32_e32 v84, s22, v84 | |
v_subrev_f32_e32 v67, v75, v90 | |
v_cmp_lt_f32_e32 vcc, v83, v84 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[34:35], exec, s[4:5] | |
; mask branch BB6_154 | |
s_cbranch_execz BB6_154 | |
BB6_153: ; in Loop: Header=BB6_11 Depth=1 | |
v_max_f32_e32 v85, 0x34cd15ae, v83 | |
v_rsq_f32_e32 v89, v85 | |
v_lshrrev_b32_e32 v83, 28, v65 | |
v_and_b32_e32 v83, 1, v83 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e32 vcc, 1, v83 | |
ds_read_b64 v[83:84], v56 offset:256 | |
v_mul_f32_e32 v93, v89, v89 | |
v_cndmask_b32_e64 v88, 0, 1.0, vcc | |
v_mul_f32_e32 v94, v93, v93 | |
v_mul_f32_e32 v94, v88, v94 | |
v_mul_f32_e32 v95, v93, v94 | |
v_mad_f32 v94, v94, v93, s23 | |
v_mul_f32_e32 v87, v75, v90 | |
v_mul_f32_e32 v90, v9, v85 | |
v_mul_f32_e32 v91, v90, v90 | |
v_mov_b32_e32 v92, 0x3a92b707 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v83, v70, v83 | |
v_mul_f32_e32 v84, v71, v84 | |
v_mul_f32_e32 v94, 0xbe2aaaab, v94 | |
v_madak_f32_e32 v92, v92, v91, 0x3ded3cb2 | |
v_mad_f32 v96, v84, v95, -v83 | |
v_mul_f32_e32 v97, v93, v95 | |
v_mad_f32 v95, v95, v95, s26 | |
v_mul_f32_e32 v83, v83, v94 | |
v_mov_b32_e32 v94, 0x3c739487 | |
v_mul_f32_e32 v84, v95, v84 | |
v_madak_f32_e32 v94, v94, v91, 0x3f01e2bc | |
v_mad_f32 v92, v92, v91, 1.0 | |
v_mac_f32_e32 v83, 0x3daaaaaa, v84 | |
v_mac_f32_e32 v92, v90, v94 | |
v_mov_b32_e32 v94, 0xb2951928 | |
v_mac_f32_e32 v8, v88, v83 | |
v_rcp_f32_e32 v83, v92 | |
v_madak_f32_e32 v94, v94, v91, 0xb85ffb93 | |
v_mov_b32_e32 v95, 0x35c55945 | |
v_madak_f32_e32 v95, v95, v91, 0x3a83ca0c | |
v_madak_f32_e32 v94, v94, v91, 0xbc9ded90 | |
v_madak_f32_e32 v95, v95, v91, 0x3d8eaf3b | |
v_madak_f32_e32 v91, v94, v91, 0xbf409397 | |
v_mac_f32_e32 v91, v90, v95 | |
v_mul_f32_e32 v83, v37, v83 | |
v_mul_f32_e32 v84, v88, v93 | |
v_mul_f32_e32 v83, v91, v83 | |
v_mul_f32_e32 v96, v96, v97 | |
v_mac_f32_e32 v83, v89, v84 | |
v_mac_f32_e32 v96, v83, v87 | |
v_mul_f32_e32 v83, s18, v85 | |
v_mul_f32_e32 v83, v89, v83 | |
v_and_b32_e32 v84, s27, v83 | |
v_mov_b32_e32 v85, 0x3fa00000 | |
v_cmp_gt_f32_e64 s[4:5], v85, v84 | |
v_mul_f32_e32 v85, v84, v84 | |
v_rcp_f32_e32 v90, v85 | |
v_add_f32_e32 v91, -1.0, v84 | |
v_mov_b32_e32 v93, 0xbd777f97 | |
v_mov_b32_e32 v94, 0x4036db6e | |
v_cndmask_b32_e64 v90, v90, v91, s[4:5] | |
v_mov_b32_e32 v91, 0x3f580000 | |
v_cmp_gt_f32_e64 s[8:9], v91, v84 | |
v_cndmask_b32_e64 v85, v90, v85, s[8:9] | |
v_mov_b32_e32 v91, 0xc1b38712 | |
v_madak_f32_e32 v93, v93, v85, 0x40d23f7c | |
v_madak_f32_e32 v91, v91, v85, 0x43ed43a7 | |
v_madak_f32_e32 v93, v85, v93, 0x42d9451f | |
v_madak_f32_e32 v91, v85, v91, 0x451f90ce | |
v_madak_f32_e32 v93, v85, v93, 0x43d6810b | |
v_madak_f32_e32 v91, v85, v91, 0x4547fdbb | |
v_madak_f32_e32 v93, v85, v93, 0x442158c9 | |
v_madak_f32_e32 v91, v85, v91, 0x44c01759 | |
v_madak_f32_e32 v93, v85, v93, 0x43d9486f | |
v_madak_f32_e32 v91, v85, v91, 0x43a2e571 | |
v_madak_f32_e32 v93, v85, v93, 0x4309a863 | |
v_cmp_gt_f32_e32 vcc, v94, v84 | |
v_madak_f32_e32 v91, v85, v91, 0x41f2b459 | |
v_madak_f32_e32 v93, v85, v93, 0x419d35ce | |
v_cndmask_b32_e32 v91, v91, v93, vcc | |
v_mov_b32_e32 v93, 0x3c445aa3 | |
v_madak_f32_e32 v93, v93, v85, 0x3c5f6e13 | |
v_madak_f32_e32 v93, v85, v93, 0x3e013307 | |
v_madak_f32_e32 v93, v85, v93, 0x3d931ae7 | |
v_madak_f32_e32 v93, v85, v93, 0x3f0a5785 | |
v_mov_b32_e32 v92, 0xc11d077e | |
v_madak_f32_e32 v93, v85, v93, 0x3dd9f331 | |
v_mov_b32_e32 v90, 0xc3f1c275 | |
v_madak_f32_e32 v92, v92, v85, 0xc2a2932b | |
v_cndmask_b32_e64 v91, v91, v93, s[4:5] | |
v_mov_b32_e32 v93, 0xb684e21a | |
v_madak_f32_e32 v90, v90, v85, 0xc480230b | |
v_madak_f32_e32 v92, v85, v92, 0xc3389ae7 | |
v_madak_f32_e32 v93, v93, v85, 0x390aee49 | |
v_madak_f32_e32 v90, v85, v90, 0xc41f6441 | |
v_madak_f32_e32 v92, v85, v92, 0xc322658c | |
v_madak_f32_e32 v93, v85, v93, 0x3ba68116 | |
v_madak_f32_e32 v90, v85, v90, 0xc320a2ea | |
v_madak_f32_e32 v92, v85, v92, 0xc2798057 | |
v_madak_f32_e32 v93, v85, v93, 0x3d852a63 | |
v_madak_f32_e32 v90, v85, v90, 0xc18e104b | |
v_madak_f32_e32 v92, v85, v92, 0xc128f022 | |
v_madak_f32_e32 v93, v85, v93, 0x3ecbbbce | |
v_madak_f32_e32 v90, v85, v90, 0xbf4c9dd4 | |
v_madak_f32_e32 v92, v85, v92, 0xbf31a0b7 | |
v_cndmask_b32_e64 v91, v91, v93, s[8:9] | |
v_and_b32_e32 v93, s50, v83 | |
v_mov_b32_e32 v94, 0xbf100000 | |
v_madak_f32_e32 v90, v85, v90, 0xbc21a092 | |
v_madak_f32_e32 v92, v85, v92, 0xbc21a093 | |
v_mad_f32 v94, v93, -v93, v94 | |
v_cndmask_b32_e32 v90, v90, v92, vcc | |
v_cmp_gt_f32_e32 vcc, 0, v94 | |
v_cndmask_b32_e64 v95, 0.5, -0.5, vcc | |
v_mov_b32_e32 v97, 0x3fb8aa3b | |
v_mac_f32_e32 v95, v97, v94 | |
v_cvt_i32_f32_e32 v95, v95 | |
v_mov_b32_e32 v92, 0xbb0df9c0 | |
v_madak_f32_e32 v92, v92, v85, 0x3d1151b3 | |
v_madak_f32_e32 v92, v85, v92, 0xbde31cc2 | |
v_cvt_f32_i32_e32 v100, v95 | |
v_madak_f32_e32 v92, v85, v92, 0x3ea2fe54 | |
v_mov_b32_e32 v101, 0xbf317180 | |
v_madak_f32_e32 v92, v85, v92, 0xbebe9208 | |
v_mad_f32 v102, v101, v100, v94 | |
v_mov_b32_e32 v103, 0xb717f7d1 | |
v_madak_f32_e32 v92, v85, v92, 0x3ed46805 | |
v_mad_f32 v104, v103, v100, v102 | |
v_madak_f32_e32 v92, v85, v92, 0xbb1acdc6 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mov_b32_e32 v106, 0xb5ddea0e | |
v_mov_b32_e32 v107, 0x3331bb4c | |
v_cndmask_b32_e64 v90, v90, v92, s[4:5] | |
v_mov_b32_e32 v92, 0xb7c756b1 | |
v_mad_f32 v108, v107, v105, v106 | |
v_mov_b32_e32 v109, 0x388ab355 | |
v_madak_f32_e32 v92, v92, v85, 0xbbbd1489 | |
v_mad_f32 v108, v108, v105, v109 | |
v_mov_b32_e32 v110, 0xbb360b61 | |
v_madak_f32_e32 v92, v85, v92, 0xbce9528f | |
v_mad_f32 v108, v108, v105, v110 | |
v_mov_b32_e32 v111, 0x3e2aaaab | |
v_madak_f32_e32 v92, v85, v92, 0xbea66beb | |
v_mad_f32 v108, v108, v105, v111 | |
v_madak_f32_e32 v92, v85, v92, 0x3e0375d4 | |
v_mad_f32 v105, -v105, v108, v104 | |
v_mad_f32 v85, v85, v91, 1.0 | |
v_mov_b32_e32 v91, 0x6f800000 | |
v_cmp_gt_f32_e64 vcc, |v85|, v91 | |
v_mov_b32_e32 v98, 0x2f800000 | |
v_sub_f32_e32 v108, 2.0, v105 | |
v_cndmask_b32_e32 v99, 1.0, v98, vcc | |
v_cmp_gt_f32_e64 vcc, |v108|, v91 | |
v_mul_f32_e32 v85, v99, v85 | |
v_cndmask_b32_e32 v112, 1.0, v98, vcc | |
v_mul_f32_e64 v108, v108, -v112 | |
v_rcp_f32_e32 v85, v85 | |
v_rcp_f32_e32 v108, v108 | |
v_cndmask_b32_e64 v90, v90, v92, s[8:9] | |
v_lshlrev_b32_e32 v92, 23, v95 | |
v_mul_f32_e32 v85, v85, v90 | |
v_mul_f32_e32 v90, v105, v104 | |
v_mul_f32_e32 v90, v108, v90 | |
v_mul_f32_e32 v90, v90, v112 | |
v_mad_f32 v90, -v100, v103, v90 | |
v_subrev_f32_e32 v90, v102, v90 | |
v_sub_f32_e32 v90, 1.0, v90 | |
v_add_i32_e32 v90, vcc, v90, v92 | |
v_mov_b32_e32 v92, 0xc2aeac4f | |
v_cmp_nlt_f32_e32 vcc, v94, v92 | |
v_mov_b32_e32 v95, 0x42b17218 | |
v_cndmask_b32_e32 v90, 0, v90, vcc | |
v_cmp_lt_f32_e32 vcc, v94, v95 | |
v_mov_b32_e32 v100, 0x7f800000 | |
v_cndmask_b32_e32 v90, v100, v90, vcc | |
v_cmp_u_f32_e32 vcc, v94, v94 | |
v_cndmask_b32_e32 v90, v90, v94, vcc | |
v_subrev_f32_e32 v94, v84, v93 | |
v_mul_f32_e32 v102, v85, v99 | |
v_add_f32_e32 v93, v84, v93 | |
v_mad_f32 v93, v93, v94, v102 | |
v_cmp_gt_f32_e32 vcc, 0, v93 | |
v_cndmask_b32_e64 v94, 0.5, -0.5, vcc | |
v_mac_f32_e32 v94, v97, v93 | |
v_cvt_i32_f32_e32 v94, v94 | |
v_madak_f32_e32 v85, v99, v85, 0x3f58560b | |
v_mad_f32 v82, -v96, v81, v82 | |
v_mad_f32 v26, v81, v96, v26 | |
v_cvt_f32_i32_e32 v97, v94 | |
v_lshlrev_b32_e32 v94, 23, v94 | |
v_mad_f32 v79, -v96, v80, v79 | |
v_mad_f32 v25, v80, v96, v25 | |
v_mad_f32 v101, v101, v97, v93 | |
v_mad_f32 v104, v103, v97, v101 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mac_f32_e32 v106, v107, v105 | |
v_mac_f32_e32 v109, v106, v105 | |
v_mac_f32_e32 v110, v109, v105 | |
v_mac_f32_e32 v111, v110, v105 | |
v_mad_f32 v105, -v105, v111, v104 | |
v_mul_f32_e32 v104, v105, v104 | |
v_sub_f32_e32 v105, 2.0, v105 | |
v_cmp_gt_f32_e64 vcc, |v105|, v91 | |
v_cndmask_b32_e32 v106, 1.0, v98, vcc | |
v_mul_f32_e64 v105, v105, -v106 | |
v_rcp_f32_e32 v105, v105 | |
v_mad_f32 v77, -v96, v78, v77 | |
v_mad_f32 v24, v78, v96, v24 | |
v_mad_f32 v86, -v67, v0, v86 | |
v_mul_f32_e32 v104, v105, v104 | |
v_mul_f32_e32 v104, v104, v106 | |
v_mad_f32 v97, -v97, v103, v104 | |
v_subrev_f32_e32 v97, v101, v97 | |
v_sub_f32_e32 v97, 1.0, v97 | |
v_add_i32_e32 v94, vcc, v97, v94 | |
v_cmp_nlt_f32_e32 vcc, v93, v92 | |
v_cndmask_b32_e32 v92, 0, v94, vcc | |
v_cmp_lt_f32_e32 vcc, v93, v95 | |
v_cndmask_b32_e32 v92, v100, v92, vcc | |
v_cmp_gt_f32_e64 vcc, |v84|, v91 | |
v_cndmask_b32_e32 v91, 1.0, v98, vcc | |
v_mul_f32_e32 v94, v91, v84 | |
v_rcp_f32_e32 v94, v94 | |
v_cmp_u_f32_e32 vcc, v93, v93 | |
v_cndmask_b32_e32 v92, v92, v93, vcc | |
v_mul_f32_e32 v90, v92, v90 | |
v_mov_b32_e32 v92, 0x40c00000 | |
v_cmp_gt_f32_e32 vcc, v92, v84 | |
v_mov_b32_e32 v92, 0x31800000 | |
v_mul_f32_e32 v90, v94, v90 | |
v_cmp_gt_f32_e64 s[10:11], v92, v84 | |
v_mad_f32 v84, -v91, v90, 1.0 | |
v_cndmask_b32_e32 v84, 1.0, v84, vcc | |
v_cndmask_b32_e64 v84, v84, v85, s[4:5] | |
v_and_b32_e32 v85, s51, v83 | |
v_or_b32_e32 v84, v85, v84 | |
v_mad_f32 v85, v102, v83, v83 | |
v_cndmask_b32_e64 v84, v84, v85, s[8:9] | |
v_mul_f32_e32 v85, 0x3f8375d4, v83 | |
v_mac_f32_e32 v85, 0x41000000, v83 | |
v_mul_f32_e32 v85, 0x3e000000, v85 | |
v_cndmask_b32_e64 v84, v84, v85, s[10:11] | |
v_cmp_u_f32_e32 vcc, v83, v83 | |
v_cndmask_b32_e32 v83, v84, v83, vcc | |
v_subrev_f32_e32 v83, v83, v88 | |
v_mul_f32_e64 v84, s19, -v88 | |
v_mac_f32_e32 v84, v83, v89 | |
v_mac_f32_e32 v5, v84, v87 | |
v_mac_f32_e32 v27, v0, v67 | |
BB6_154: ; %Flow1216 | |
; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[34:35] | |
BB6_155: ; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[12:13] | |
v_lshrrev_b32_e32 v67, 29, v69 | |
v_and_b32_e32 v67, 1, v67 | |
v_cmp_eq_u32_e32 vcc, 1, v67 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[12:13], exec, s[4:5] | |
; mask branch BB6_159 | |
s_cbranch_execz BB6_159 | |
BB6_156: ; in Loop: Header=BB6_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[87:90], v55 offset0:80 offset1:81 | |
s_and_b64 s[4:5], exec, s[0:1] | |
v_cmp_ne_u32_e32 vcc, v60, v76 | |
s_xor_b64 s[4:5], s[4:5], -1 | |
s_or_b64 s[4:5], s[4:5], vcc | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v80, v73, v88 | |
s_and_b64 s[8:9], exec, s[2:3] | |
s_or_b64 s[4:5], s[8:9], s[4:5] | |
v_subrev_f32_e32 v78, v72, v87 | |
v_mul_f32_e32 v83, v80, v80 | |
v_cndmask_b32_e64 v84, 0, 1.0, s[4:5] | |
v_subrev_f32_e32 v81, v74, v89 | |
v_mac_f32_e32 v83, v78, v78 | |
v_mac_f32_e32 v83, v81, v81 | |
v_mul_f32_e32 v84, s22, v84 | |
v_subrev_f32_e32 v67, v75, v90 | |
v_cmp_lt_f32_e32 vcc, v83, v84 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[34:35], exec, s[4:5] | |
; mask branch BB6_158 | |
s_cbranch_execz BB6_158 | |
BB6_157: ; in Loop: Header=BB6_11 Depth=1 | |
v_max_f32_e32 v85, 0x34cd15ae, v83 | |
v_rsq_f32_e32 v89, v85 | |
v_lshrrev_b32_e32 v83, 29, v65 | |
v_and_b32_e32 v83, 1, v83 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e32 vcc, 1, v83 | |
ds_read_b64 v[83:84], v56 offset:320 | |
v_mul_f32_e32 v93, v89, v89 | |
v_cndmask_b32_e64 v88, 0, 1.0, vcc | |
v_mul_f32_e32 v94, v93, v93 | |
v_mul_f32_e32 v94, v88, v94 | |
v_mul_f32_e32 v95, v93, v94 | |
v_mad_f32 v94, v94, v93, s23 | |
v_mul_f32_e32 v87, v75, v90 | |
v_mul_f32_e32 v90, v9, v85 | |
v_mul_f32_e32 v91, v90, v90 | |
v_mov_b32_e32 v92, 0x3a92b707 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v83, v70, v83 | |
v_mul_f32_e32 v84, v71, v84 | |
v_mul_f32_e32 v94, 0xbe2aaaab, v94 | |
v_madak_f32_e32 v92, v92, v91, 0x3ded3cb2 | |
v_mad_f32 v96, v84, v95, -v83 | |
v_mul_f32_e32 v97, v93, v95 | |
v_mad_f32 v95, v95, v95, s26 | |
v_mul_f32_e32 v83, v83, v94 | |
v_mov_b32_e32 v94, 0x3c739487 | |
v_mul_f32_e32 v84, v95, v84 | |
v_madak_f32_e32 v94, v94, v91, 0x3f01e2bc | |
v_mad_f32 v92, v92, v91, 1.0 | |
v_mac_f32_e32 v83, 0x3daaaaaa, v84 | |
v_mac_f32_e32 v92, v90, v94 | |
v_mov_b32_e32 v94, 0xb2951928 | |
v_mac_f32_e32 v8, v88, v83 | |
v_rcp_f32_e32 v83, v92 | |
v_madak_f32_e32 v94, v94, v91, 0xb85ffb93 | |
v_mov_b32_e32 v95, 0x35c55945 | |
v_madak_f32_e32 v95, v95, v91, 0x3a83ca0c | |
v_madak_f32_e32 v94, v94, v91, 0xbc9ded90 | |
v_madak_f32_e32 v95, v95, v91, 0x3d8eaf3b | |
v_madak_f32_e32 v91, v94, v91, 0xbf409397 | |
v_mac_f32_e32 v91, v90, v95 | |
v_mul_f32_e32 v83, v37, v83 | |
v_mul_f32_e32 v84, v88, v93 | |
v_mul_f32_e32 v83, v91, v83 | |
v_mul_f32_e32 v96, v96, v97 | |
v_mac_f32_e32 v83, v89, v84 | |
v_mac_f32_e32 v96, v83, v87 | |
v_mul_f32_e32 v83, s18, v85 | |
v_mul_f32_e32 v83, v89, v83 | |
v_and_b32_e32 v84, s27, v83 | |
v_mov_b32_e32 v85, 0x3fa00000 | |
v_cmp_gt_f32_e64 s[4:5], v85, v84 | |
v_mul_f32_e32 v85, v84, v84 | |
v_rcp_f32_e32 v90, v85 | |
v_add_f32_e32 v91, -1.0, v84 | |
v_mov_b32_e32 v93, 0xbd777f97 | |
v_mov_b32_e32 v94, 0x4036db6e | |
v_cndmask_b32_e64 v90, v90, v91, s[4:5] | |
v_mov_b32_e32 v91, 0x3f580000 | |
v_cmp_gt_f32_e64 s[8:9], v91, v84 | |
v_cndmask_b32_e64 v85, v90, v85, s[8:9] | |
v_mov_b32_e32 v91, 0xc1b38712 | |
v_madak_f32_e32 v93, v93, v85, 0x40d23f7c | |
v_madak_f32_e32 v91, v91, v85, 0x43ed43a7 | |
v_madak_f32_e32 v93, v85, v93, 0x42d9451f | |
v_madak_f32_e32 v91, v85, v91, 0x451f90ce | |
v_madak_f32_e32 v93, v85, v93, 0x43d6810b | |
v_madak_f32_e32 v91, v85, v91, 0x4547fdbb | |
v_madak_f32_e32 v93, v85, v93, 0x442158c9 | |
v_madak_f32_e32 v91, v85, v91, 0x44c01759 | |
v_madak_f32_e32 v93, v85, v93, 0x43d9486f | |
v_madak_f32_e32 v91, v85, v91, 0x43a2e571 | |
v_madak_f32_e32 v93, v85, v93, 0x4309a863 | |
v_cmp_gt_f32_e32 vcc, v94, v84 | |
v_madak_f32_e32 v91, v85, v91, 0x41f2b459 | |
v_madak_f32_e32 v93, v85, v93, 0x419d35ce | |
v_cndmask_b32_e32 v91, v91, v93, vcc | |
v_mov_b32_e32 v93, 0x3c445aa3 | |
v_madak_f32_e32 v93, v93, v85, 0x3c5f6e13 | |
v_madak_f32_e32 v93, v85, v93, 0x3e013307 | |
v_madak_f32_e32 v93, v85, v93, 0x3d931ae7 | |
v_madak_f32_e32 v93, v85, v93, 0x3f0a5785 | |
v_mov_b32_e32 v92, 0xc11d077e | |
v_madak_f32_e32 v93, v85, v93, 0x3dd9f331 | |
v_mov_b32_e32 v90, 0xc3f1c275 | |
v_madak_f32_e32 v92, v92, v85, 0xc2a2932b | |
v_cndmask_b32_e64 v91, v91, v93, s[4:5] | |
v_mov_b32_e32 v93, 0xb684e21a | |
v_madak_f32_e32 v90, v90, v85, 0xc480230b | |
v_madak_f32_e32 v92, v85, v92, 0xc3389ae7 | |
v_madak_f32_e32 v93, v93, v85, 0x390aee49 | |
v_madak_f32_e32 v90, v85, v90, 0xc41f6441 | |
v_madak_f32_e32 v92, v85, v92, 0xc322658c | |
v_madak_f32_e32 v93, v85, v93, 0x3ba68116 | |
v_madak_f32_e32 v90, v85, v90, 0xc320a2ea | |
v_madak_f32_e32 v92, v85, v92, 0xc2798057 | |
v_madak_f32_e32 v93, v85, v93, 0x3d852a63 | |
v_madak_f32_e32 v90, v85, v90, 0xc18e104b | |
v_madak_f32_e32 v92, v85, v92, 0xc128f022 | |
v_madak_f32_e32 v93, v85, v93, 0x3ecbbbce | |
v_madak_f32_e32 v90, v85, v90, 0xbf4c9dd4 | |
v_madak_f32_e32 v92, v85, v92, 0xbf31a0b7 | |
v_cndmask_b32_e64 v91, v91, v93, s[8:9] | |
v_and_b32_e32 v93, s50, v83 | |
v_mov_b32_e32 v94, 0xbf100000 | |
v_madak_f32_e32 v90, v85, v90, 0xbc21a092 | |
v_madak_f32_e32 v92, v85, v92, 0xbc21a093 | |
v_mad_f32 v94, v93, -v93, v94 | |
v_cndmask_b32_e32 v90, v90, v92, vcc | |
v_cmp_gt_f32_e32 vcc, 0, v94 | |
v_cndmask_b32_e64 v95, 0.5, -0.5, vcc | |
v_mov_b32_e32 v97, 0x3fb8aa3b | |
v_mac_f32_e32 v95, v97, v94 | |
v_cvt_i32_f32_e32 v95, v95 | |
v_mov_b32_e32 v92, 0xbb0df9c0 | |
v_madak_f32_e32 v92, v92, v85, 0x3d1151b3 | |
v_madak_f32_e32 v92, v85, v92, 0xbde31cc2 | |
v_cvt_f32_i32_e32 v100, v95 | |
v_madak_f32_e32 v92, v85, v92, 0x3ea2fe54 | |
v_mov_b32_e32 v101, 0xbf317180 | |
v_madak_f32_e32 v92, v85, v92, 0xbebe9208 | |
v_mad_f32 v102, v101, v100, v94 | |
v_mov_b32_e32 v103, 0xb717f7d1 | |
v_madak_f32_e32 v92, v85, v92, 0x3ed46805 | |
v_mad_f32 v104, v103, v100, v102 | |
v_madak_f32_e32 v92, v85, v92, 0xbb1acdc6 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mov_b32_e32 v106, 0xb5ddea0e | |
v_mov_b32_e32 v107, 0x3331bb4c | |
v_cndmask_b32_e64 v90, v90, v92, s[4:5] | |
v_mov_b32_e32 v92, 0xb7c756b1 | |
v_mad_f32 v108, v107, v105, v106 | |
v_mov_b32_e32 v109, 0x388ab355 | |
v_madak_f32_e32 v92, v92, v85, 0xbbbd1489 | |
v_mad_f32 v108, v108, v105, v109 | |
v_mov_b32_e32 v110, 0xbb360b61 | |
v_madak_f32_e32 v92, v85, v92, 0xbce9528f | |
v_mad_f32 v108, v108, v105, v110 | |
v_mov_b32_e32 v111, 0x3e2aaaab | |
v_madak_f32_e32 v92, v85, v92, 0xbea66beb | |
v_mad_f32 v108, v108, v105, v111 | |
v_madak_f32_e32 v92, v85, v92, 0x3e0375d4 | |
v_mad_f32 v105, -v105, v108, v104 | |
v_mad_f32 v85, v85, v91, 1.0 | |
v_mov_b32_e32 v91, 0x6f800000 | |
v_cmp_gt_f32_e64 vcc, |v85|, v91 | |
v_mov_b32_e32 v98, 0x2f800000 | |
v_sub_f32_e32 v108, 2.0, v105 | |
v_cndmask_b32_e32 v99, 1.0, v98, vcc | |
v_cmp_gt_f32_e64 vcc, |v108|, v91 | |
v_mul_f32_e32 v85, v99, v85 | |
v_cndmask_b32_e32 v112, 1.0, v98, vcc | |
v_mul_f32_e64 v108, v108, -v112 | |
v_rcp_f32_e32 v85, v85 | |
v_rcp_f32_e32 v108, v108 | |
v_cndmask_b32_e64 v90, v90, v92, s[8:9] | |
v_lshlrev_b32_e32 v92, 23, v95 | |
v_mul_f32_e32 v85, v85, v90 | |
v_mul_f32_e32 v90, v105, v104 | |
v_mul_f32_e32 v90, v108, v90 | |
v_mul_f32_e32 v90, v90, v112 | |
v_mad_f32 v90, -v100, v103, v90 | |
v_subrev_f32_e32 v90, v102, v90 | |
v_sub_f32_e32 v90, 1.0, v90 | |
v_add_i32_e32 v90, vcc, v90, v92 | |
v_mov_b32_e32 v92, 0xc2aeac4f | |
v_cmp_nlt_f32_e32 vcc, v94, v92 | |
v_mov_b32_e32 v95, 0x42b17218 | |
v_cndmask_b32_e32 v90, 0, v90, vcc | |
v_cmp_lt_f32_e32 vcc, v94, v95 | |
v_mov_b32_e32 v100, 0x7f800000 | |
v_cndmask_b32_e32 v90, v100, v90, vcc | |
v_cmp_u_f32_e32 vcc, v94, v94 | |
v_cndmask_b32_e32 v90, v90, v94, vcc | |
v_subrev_f32_e32 v94, v84, v93 | |
v_mul_f32_e32 v102, v85, v99 | |
v_add_f32_e32 v93, v84, v93 | |
v_mad_f32 v93, v93, v94, v102 | |
v_cmp_gt_f32_e32 vcc, 0, v93 | |
v_cndmask_b32_e64 v94, 0.5, -0.5, vcc | |
v_mac_f32_e32 v94, v97, v93 | |
v_cvt_i32_f32_e32 v94, v94 | |
v_madak_f32_e32 v85, v99, v85, 0x3f58560b | |
v_mad_f32 v82, -v96, v81, v82 | |
v_mad_f32 v22, v81, v96, v22 | |
v_cvt_f32_i32_e32 v97, v94 | |
v_lshlrev_b32_e32 v94, 23, v94 | |
v_mad_f32 v79, -v96, v80, v79 | |
v_mad_f32 v21, v80, v96, v21 | |
v_mad_f32 v101, v101, v97, v93 | |
v_mad_f32 v104, v103, v97, v101 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mac_f32_e32 v106, v107, v105 | |
v_mac_f32_e32 v109, v106, v105 | |
v_mac_f32_e32 v110, v109, v105 | |
v_mac_f32_e32 v111, v110, v105 | |
v_mad_f32 v105, -v105, v111, v104 | |
v_mul_f32_e32 v104, v105, v104 | |
v_sub_f32_e32 v105, 2.0, v105 | |
v_cmp_gt_f32_e64 vcc, |v105|, v91 | |
v_cndmask_b32_e32 v106, 1.0, v98, vcc | |
v_mul_f32_e64 v105, v105, -v106 | |
v_rcp_f32_e32 v105, v105 | |
v_mad_f32 v77, -v96, v78, v77 | |
v_mad_f32 v20, v78, v96, v20 | |
v_mad_f32 v86, -v67, v0, v86 | |
v_mul_f32_e32 v104, v105, v104 | |
v_mul_f32_e32 v104, v104, v106 | |
v_mad_f32 v97, -v97, v103, v104 | |
v_subrev_f32_e32 v97, v101, v97 | |
v_sub_f32_e32 v97, 1.0, v97 | |
v_add_i32_e32 v94, vcc, v97, v94 | |
v_cmp_nlt_f32_e32 vcc, v93, v92 | |
v_cndmask_b32_e32 v92, 0, v94, vcc | |
v_cmp_lt_f32_e32 vcc, v93, v95 | |
v_cndmask_b32_e32 v92, v100, v92, vcc | |
v_cmp_gt_f32_e64 vcc, |v84|, v91 | |
v_cndmask_b32_e32 v91, 1.0, v98, vcc | |
v_mul_f32_e32 v94, v91, v84 | |
v_rcp_f32_e32 v94, v94 | |
v_cmp_u_f32_e32 vcc, v93, v93 | |
v_cndmask_b32_e32 v92, v92, v93, vcc | |
v_mul_f32_e32 v90, v92, v90 | |
v_mov_b32_e32 v92, 0x40c00000 | |
v_cmp_gt_f32_e32 vcc, v92, v84 | |
v_mov_b32_e32 v92, 0x31800000 | |
v_mul_f32_e32 v90, v94, v90 | |
v_cmp_gt_f32_e64 s[10:11], v92, v84 | |
v_mad_f32 v84, -v91, v90, 1.0 | |
v_cndmask_b32_e32 v84, 1.0, v84, vcc | |
v_cndmask_b32_e64 v84, v84, v85, s[4:5] | |
v_and_b32_e32 v85, s51, v83 | |
v_or_b32_e32 v84, v85, v84 | |
v_mad_f32 v85, v102, v83, v83 | |
v_cndmask_b32_e64 v84, v84, v85, s[8:9] | |
v_mul_f32_e32 v85, 0x3f8375d4, v83 | |
v_mac_f32_e32 v85, 0x41000000, v83 | |
v_mul_f32_e32 v85, 0x3e000000, v85 | |
v_cndmask_b32_e64 v84, v84, v85, s[10:11] | |
v_cmp_u_f32_e32 vcc, v83, v83 | |
v_cndmask_b32_e32 v83, v84, v83, vcc | |
v_subrev_f32_e32 v83, v83, v88 | |
v_mul_f32_e64 v84, s19, -v88 | |
v_mac_f32_e32 v84, v83, v89 | |
v_mac_f32_e32 v5, v84, v87 | |
v_mac_f32_e32 v23, v0, v67 | |
BB6_158: ; %Flow1215 | |
; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[34:35] | |
BB6_159: ; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[12:13] | |
v_lshrrev_b32_e32 v67, 30, v69 | |
v_and_b32_e32 v67, 1, v67 | |
v_cmp_eq_u32_e32 vcc, 1, v67 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[12:13], exec, s[4:5] | |
; mask branch BB6_163 | |
s_cbranch_execz BB6_163 | |
BB6_160: ; in Loop: Header=BB6_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[87:90], v55 offset0:96 offset1:97 | |
s_and_b64 s[4:5], exec, s[0:1] | |
v_cmp_ne_u32_e32 vcc, v59, v76 | |
s_xor_b64 s[4:5], s[4:5], -1 | |
s_or_b64 s[4:5], s[4:5], vcc | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v80, v73, v88 | |
s_and_b64 s[8:9], exec, s[2:3] | |
s_or_b64 s[4:5], s[8:9], s[4:5] | |
v_subrev_f32_e32 v78, v72, v87 | |
v_mul_f32_e32 v83, v80, v80 | |
v_cndmask_b32_e64 v84, 0, 1.0, s[4:5] | |
v_subrev_f32_e32 v81, v74, v89 | |
v_mac_f32_e32 v83, v78, v78 | |
v_mac_f32_e32 v83, v81, v81 | |
v_mul_f32_e32 v84, s22, v84 | |
v_subrev_f32_e32 v67, v75, v90 | |
v_cmp_lt_f32_e32 vcc, v83, v84 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[34:35], exec, s[4:5] | |
; mask branch BB6_162 | |
s_cbranch_execz BB6_162 | |
BB6_161: ; in Loop: Header=BB6_11 Depth=1 | |
v_max_f32_e32 v85, 0x34cd15ae, v83 | |
v_rsq_f32_e32 v89, v85 | |
v_lshrrev_b32_e32 v83, 30, v65 | |
v_and_b32_e32 v83, 1, v83 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e32 vcc, 1, v83 | |
ds_read_b64 v[83:84], v56 offset:384 | |
v_mul_f32_e32 v93, v89, v89 | |
v_cndmask_b32_e64 v88, 0, 1.0, vcc | |
v_mul_f32_e32 v94, v93, v93 | |
v_mul_f32_e32 v94, v88, v94 | |
v_mul_f32_e32 v95, v93, v94 | |
v_mad_f32 v94, v94, v93, s23 | |
v_mul_f32_e32 v87, v75, v90 | |
v_mul_f32_e32 v90, v9, v85 | |
v_mul_f32_e32 v91, v90, v90 | |
v_mov_b32_e32 v92, 0x3a92b707 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v83, v70, v83 | |
v_mul_f32_e32 v84, v71, v84 | |
v_mul_f32_e32 v94, 0xbe2aaaab, v94 | |
v_madak_f32_e32 v92, v92, v91, 0x3ded3cb2 | |
v_mad_f32 v96, v84, v95, -v83 | |
v_mul_f32_e32 v97, v93, v95 | |
v_mad_f32 v95, v95, v95, s26 | |
v_mul_f32_e32 v83, v83, v94 | |
v_mov_b32_e32 v94, 0x3c739487 | |
v_mul_f32_e32 v84, v95, v84 | |
v_madak_f32_e32 v94, v94, v91, 0x3f01e2bc | |
v_mad_f32 v92, v92, v91, 1.0 | |
v_mac_f32_e32 v83, 0x3daaaaaa, v84 | |
v_mac_f32_e32 v92, v90, v94 | |
v_mov_b32_e32 v94, 0xb2951928 | |
v_mac_f32_e32 v8, v88, v83 | |
v_rcp_f32_e32 v83, v92 | |
v_madak_f32_e32 v94, v94, v91, 0xb85ffb93 | |
v_mov_b32_e32 v95, 0x35c55945 | |
v_madak_f32_e32 v95, v95, v91, 0x3a83ca0c | |
v_madak_f32_e32 v94, v94, v91, 0xbc9ded90 | |
v_madak_f32_e32 v95, v95, v91, 0x3d8eaf3b | |
v_madak_f32_e32 v91, v94, v91, 0xbf409397 | |
v_mac_f32_e32 v91, v90, v95 | |
v_mul_f32_e32 v83, v37, v83 | |
v_mul_f32_e32 v84, v88, v93 | |
v_mul_f32_e32 v83, v91, v83 | |
v_mul_f32_e32 v96, v96, v97 | |
v_mac_f32_e32 v83, v89, v84 | |
v_mac_f32_e32 v96, v83, v87 | |
v_mul_f32_e32 v83, s18, v85 | |
v_mul_f32_e32 v83, v89, v83 | |
v_and_b32_e32 v84, s27, v83 | |
v_mov_b32_e32 v85, 0x3fa00000 | |
v_cmp_gt_f32_e64 s[4:5], v85, v84 | |
v_mul_f32_e32 v85, v84, v84 | |
v_rcp_f32_e32 v90, v85 | |
v_add_f32_e32 v91, -1.0, v84 | |
v_mov_b32_e32 v93, 0xbd777f97 | |
v_mov_b32_e32 v94, 0x4036db6e | |
v_cndmask_b32_e64 v90, v90, v91, s[4:5] | |
v_mov_b32_e32 v91, 0x3f580000 | |
v_cmp_gt_f32_e64 s[8:9], v91, v84 | |
v_cndmask_b32_e64 v85, v90, v85, s[8:9] | |
v_mov_b32_e32 v91, 0xc1b38712 | |
v_madak_f32_e32 v93, v93, v85, 0x40d23f7c | |
v_madak_f32_e32 v91, v91, v85, 0x43ed43a7 | |
v_madak_f32_e32 v93, v85, v93, 0x42d9451f | |
v_madak_f32_e32 v91, v85, v91, 0x451f90ce | |
v_madak_f32_e32 v93, v85, v93, 0x43d6810b | |
v_madak_f32_e32 v91, v85, v91, 0x4547fdbb | |
v_madak_f32_e32 v93, v85, v93, 0x442158c9 | |
v_madak_f32_e32 v91, v85, v91, 0x44c01759 | |
v_madak_f32_e32 v93, v85, v93, 0x43d9486f | |
v_madak_f32_e32 v91, v85, v91, 0x43a2e571 | |
v_madak_f32_e32 v93, v85, v93, 0x4309a863 | |
v_cmp_gt_f32_e32 vcc, v94, v84 | |
v_madak_f32_e32 v91, v85, v91, 0x41f2b459 | |
v_madak_f32_e32 v93, v85, v93, 0x419d35ce | |
v_cndmask_b32_e32 v91, v91, v93, vcc | |
v_mov_b32_e32 v93, 0x3c445aa3 | |
v_madak_f32_e32 v93, v93, v85, 0x3c5f6e13 | |
v_madak_f32_e32 v93, v85, v93, 0x3e013307 | |
v_madak_f32_e32 v93, v85, v93, 0x3d931ae7 | |
v_madak_f32_e32 v93, v85, v93, 0x3f0a5785 | |
v_mov_b32_e32 v92, 0xc11d077e | |
v_madak_f32_e32 v93, v85, v93, 0x3dd9f331 | |
v_mov_b32_e32 v90, 0xc3f1c275 | |
v_madak_f32_e32 v92, v92, v85, 0xc2a2932b | |
v_cndmask_b32_e64 v91, v91, v93, s[4:5] | |
v_mov_b32_e32 v93, 0xb684e21a | |
v_madak_f32_e32 v90, v90, v85, 0xc480230b | |
v_madak_f32_e32 v92, v85, v92, 0xc3389ae7 | |
v_madak_f32_e32 v93, v93, v85, 0x390aee49 | |
v_madak_f32_e32 v90, v85, v90, 0xc41f6441 | |
v_madak_f32_e32 v92, v85, v92, 0xc322658c | |
v_madak_f32_e32 v93, v85, v93, 0x3ba68116 | |
v_madak_f32_e32 v90, v85, v90, 0xc320a2ea | |
v_madak_f32_e32 v92, v85, v92, 0xc2798057 | |
v_madak_f32_e32 v93, v85, v93, 0x3d852a63 | |
v_madak_f32_e32 v90, v85, v90, 0xc18e104b | |
v_madak_f32_e32 v92, v85, v92, 0xc128f022 | |
v_madak_f32_e32 v93, v85, v93, 0x3ecbbbce | |
v_madak_f32_e32 v90, v85, v90, 0xbf4c9dd4 | |
v_madak_f32_e32 v92, v85, v92, 0xbf31a0b7 | |
v_cndmask_b32_e64 v91, v91, v93, s[8:9] | |
v_and_b32_e32 v93, s50, v83 | |
v_mov_b32_e32 v94, 0xbf100000 | |
v_madak_f32_e32 v90, v85, v90, 0xbc21a092 | |
v_madak_f32_e32 v92, v85, v92, 0xbc21a093 | |
v_mad_f32 v94, v93, -v93, v94 | |
v_cndmask_b32_e32 v90, v90, v92, vcc | |
v_cmp_gt_f32_e32 vcc, 0, v94 | |
v_cndmask_b32_e64 v95, 0.5, -0.5, vcc | |
v_mov_b32_e32 v97, 0x3fb8aa3b | |
v_mac_f32_e32 v95, v97, v94 | |
v_cvt_i32_f32_e32 v95, v95 | |
v_mov_b32_e32 v92, 0xbb0df9c0 | |
v_madak_f32_e32 v92, v92, v85, 0x3d1151b3 | |
v_madak_f32_e32 v92, v85, v92, 0xbde31cc2 | |
v_cvt_f32_i32_e32 v100, v95 | |
v_madak_f32_e32 v92, v85, v92, 0x3ea2fe54 | |
v_mov_b32_e32 v101, 0xbf317180 | |
v_madak_f32_e32 v92, v85, v92, 0xbebe9208 | |
v_mad_f32 v102, v101, v100, v94 | |
v_mov_b32_e32 v103, 0xb717f7d1 | |
v_madak_f32_e32 v92, v85, v92, 0x3ed46805 | |
v_mad_f32 v104, v103, v100, v102 | |
v_madak_f32_e32 v92, v85, v92, 0xbb1acdc6 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mov_b32_e32 v106, 0xb5ddea0e | |
v_mov_b32_e32 v107, 0x3331bb4c | |
v_cndmask_b32_e64 v90, v90, v92, s[4:5] | |
v_mov_b32_e32 v92, 0xb7c756b1 | |
v_mad_f32 v108, v107, v105, v106 | |
v_mov_b32_e32 v109, 0x388ab355 | |
v_madak_f32_e32 v92, v92, v85, 0xbbbd1489 | |
v_mad_f32 v108, v108, v105, v109 | |
v_mov_b32_e32 v110, 0xbb360b61 | |
v_madak_f32_e32 v92, v85, v92, 0xbce9528f | |
v_mad_f32 v108, v108, v105, v110 | |
v_mov_b32_e32 v111, 0x3e2aaaab | |
v_madak_f32_e32 v92, v85, v92, 0xbea66beb | |
v_mad_f32 v108, v108, v105, v111 | |
v_madak_f32_e32 v92, v85, v92, 0x3e0375d4 | |
v_mad_f32 v105, -v105, v108, v104 | |
v_mad_f32 v85, v85, v91, 1.0 | |
v_mov_b32_e32 v91, 0x6f800000 | |
v_cmp_gt_f32_e64 vcc, |v85|, v91 | |
v_mov_b32_e32 v98, 0x2f800000 | |
v_sub_f32_e32 v108, 2.0, v105 | |
v_cndmask_b32_e32 v99, 1.0, v98, vcc | |
v_cmp_gt_f32_e64 vcc, |v108|, v91 | |
v_mul_f32_e32 v85, v99, v85 | |
v_cndmask_b32_e32 v112, 1.0, v98, vcc | |
v_mul_f32_e64 v108, v108, -v112 | |
v_rcp_f32_e32 v85, v85 | |
v_rcp_f32_e32 v108, v108 | |
v_cndmask_b32_e64 v90, v90, v92, s[8:9] | |
v_lshlrev_b32_e32 v92, 23, v95 | |
v_mul_f32_e32 v85, v85, v90 | |
v_mul_f32_e32 v90, v105, v104 | |
v_mul_f32_e32 v90, v108, v90 | |
v_mul_f32_e32 v90, v90, v112 | |
v_mad_f32 v90, -v100, v103, v90 | |
v_subrev_f32_e32 v90, v102, v90 | |
v_sub_f32_e32 v90, 1.0, v90 | |
v_add_i32_e32 v90, vcc, v90, v92 | |
v_mov_b32_e32 v92, 0xc2aeac4f | |
v_cmp_nlt_f32_e32 vcc, v94, v92 | |
v_mov_b32_e32 v95, 0x42b17218 | |
v_cndmask_b32_e32 v90, 0, v90, vcc | |
v_cmp_lt_f32_e32 vcc, v94, v95 | |
v_mov_b32_e32 v100, 0x7f800000 | |
v_cndmask_b32_e32 v90, v100, v90, vcc | |
v_cmp_u_f32_e32 vcc, v94, v94 | |
v_cndmask_b32_e32 v90, v90, v94, vcc | |
v_subrev_f32_e32 v94, v84, v93 | |
v_mul_f32_e32 v102, v85, v99 | |
v_add_f32_e32 v93, v84, v93 | |
v_mad_f32 v93, v93, v94, v102 | |
v_cmp_gt_f32_e32 vcc, 0, v93 | |
v_cndmask_b32_e64 v94, 0.5, -0.5, vcc | |
v_mac_f32_e32 v94, v97, v93 | |
v_cvt_i32_f32_e32 v94, v94 | |
v_madak_f32_e32 v85, v99, v85, 0x3f58560b | |
v_mad_f32 v82, -v96, v81, v82 | |
v_mad_f32 v18, v81, v96, v18 | |
v_cvt_f32_i32_e32 v97, v94 | |
v_lshlrev_b32_e32 v94, 23, v94 | |
v_mad_f32 v79, -v96, v80, v79 | |
v_mad_f32 v17, v80, v96, v17 | |
v_mad_f32 v101, v101, v97, v93 | |
v_mad_f32 v104, v103, v97, v101 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mac_f32_e32 v106, v107, v105 | |
v_mac_f32_e32 v109, v106, v105 | |
v_mac_f32_e32 v110, v109, v105 | |
v_mac_f32_e32 v111, v110, v105 | |
v_mad_f32 v105, -v105, v111, v104 | |
v_mul_f32_e32 v104, v105, v104 | |
v_sub_f32_e32 v105, 2.0, v105 | |
v_cmp_gt_f32_e64 vcc, |v105|, v91 | |
v_cndmask_b32_e32 v106, 1.0, v98, vcc | |
v_mul_f32_e64 v105, v105, -v106 | |
v_rcp_f32_e32 v105, v105 | |
v_mad_f32 v77, -v96, v78, v77 | |
v_mad_f32 v16, v78, v96, v16 | |
v_mad_f32 v86, -v67, v0, v86 | |
v_mul_f32_e32 v104, v105, v104 | |
v_mul_f32_e32 v104, v104, v106 | |
v_mad_f32 v97, -v97, v103, v104 | |
v_subrev_f32_e32 v97, v101, v97 | |
v_sub_f32_e32 v97, 1.0, v97 | |
v_add_i32_e32 v94, vcc, v97, v94 | |
v_cmp_nlt_f32_e32 vcc, v93, v92 | |
v_cndmask_b32_e32 v92, 0, v94, vcc | |
v_cmp_lt_f32_e32 vcc, v93, v95 | |
v_cndmask_b32_e32 v92, v100, v92, vcc | |
v_cmp_gt_f32_e64 vcc, |v84|, v91 | |
v_cndmask_b32_e32 v91, 1.0, v98, vcc | |
v_mul_f32_e32 v94, v91, v84 | |
v_rcp_f32_e32 v94, v94 | |
v_cmp_u_f32_e32 vcc, v93, v93 | |
v_cndmask_b32_e32 v92, v92, v93, vcc | |
v_mul_f32_e32 v90, v92, v90 | |
v_mov_b32_e32 v92, 0x40c00000 | |
v_cmp_gt_f32_e32 vcc, v92, v84 | |
v_mov_b32_e32 v92, 0x31800000 | |
v_mul_f32_e32 v90, v94, v90 | |
v_cmp_gt_f32_e64 s[10:11], v92, v84 | |
v_mad_f32 v84, -v91, v90, 1.0 | |
v_cndmask_b32_e32 v84, 1.0, v84, vcc | |
v_cndmask_b32_e64 v84, v84, v85, s[4:5] | |
v_and_b32_e32 v85, s51, v83 | |
v_or_b32_e32 v84, v85, v84 | |
v_mad_f32 v85, v102, v83, v83 | |
v_cndmask_b32_e64 v84, v84, v85, s[8:9] | |
v_mul_f32_e32 v85, 0x3f8375d4, v83 | |
v_mac_f32_e32 v85, 0x41000000, v83 | |
v_mul_f32_e32 v85, 0x3e000000, v85 | |
v_cndmask_b32_e64 v84, v84, v85, s[10:11] | |
v_cmp_u_f32_e32 vcc, v83, v83 | |
v_cndmask_b32_e32 v83, v84, v83, vcc | |
v_subrev_f32_e32 v83, v83, v88 | |
v_mul_f32_e64 v84, s19, -v88 | |
v_mac_f32_e32 v84, v83, v89 | |
v_mac_f32_e32 v5, v84, v87 | |
v_mac_f32_e32 v19, v0, v67 | |
BB6_162: ; %Flow1214 | |
; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[34:35] | |
BB6_163: ; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[12:13] | |
v_cmp_gt_i32_e32 vcc, 0, v69 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[10:11], exec, s[4:5] | |
; mask branch BB6_167 | |
s_cbranch_execz BB6_167 | |
BB6_164: ; in Loop: Header=BB6_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[87:90], v55 offset0:112 offset1:113 | |
s_and_b64 s[4:5], exec, s[0:1] | |
v_cmp_ne_u32_e32 vcc, v38, v76 | |
s_xor_b64 s[4:5], s[4:5], -1 | |
s_or_b64 s[4:5], s[4:5], vcc | |
s_and_b64 s[8:9], exec, s[2:3] | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v73, v73, v88 | |
v_subrev_f32_e32 v69, v72, v87 | |
v_mul_f32_e32 v72, v73, v73 | |
s_or_b64 s[4:5], s[8:9], s[4:5] | |
v_cndmask_b32_e64 v76, 0, 1.0, s[4:5] | |
v_subrev_f32_e32 v74, v74, v89 | |
v_mac_f32_e32 v72, v69, v69 | |
v_mac_f32_e32 v72, v74, v74 | |
v_mul_f32_e32 v76, s22, v76 | |
v_subrev_f32_e32 v67, v75, v90 | |
v_cmp_lt_f32_e32 vcc, v72, v76 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[12:13], exec, s[4:5] | |
; mask branch BB6_166 | |
s_cbranch_execz BB6_166 | |
BB6_165: ; in Loop: Header=BB6_11 Depth=1 | |
s_mov_b32 m0, -1 | |
v_mul_f32_e32 v78, v75, v90 | |
ds_read_b64 v[75:76], v56 offset:448 | |
v_cmp_gt_i32_e32 vcc, 0, v65 | |
v_mad_f32 v65, -v67, v0, v86 | |
v_mov_b32_e32 v84, 0x3a92b707 | |
v_mov_b32_e32 v85, 0x3c739487 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v65, v70, v75 | |
v_mul_f32_e32 v70, v71, v76 | |
v_max_f32_e32 v71, 0x34cd15ae, v72 | |
v_rsq_f32_e32 v72, v71 | |
v_cndmask_b32_e64 v75, 0, 1.0, vcc | |
v_mov_b32_e32 v86, 0x35c55945 | |
v_mov_b32_e32 v87, 0xbd777f97 | |
v_mul_f32_e32 v76, v72, v72 | |
v_mul_f32_e32 v80, v76, v76 | |
v_mul_f32_e32 v80, v75, v80 | |
v_mul_f32_e32 v81, v76, v80 | |
v_mad_f32 v80, v80, v76, s23 | |
v_mad_f32 v83, v81, v81, s26 | |
v_mul_f32_e32 v80, 0xbe2aaaab, v80 | |
v_mul_f32_e32 v80, v65, v80 | |
v_mul_f32_e32 v83, v83, v70 | |
v_mac_f32_e32 v80, 0x3daaaaaa, v83 | |
v_mac_f32_e32 v8, v75, v80 | |
v_mul_f32_e32 v80, v9, v71 | |
v_mul_f32_e32 v83, v80, v80 | |
v_madak_f32_e32 v84, v84, v83, 0x3ded3cb2 | |
v_madak_f32_e32 v85, v85, v83, 0x3f01e2bc | |
v_mad_f32 v84, v84, v83, 1.0 | |
v_mac_f32_e32 v84, v80, v85 | |
v_mov_b32_e32 v85, 0xb2951928 | |
v_rcp_f32_e32 v84, v84 | |
v_madak_f32_e32 v85, v85, v83, 0xb85ffb93 | |
v_madak_f32_e32 v86, v86, v83, 0x3a83ca0c | |
v_madak_f32_e32 v85, v85, v83, 0xbc9ded90 | |
v_madak_f32_e32 v86, v86, v83, 0x3d8eaf3b | |
v_madak_f32_e32 v83, v85, v83, 0xbf409397 | |
v_mul_f32_e32 v71, s18, v71 | |
v_mac_f32_e32 v83, v80, v86 | |
v_mul_f32_e32 v84, v37, v84 | |
v_mul_f32_e32 v71, v72, v71 | |
v_mul_f32_e32 v80, v75, v76 | |
v_mul_f32_e32 v83, v83, v84 | |
v_mac_f32_e32 v83, v72, v80 | |
v_and_b32_e32 v80, s27, v71 | |
v_mov_b32_e32 v85, 0x3fa00000 | |
v_mul_f32_e32 v84, v80, v80 | |
v_cmp_gt_f32_e64 s[4:5], v85, v80 | |
v_rcp_f32_e32 v85, v84 | |
v_add_f32_e32 v86, -1.0, v80 | |
v_mov_b32_e32 v88, 0x6f800000 | |
v_and_b32_e32 v91, s50, v71 | |
v_cndmask_b32_e64 v85, v85, v86, s[4:5] | |
v_mov_b32_e32 v86, 0x3f580000 | |
v_cmp_gt_f32_e64 s[8:9], v86, v80 | |
v_cndmask_b32_e64 v84, v85, v84, s[8:9] | |
v_mov_b32_e32 v86, 0xc11d077e | |
v_mov_b32_e32 v85, 0x4036db6e | |
v_madak_f32_e32 v86, v86, v84, 0xc2a2932b | |
v_cmp_gt_f32_e32 vcc, v85, v80 | |
v_mov_b32_e32 v85, 0xc3f1c275 | |
v_madak_f32_e32 v85, v85, v84, 0xc480230b | |
v_madak_f32_e32 v86, v84, v86, 0xc3389ae7 | |
v_madak_f32_e32 v85, v84, v85, 0xc41f6441 | |
v_madak_f32_e32 v86, v84, v86, 0xc322658c | |
v_madak_f32_e32 v85, v84, v85, 0xc320a2ea | |
v_madak_f32_e32 v86, v84, v86, 0xc2798057 | |
v_madak_f32_e32 v85, v84, v85, 0xc18e104b | |
v_madak_f32_e32 v86, v84, v86, 0xc128f022 | |
v_madak_f32_e32 v85, v84, v85, 0xbf4c9dd4 | |
v_madak_f32_e32 v86, v84, v86, 0xbf31a0b7 | |
v_madak_f32_e32 v85, v84, v85, 0xbc21a092 | |
v_madak_f32_e32 v86, v84, v86, 0xbc21a093 | |
v_madak_f32_e32 v87, v87, v84, 0x40d23f7c | |
v_cndmask_b32_e32 v85, v85, v86, vcc | |
v_mov_b32_e32 v86, 0xc1b38712 | |
v_madak_f32_e32 v86, v86, v84, 0x43ed43a7 | |
v_madak_f32_e32 v87, v84, v87, 0x42d9451f | |
v_madak_f32_e32 v86, v84, v86, 0x451f90ce | |
v_madak_f32_e32 v87, v84, v87, 0x43d6810b | |
v_madak_f32_e32 v86, v84, v86, 0x4547fdbb | |
v_madak_f32_e32 v87, v84, v87, 0x442158c9 | |
v_madak_f32_e32 v86, v84, v86, 0x44c01759 | |
v_madak_f32_e32 v87, v84, v87, 0x43d9486f | |
v_madak_f32_e32 v86, v84, v86, 0x43a2e571 | |
v_madak_f32_e32 v87, v84, v87, 0x4309a863 | |
v_madak_f32_e32 v86, v84, v86, 0x41f2b459 | |
v_madak_f32_e32 v87, v84, v87, 0x419d35ce | |
v_cndmask_b32_e32 v86, v86, v87, vcc | |
v_mov_b32_e32 v87, 0xbb0df9c0 | |
v_madak_f32_e32 v87, v87, v84, 0x3d1151b3 | |
v_madak_f32_e32 v87, v84, v87, 0xbde31cc2 | |
v_madak_f32_e32 v87, v84, v87, 0x3ea2fe54 | |
v_madak_f32_e32 v87, v84, v87, 0xbebe9208 | |
v_madak_f32_e32 v87, v84, v87, 0x3ed46805 | |
v_madak_f32_e32 v87, v84, v87, 0xbb1acdc6 | |
v_cndmask_b32_e64 v85, v85, v87, s[4:5] | |
v_mov_b32_e32 v87, 0x3c445aa3 | |
v_madak_f32_e32 v87, v87, v84, 0x3c5f6e13 | |
v_madak_f32_e32 v87, v84, v87, 0x3e013307 | |
v_madak_f32_e32 v87, v84, v87, 0x3d931ae7 | |
v_madak_f32_e32 v87, v84, v87, 0x3f0a5785 | |
v_madak_f32_e32 v87, v84, v87, 0x3dd9f331 | |
v_cndmask_b32_e64 v86, v86, v87, s[4:5] | |
v_mov_b32_e32 v87, 0xb684e21a | |
v_madak_f32_e32 v87, v87, v84, 0x390aee49 | |
v_madak_f32_e32 v87, v84, v87, 0x3ba68116 | |
v_madak_f32_e32 v87, v84, v87, 0x3d852a63 | |
v_madak_f32_e32 v87, v84, v87, 0x3ecbbbce | |
v_cndmask_b32_e64 v86, v86, v87, s[8:9] | |
v_mov_b32_e32 v92, 0xbf100000 | |
v_mad_f32 v86, v84, v86, 1.0 | |
v_cmp_gt_f32_e64 vcc, |v86|, v88 | |
v_mov_b32_e32 v89, 0x2f800000 | |
v_mad_f32 v92, v91, -v91, v92 | |
v_cndmask_b32_e32 v90, 1.0, v89, vcc | |
v_cmp_gt_f32_e32 vcc, 0, v92 | |
v_cndmask_b32_e64 v93, 0.5, -0.5, vcc | |
v_mov_b32_e32 v94, 0x3fb8aa3b | |
v_mac_f32_e32 v93, v94, v92 | |
v_mov_b32_e32 v87, 0xb7c756b1 | |
v_cvt_i32_f32_e32 v93, v93 | |
v_madak_f32_e32 v87, v87, v84, 0xbbbd1489 | |
v_madak_f32_e32 v87, v84, v87, 0xbce9528f | |
v_madak_f32_e32 v87, v84, v87, 0xbea66beb | |
v_mul_f32_e32 v86, v90, v86 | |
v_madak_f32_e32 v84, v84, v87, 0x3e0375d4 | |
v_rcp_f32_e32 v86, v86 | |
v_cvt_f32_i32_e32 v87, v93 | |
v_cndmask_b32_e64 v84, v85, v84, s[8:9] | |
v_mov_b32_e32 v85, 0xbf317180 | |
v_mul_f32_e32 v84, v86, v84 | |
v_mad_f32 v86, v85, v87, v92 | |
v_mov_b32_e32 v95, 0xb717f7d1 | |
v_mad_f32 v96, v95, v87, v86 | |
v_mul_f32_e32 v97, v96, v96 | |
v_mov_b32_e32 v98, 0xb5ddea0e | |
v_mov_b32_e32 v99, 0x3331bb4c | |
v_mad_f32 v100, v99, v97, v98 | |
v_mov_b32_e32 v101, 0x388ab355 | |
v_mad_f32 v100, v100, v97, v101 | |
v_mov_b32_e32 v102, 0xbb360b61 | |
v_mad_f32 v100, v100, v97, v102 | |
v_mov_b32_e32 v103, 0x3e2aaaab | |
v_mad_f32 v100, v100, v97, v103 | |
v_mad_f32 v97, -v97, v100, v96 | |
v_mul_f32_e32 v96, v97, v96 | |
v_sub_f32_e32 v97, 2.0, v97 | |
v_cmp_gt_f32_e64 vcc, |v97|, v88 | |
v_cndmask_b32_e32 v100, 1.0, v89, vcc | |
v_mul_f32_e64 v97, v97, -v100 | |
v_rcp_f32_e32 v97, v97 | |
v_mad_f32 v65, v70, v81, -v65 | |
v_mul_f32_e32 v70, v76, v81 | |
v_mul_f32_e32 v65, v65, v70 | |
v_mul_f32_e32 v96, v97, v96 | |
v_mul_f32_e32 v96, v96, v100 | |
v_mad_f32 v87, -v87, v95, v96 | |
v_subrev_f32_e32 v86, v86, v87 | |
v_lshlrev_b32_e32 v87, 23, v93 | |
v_sub_f32_e32 v86, 1.0, v86 | |
v_add_i32_e32 v86, vcc, v86, v87 | |
v_mov_b32_e32 v87, 0xc2aeac4f | |
v_cmp_nlt_f32_e32 vcc, v92, v87 | |
v_mov_b32_e32 v93, 0x42b17218 | |
v_cndmask_b32_e32 v86, 0, v86, vcc | |
v_cmp_lt_f32_e32 vcc, v92, v93 | |
v_mov_b32_e32 v96, 0x7f800000 | |
v_cndmask_b32_e32 v86, v96, v86, vcc | |
v_cmp_u_f32_e32 vcc, v92, v92 | |
v_cndmask_b32_e32 v86, v86, v92, vcc | |
v_subrev_f32_e32 v92, v80, v91 | |
v_mul_f32_e32 v97, v84, v90 | |
v_add_f32_e32 v91, v80, v91 | |
v_mad_f32 v91, v91, v92, v97 | |
v_cmp_gt_f32_e32 vcc, 0, v91 | |
v_cndmask_b32_e64 v92, 0.5, -0.5, vcc | |
v_mac_f32_e32 v92, v94, v91 | |
v_cvt_i32_f32_e32 v92, v92 | |
v_madak_f32_e32 v84, v90, v84, 0x3f58560b | |
v_mac_f32_e32 v65, v83, v78 | |
v_mad_f32 v82, -v65, v74, v82 | |
v_cvt_f32_i32_e32 v94, v92 | |
v_lshlrev_b32_e32 v92, 23, v92 | |
v_mad_f32 v15, v74, v65, v15 | |
v_mad_f32 v79, -v65, v73, v79 | |
v_mad_f32 v85, v85, v94, v91 | |
v_mad_f32 v100, v95, v94, v85 | |
v_mul_f32_e32 v104, v100, v100 | |
v_mac_f32_e32 v98, v99, v104 | |
v_mac_f32_e32 v101, v98, v104 | |
v_mac_f32_e32 v102, v101, v104 | |
v_mac_f32_e32 v103, v102, v104 | |
v_mad_f32 v98, -v104, v103, v100 | |
v_sub_f32_e32 v99, 2.0, v98 | |
v_cmp_gt_f32_e64 vcc, |v99|, v88 | |
v_cndmask_b32_e32 v101, 1.0, v89, vcc | |
v_mul_f32_e64 v99, v99, -v101 | |
v_rcp_f32_e32 v99, v99 | |
v_mul_f32_e32 v98, v98, v100 | |
v_mad_f32 v14, v73, v65, v14 | |
v_mad_f32 v77, -v65, v69, v77 | |
v_mul_f32_e32 v98, v99, v98 | |
v_mul_f32_e32 v98, v98, v101 | |
v_mad_f32 v94, -v94, v95, v98 | |
v_subrev_f32_e32 v85, v85, v94 | |
v_sub_f32_e32 v85, 1.0, v85 | |
v_add_i32_e32 v85, vcc, v85, v92 | |
v_cmp_nlt_f32_e32 vcc, v91, v87 | |
v_cndmask_b32_e32 v85, 0, v85, vcc | |
v_cmp_lt_f32_e32 vcc, v91, v93 | |
v_cndmask_b32_e32 v85, v96, v85, vcc | |
v_cmp_u_f32_e32 vcc, v91, v91 | |
v_cndmask_b32_e32 v85, v85, v91, vcc | |
v_cmp_gt_f32_e64 vcc, |v80|, v88 | |
v_mul_f32_e32 v85, v85, v86 | |
v_cndmask_b32_e32 v86, 1.0, v89, vcc | |
v_mul_f32_e32 v87, v86, v80 | |
v_rcp_f32_e32 v87, v87 | |
v_mad_f32 v13, v69, v65, v13 | |
v_mac_f32_e32 v68, v0, v67 | |
v_mul_f32_e32 v85, v87, v85 | |
v_mad_f32 v85, -v86, v85, 1.0 | |
v_mov_b32_e32 v86, 0x40c00000 | |
v_cmp_gt_f32_e32 vcc, v86, v80 | |
v_cndmask_b32_e32 v85, 1.0, v85, vcc | |
v_cndmask_b32_e64 v84, v85, v84, s[4:5] | |
v_and_b32_e32 v85, s51, v71 | |
v_or_b32_e32 v84, v85, v84 | |
v_mad_f32 v85, v97, v71, v71 | |
v_cndmask_b32_e64 v84, v84, v85, s[8:9] | |
v_mov_b32_e32 v85, 0x31800000 | |
v_cmp_gt_f32_e32 vcc, v85, v80 | |
v_mul_f32_e32 v80, 0x3f8375d4, v71 | |
v_mac_f32_e32 v80, 0x41000000, v71 | |
v_mul_f32_e32 v80, 0x3e000000, v80 | |
v_cndmask_b32_e32 v80, v84, v80, vcc | |
v_cmp_u_f32_e32 vcc, v71, v71 | |
v_cndmask_b32_e32 v71, v80, v71, vcc | |
v_subrev_f32_e32 v71, v71, v75 | |
v_mul_f32_e64 v75, s19, -v75 | |
v_mac_f32_e32 v75, v71, v72 | |
v_mac_f32_e32 v5, v75, v78 | |
BB6_166: ; %Flow1213 | |
; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[12:13] | |
BB6_167: ; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[10:11] | |
s_mov_b32 m0, -1 | |
v_cmp_gt_i32_e32 vcc, 3, v2 | |
ds_write_b32 v6, v77 | |
ds_write_b32 v7, v79 | |
ds_write_b32 v12, v82 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[4:5], exec, s[4:5] | |
s_waitcnt lgkmcnt(0) | |
; mask branch BB6_173 | |
s_cbranch_execz BB6_173 | |
BB6_168: ; in Loop: Header=BB6_11 Depth=1 | |
v_lshlrev_b32_e32 v67, 6, v2 | |
v_add_i32_e32 v65, vcc, v11, v67 | |
v_lshlrev_b32_e32 v65, 2, v65 | |
v_add_i32_e32 v69, vcc, s15, v65 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v65, v69 | |
v_add_i32_e32 v70, vcc, 8, v11 | |
v_or_b32_e32 v71, 1, v11 | |
v_cmp_lt_i32_e32 vcc, v71, v70 | |
s_and_saveexec_b64 s[8:9], vcc | |
s_xor_b64 s[8:9], exec, s[8:9] | |
s_waitcnt lgkmcnt(0) | |
; mask branch BB6_170 | |
s_cbranch_execz BB6_170 | |
BB6_169: ; in Loop: Header=BB6_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b32 v[70:71], v69 offset0:1 offset1:2 | |
v_or_b32_e32 v74, 3, v11 | |
v_add_i32_e32 v67, vcc, v74, v67 | |
v_lshlrev_b32_e32 v67, 2, v67 | |
ds_read2_b32 v[72:73], v69 offset0:3 offset1:4 | |
v_add_i32_e32 v67, vcc, s15, v67 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v65, v65, v70 | |
ds_read_b32 v75, v69 offset:28 | |
ds_read2_b32 v[69:70], v67 offset0:2 offset1:3 | |
v_add_f32_e32 v65, v71, v65 | |
v_add_f32_e32 v65, v72, v65 | |
v_add_f32_e32 v65, v73, v65 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v65, v69, v65 | |
v_add_f32_e32 v65, v70, v65 | |
v_add_f32_e32 v65, v75, v65 | |
BB6_170: ; %._crit_edge.i | |
; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[8:9] | |
v_mul_lo_i32 v66, v66, 3 | |
v_mov_b32_e32 v70, s29 | |
s_mov_b64 s[8:9], s[28:29] | |
s_mov_b64 s[10:11], s[46:47] | |
v_add_i32_e32 v66, vcc, v66, v2 | |
v_ashrrev_i32_e32 v67, 31, v66 | |
v_lshl_b64 v[66:67], v[66:67], 2 | |
v_add_i32_e32 v69, vcc, s28, v66 | |
v_addc_u32_e32 v70, vcc, v67, v70, vcc | |
buffer_load_dword v67, v[66:67], s[8:11], 0 addr64 | |
s_mov_b64 s[8:9], 0 | |
s_waitcnt vmcnt(0) | |
BB6_171: ; Parent Loop BB6_11 Depth=1 | |
; => This Inner Loop Header: Depth=2 | |
v_mov_b32_e32 v66, -1 | |
v_add_f32_e32 v66, v65, v67 | |
v_mov_b32_e32 v72, v67 | |
v_mov_b32_e32 v71, v66 | |
buffer_atomic_cmpswap v[71:72], v[69:70], s[44:47], 0 addr64 glc | |
v_mov_b32_e32 v66, 0xf000 | |
s_waitcnt vmcnt(0) expcnt(0) | |
v_cmp_eq_u32_e32 vcc, v71, v67 | |
s_or_b64 s[8:9], vcc, s[8:9] | |
v_mov_b32_e32 v67, v71 | |
s_andn2_b64 exec, exec, s[8:9] | |
s_cbranch_execnz BB6_171 | |
; BB#172: ; %Flow1211 | |
; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[8:9] | |
BB6_173: ; %Flow1212 | |
; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[4:5] | |
BB6_174: ; %Flow1221 | |
; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[30:31] | |
BB6_175: ; %Flow1255 | |
; in Loop: Header=BB6_11 Depth=1 | |
s_or_b64 exec, exec, s[52:53] | |
v_add_i32_e32 v57, vcc, 1, v57 | |
v_addc_u32_e32 v58, vcc, 0, v58, vcc | |
v_cmp_ne_u32_e32 vcc, v57, v39 | |
s_and_b64 vcc, exec, vcc | |
s_cbranch_vccnz BB6_11 | |
BB6_176: ; %Flow1257 | |
s_mov_b32 m0, -1 | |
ds_write_b32 v6, v48 | |
ds_write_b32 v7, v49 | |
ds_write_b32 v12, v50 | |
s_waitcnt lgkmcnt(0) | |
s_barrier | |
s_load_dword s2, s[6:7], 0x32 | |
s_and_b64 s[0:1], exec, s[0:1] | |
v_lshlrev_b32_e32 v10, 2, v40 | |
s_xor_b64 s[0:1], s[0:1], -1 | |
v_mov_b32_e32 v3, 0 | |
s_waitcnt lgkmcnt(0) | |
v_cmp_ne_u32_e64 s[2:3], s2, 0 | |
s_and_b64 s[2:3], s[2:3], s[0:1] | |
v_lshlrev_b32_e32 v9, 6, v36 | |
v_add_i32_e32 v10, vcc, s15, v10 | |
v_add_i32_e32 v23, vcc, 64, v2 | |
v_add_i32_e32 v19, vcc, 0x80, v2 | |
v_cmp_gt_i32_e64 s[0:1], 4, v1 | |
s_and_saveexec_b64 s[4:5], s[0:1] | |
s_xor_b64 s[4:5], exec, s[4:5] | |
; mask branch BB6_187 | |
s_cbranch_execz BB6_187 | |
BB6_177: | |
s_mov_b32 m0, -1 | |
ds_read_b32 v3, v10 offset:128 | |
ds_read_b32 v27, v10 | |
v_add_i32_e32 v31, vcc, v11, v23 | |
v_lshlrev_b32_e32 v31, 2, v31 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v3, v3, v27 | |
ds_write_b32 v10, v3 | |
v_add_i32_e32 v27, vcc, s15, v31 | |
s_waitcnt lgkmcnt(0) | |
ds_read_b32 v3, v27 offset:128 | |
ds_read_b32 v31, v10 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v3, v3, v31 | |
ds_write_b32 v10, v3 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_i32_e32 v3, vcc, v11, v19 | |
v_lshlrev_b32_e32 v3, 2, v3 | |
v_add_i32_e32 v31, vcc, s15, v3 | |
ds_read_b32 v3, v31 offset:128 | |
ds_read_b32 v35, v10 offset:512 | |
v_cmp_lt_i32_e32 vcc, 1, v1 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v3, v3, v35 | |
v_mov_b32_e32 v35, 0 | |
ds_write_b32 v10, v3 offset:512 | |
s_waitcnt lgkmcnt(0) | |
; implicit-def: %VGPR3 | |
s_and_saveexec_b64 s[6:7], vcc | |
s_xor_b64 s[6:7], exec, s[6:7] | |
; mask branch BB6_179 | |
BB6_178: | |
v_cmp_eq_u32_e32 vcc, 2, v1 | |
v_mov_b32_e32 v3, 0 | |
v_cndmask_b32_e64 v35, 0, -1, vcc | |
BB6_179: ; %Flow1208 | |
s_or_saveexec_b64 s[6:7], s[6:7] | |
s_xor_b64 exec, exec, s[6:7] | |
; mask branch BB6_181 | |
s_cbranch_execz BB6_181 | |
BB6_180: ; %.thread85.i | |
s_mov_b32 m0, -1 | |
ds_read_b32 v35, v10 offset:64 | |
ds_read_b32 v36, v10 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v35, v35, v36 | |
ds_write_b32 v10, v35 | |
ds_read_b32 v27, v27 offset:64 | |
s_waitcnt lgkmcnt(0) | |
ds_read_b32 v35, v10 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v27, v27, v35 | |
ds_write_b32 v10, v27 offset:256 | |
s_waitcnt lgkmcnt(0) | |
ds_read_b32 v27, v31 offset:64 | |
ds_read_b32 v31, v10 offset:512 | |
v_mov_b32_e32 v35, -1 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v27, v27, v31 | |
ds_write_b32 v10, v27 offset:512 | |
s_waitcnt lgkmcnt(0) | |
BB6_181: ; %Flow1209 | |
s_or_b64 exec, exec, s[6:7] | |
v_cmp_ne_u32_e32 vcc, 0, v35 | |
s_and_saveexec_b64 s[6:7], vcc | |
s_xor_b64 s[6:7], exec, s[6:7] | |
; mask branch BB6_186 | |
s_cbranch_execz BB6_186 | |
BB6_182: | |
v_add_i32_e32 v3, vcc, v9, v2 | |
v_mul_lo_i32 v3, v3, 3 | |
v_mov_b32_e32 v27, 0xe0 | |
v_mad_i32_i24 v27, v27, v1, v10 | |
s_mov_b32 m0, -1 | |
v_add_i32_e32 v35, vcc, v1, v3 | |
v_ashrrev_i32_e32 v36, 31, v35 | |
v_lshl_b64 v[37:38], v[35:36], 2 | |
v_add_i32_e32 v35, vcc, s28, v37 | |
v_mov_b32_e32 v3, s29 | |
s_mov_b32 s11, 0xf000 | |
s_mov_b32 s10, 0 | |
s_mov_b64 s[8:9], s[28:29] | |
ds_read_b32 v31, v27 | |
ds_read_b32 v27, v27 offset:32 | |
v_addc_u32_e32 v36, vcc, v38, v3, vcc | |
buffer_load_dword v38, v[37:38], s[8:11], 0 addr64 | |
s_mov_b64 s[8:9], 0 | |
s_mov_b64 s[12:13], s[8:9] | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v27, v31, v27 | |
s_waitcnt vmcnt(0) | |
BB6_183: ; =>This Inner Loop Header: Depth=1 | |
v_add_f32_e32 v37, v27, v38 | |
v_mov_b32_e32 v40, v38 | |
v_mov_b32_e32 v39, v37 | |
buffer_atomic_cmpswap v[39:40], v[35:36], s[8:11], 0 addr64 glc | |
v_mov_b32_e32 v3, -1 | |
v_mov_b32_e32 v3, 0xf000 | |
s_waitcnt vmcnt(0) expcnt(0) | |
v_cmp_eq_u32_e32 vcc, v39, v38 | |
s_or_b64 s[12:13], vcc, s[12:13] | |
v_mov_b32_e32 v38, v39 | |
s_andn2_b64 exec, exec, s[12:13] | |
s_cbranch_execnz BB6_183 | |
; BB#184: ; %atomicAdd_g_f.exit.i | |
s_or_b64 exec, exec, s[12:13] | |
s_and_b64 s[8:9], exec, s[2:3] | |
v_cndmask_b32_e64 v31, 0, 1, s[8:9] | |
v_cmp_ne_u32_e32 vcc, 1, v31 | |
v_mov_b32_e32 v3, 0 | |
s_and_b64 vcc, exec, vcc | |
s_cbranch_vccnz BB6_186 | |
; BB#185: | |
v_mov_b32_e32 v3, v27 | |
BB6_186: ; %Flow1210 | |
s_or_b64 exec, exec, s[6:7] | |
BB6_187: ; %reduce_force_i_pow2.exit | |
s_or_b64 exec, exec, s[4:5] | |
s_mov_b32 m0, -1 | |
s_barrier | |
ds_write_b32 v6, v42 | |
ds_write_b32 v7, v43 | |
ds_write_b32 v12, v44 | |
s_waitcnt lgkmcnt(0) | |
s_barrier | |
s_and_saveexec_b64 s[4:5], s[0:1] | |
s_xor_b64 s[4:5], exec, s[4:5] | |
; mask branch BB6_198 | |
s_cbranch_execz BB6_198 | |
BB6_188: | |
s_mov_b32 m0, -1 | |
ds_read_b32 v27, v10 offset:128 | |
ds_read_b32 v31, v10 | |
v_add_i32_e32 v35, vcc, v11, v23 | |
v_lshlrev_b32_e32 v35, 2, v35 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v27, v27, v31 | |
ds_write_b32 v10, v27 | |
s_waitcnt lgkmcnt(0) | |
v_add_i32_e32 v27, vcc, s15, v35 | |
ds_read_b32 v31, v27 offset:128 | |
ds_read_b32 v35, v10 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v31, v31, v35 | |
ds_write_b32 v10, v31 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_i32_e32 v31, vcc, v11, v19 | |
v_lshlrev_b32_e32 v31, 2, v31 | |
v_add_i32_e32 v31, vcc, s15, v31 | |
ds_read_b32 v35, v31 offset:128 | |
ds_read_b32 v36, v10 offset:512 | |
v_cmp_lt_i32_e32 vcc, 1, v1 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v35, v35, v36 | |
ds_write_b32 v10, v35 offset:512 | |
s_waitcnt lgkmcnt(0) | |
v_mov_b32_e32 v35, 0 | |
s_and_saveexec_b64 s[6:7], vcc | |
s_xor_b64 s[6:7], exec, s[6:7] | |
; mask branch BB6_190 | |
BB6_189: | |
v_cmp_eq_u32_e32 vcc, 2, v1 | |
v_cndmask_b32_e64 v35, 0, -1, vcc | |
BB6_190: ; %Flow1205 | |
s_or_saveexec_b64 s[6:7], s[6:7] | |
s_xor_b64 exec, exec, s[6:7] | |
; mask branch BB6_192 | |
s_cbranch_execz BB6_192 | |
BB6_191: ; %.thread85.i508 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v35, v10 offset:64 | |
ds_read_b32 v36, v10 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v35, v35, v36 | |
ds_write_b32 v10, v35 | |
ds_read_b32 v27, v27 offset:64 | |
s_waitcnt lgkmcnt(0) | |
ds_read_b32 v35, v10 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v27, v27, v35 | |
ds_write_b32 v10, v27 offset:256 | |
s_waitcnt lgkmcnt(0) | |
ds_read_b32 v27, v31 offset:64 | |
ds_read_b32 v31, v10 offset:512 | |
v_mov_b32_e32 v35, -1 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v27, v27, v31 | |
ds_write_b32 v10, v27 offset:512 | |
s_waitcnt lgkmcnt(0) | |
BB6_192: ; %Flow1206 | |
s_or_b64 exec, exec, s[6:7] | |
v_cmp_ne_u32_e32 vcc, 0, v35 | |
s_and_saveexec_b64 s[6:7], vcc | |
s_xor_b64 s[6:7], exec, s[6:7] | |
; mask branch BB6_197 | |
s_cbranch_execz BB6_197 | |
BB6_193: | |
v_or_b32_e32 v27, 8, v9 | |
v_add_i32_e32 v27, vcc, v27, v2 | |
v_mul_lo_i32 v31, v27, 3 | |
v_mov_b32_e32 v27, 0xe0 | |
v_mad_i32_i24 v27, v27, v1, v10 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v35, v27 | |
ds_read_b32 v27, v27 offset:32 | |
s_mov_b32 s11, 0xf000 | |
s_mov_b32 s10, 0 | |
s_mov_b64 s[8:9], s[28:29] | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v27, v35, v27 | |
v_add_i32_e32 v35, vcc, v1, v31 | |
v_ashrrev_i32_e32 v36, 31, v35 | |
v_lshl_b64 v[37:38], v[35:36], 2 | |
v_add_i32_e32 v35, vcc, s28, v37 | |
v_mov_b32_e32 v31, s29 | |
v_addc_u32_e32 v36, vcc, v38, v31, vcc | |
buffer_load_dword v38, v[37:38], s[8:11], 0 addr64 | |
s_mov_b64 s[8:9], 0 | |
s_mov_b64 s[12:13], s[8:9] | |
s_waitcnt vmcnt(0) | |
BB6_194: ; =>This Inner Loop Header: Depth=1 | |
v_add_f32_e32 v37, v27, v38 | |
v_mov_b32_e32 v40, v38 | |
v_mov_b32_e32 v39, v37 | |
buffer_atomic_cmpswap v[39:40], v[35:36], s[8:11], 0 addr64 glc | |
v_mov_b32_e32 v31, -1 | |
v_mov_b32_e32 v31, 0xf000 | |
s_waitcnt vmcnt(0) expcnt(0) | |
v_cmp_eq_u32_e32 vcc, v39, v38 | |
s_or_b64 s[12:13], vcc, s[12:13] | |
v_mov_b32_e32 v38, v39 | |
s_andn2_b64 exec, exec, s[12:13] | |
s_cbranch_execnz BB6_194 | |
; BB#195: ; %atomicAdd_g_f.exit.i496 | |
s_or_b64 exec, exec, s[12:13] | |
s_and_b64 s[8:9], exec, s[2:3] | |
v_cndmask_b32_e64 v31, 0, 1, s[8:9] | |
v_cmp_ne_u32_e32 vcc, 1, v31 | |
s_and_b64 vcc, exec, vcc | |
s_cbranch_vccnz BB6_197 | |
; BB#196: | |
v_add_f32_e32 v3, v27, v3 | |
BB6_197: ; %Flow1207 | |
s_or_b64 exec, exec, s[6:7] | |
BB6_198: ; %reduce_force_i_pow2.exit510 | |
s_or_b64 exec, exec, s[4:5] | |
s_mov_b32 m0, -1 | |
s_barrier | |
ds_write_b32 v6, v32 | |
ds_write_b32 v7, v33 | |
ds_write_b32 v12, v34 | |
s_waitcnt lgkmcnt(0) | |
s_barrier | |
s_and_saveexec_b64 s[4:5], s[0:1] | |
s_xor_b64 s[4:5], exec, s[4:5] | |
; mask branch BB6_209 | |
s_cbranch_execz BB6_209 | |
BB6_199: | |
s_mov_b32 m0, -1 | |
ds_read_b32 v27, v10 offset:128 | |
ds_read_b32 v31, v10 | |
v_add_i32_e32 v32, vcc, v11, v23 | |
v_lshlrev_b32_e32 v32, 2, v32 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v27, v27, v31 | |
ds_write_b32 v10, v27 | |
s_waitcnt lgkmcnt(0) | |
v_add_i32_e32 v27, vcc, s15, v32 | |
ds_read_b32 v31, v27 offset:128 | |
ds_read_b32 v32, v10 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v31, v31, v32 | |
ds_write_b32 v10, v31 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_i32_e32 v31, vcc, v11, v19 | |
v_lshlrev_b32_e32 v31, 2, v31 | |
v_add_i32_e32 v31, vcc, s15, v31 | |
ds_read_b32 v32, v31 offset:128 | |
ds_read_b32 v33, v10 offset:512 | |
v_cmp_lt_i32_e32 vcc, 1, v1 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v32, v32, v33 | |
ds_write_b32 v10, v32 offset:512 | |
s_waitcnt lgkmcnt(0) | |
v_mov_b32_e32 v32, 0 | |
s_and_saveexec_b64 s[6:7], vcc | |
s_xor_b64 s[6:7], exec, s[6:7] | |
; mask branch BB6_201 | |
BB6_200: | |
v_cmp_eq_u32_e32 vcc, 2, v1 | |
v_cndmask_b32_e64 v32, 0, -1, vcc | |
BB6_201: ; %Flow1202 | |
s_or_saveexec_b64 s[6:7], s[6:7] | |
s_xor_b64 exec, exec, s[6:7] | |
; mask branch BB6_203 | |
s_cbranch_execz BB6_203 | |
BB6_202: ; %.thread85.i459 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v32, v10 offset:64 | |
ds_read_b32 v33, v10 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v32, v32, v33 | |
ds_write_b32 v10, v32 | |
ds_read_b32 v27, v27 offset:64 | |
s_waitcnt lgkmcnt(0) | |
ds_read_b32 v32, v10 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v27, v27, v32 | |
ds_write_b32 v10, v27 offset:256 | |
s_waitcnt lgkmcnt(0) | |
ds_read_b32 v27, v31 offset:64 | |
ds_read_b32 v31, v10 offset:512 | |
v_mov_b32_e32 v32, -1 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v27, v27, v31 | |
ds_write_b32 v10, v27 offset:512 | |
s_waitcnt lgkmcnt(0) | |
BB6_203: ; %Flow1203 | |
s_or_b64 exec, exec, s[6:7] | |
v_cmp_ne_u32_e32 vcc, 0, v32 | |
s_and_saveexec_b64 s[6:7], vcc | |
s_xor_b64 s[6:7], exec, s[6:7] | |
; mask branch BB6_208 | |
s_cbranch_execz BB6_208 | |
BB6_204: | |
v_or_b32_e32 v27, 16, v9 | |
v_add_i32_e32 v27, vcc, v27, v2 | |
v_mul_lo_i32 v31, v27, 3 | |
v_mov_b32_e32 v27, 0xe0 | |
v_mad_i32_i24 v27, v27, v1, v10 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v32, v27 | |
ds_read_b32 v27, v27 offset:32 | |
v_add_i32_e32 v31, vcc, v1, v31 | |
s_mov_b32 s11, 0xf000 | |
s_mov_b32 s10, 0 | |
s_mov_b64 s[8:9], s[28:29] | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v27, v32, v27 | |
v_ashrrev_i32_e32 v32, 31, v31 | |
v_lshl_b64 v[33:34], v[31:32], 2 | |
v_add_i32_e32 v31, vcc, s28, v33 | |
v_mov_b32_e32 v32, s29 | |
v_addc_u32_e32 v32, vcc, v34, v32, vcc | |
buffer_load_dword v34, v[33:34], s[8:11], 0 addr64 | |
s_mov_b64 s[8:9], 0 | |
s_mov_b64 s[12:13], s[8:9] | |
s_waitcnt vmcnt(0) | |
BB6_205: ; =>This Inner Loop Header: Depth=1 | |
v_mov_b32_e32 v33, -1 | |
v_add_f32_e32 v33, v27, v34 | |
v_mov_b32_e32 v36, v34 | |
v_mov_b32_e32 v35, v33 | |
buffer_atomic_cmpswap v[35:36], v[31:32], s[8:11], 0 addr64 glc | |
v_mov_b32_e32 v33, 0xf000 | |
s_waitcnt vmcnt(0) expcnt(0) | |
v_cmp_eq_u32_e32 vcc, v35, v34 | |
s_or_b64 s[12:13], vcc, s[12:13] | |
v_mov_b32_e32 v34, v35 | |
s_andn2_b64 exec, exec, s[12:13] | |
s_cbranch_execnz BB6_205 | |
; BB#206: ; %atomicAdd_g_f.exit.i447 | |
s_or_b64 exec, exec, s[12:13] | |
s_and_b64 s[8:9], exec, s[2:3] | |
v_cndmask_b32_e64 v31, 0, 1, s[8:9] | |
v_cmp_ne_u32_e32 vcc, 1, v31 | |
s_and_b64 vcc, exec, vcc | |
s_cbranch_vccnz BB6_208 | |
; BB#207: | |
v_add_f32_e32 v3, v27, v3 | |
BB6_208: ; %Flow1204 | |
s_or_b64 exec, exec, s[6:7] | |
BB6_209: ; %reduce_force_i_pow2.exit461 | |
s_or_b64 exec, exec, s[4:5] | |
s_mov_b32 m0, -1 | |
s_barrier | |
ds_write_b32 v6, v28 | |
ds_write_b32 v7, v29 | |
ds_write_b32 v12, v30 | |
s_waitcnt lgkmcnt(0) | |
s_barrier | |
s_and_saveexec_b64 s[4:5], s[0:1] | |
s_xor_b64 s[4:5], exec, s[4:5] | |
; mask branch BB6_220 | |
s_cbranch_execz BB6_220 | |
BB6_210: | |
s_mov_b32 m0, -1 | |
ds_read_b32 v27, v10 offset:128 | |
ds_read_b32 v28, v10 | |
v_add_i32_e32 v29, vcc, v11, v23 | |
v_lshlrev_b32_e32 v29, 2, v29 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v27, v27, v28 | |
ds_write_b32 v10, v27 | |
s_waitcnt lgkmcnt(0) | |
v_add_i32_e32 v27, vcc, s15, v29 | |
ds_read_b32 v28, v27 offset:128 | |
ds_read_b32 v29, v10 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v28, v28, v29 | |
ds_write_b32 v10, v28 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_i32_e32 v28, vcc, v11, v19 | |
v_lshlrev_b32_e32 v28, 2, v28 | |
v_add_i32_e32 v28, vcc, s15, v28 | |
ds_read_b32 v29, v28 offset:128 | |
ds_read_b32 v30, v10 offset:512 | |
v_cmp_lt_i32_e32 vcc, 1, v1 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v29, v29, v30 | |
ds_write_b32 v10, v29 offset:512 | |
s_waitcnt lgkmcnt(0) | |
v_mov_b32_e32 v29, 0 | |
s_and_saveexec_b64 s[6:7], vcc | |
s_xor_b64 s[6:7], exec, s[6:7] | |
; mask branch BB6_212 | |
BB6_211: | |
v_cmp_eq_u32_e32 vcc, 2, v1 | |
v_cndmask_b32_e64 v29, 0, -1, vcc | |
BB6_212: ; %Flow1199 | |
s_or_saveexec_b64 s[6:7], s[6:7] | |
s_xor_b64 exec, exec, s[6:7] | |
; mask branch BB6_214 | |
s_cbranch_execz BB6_214 | |
BB6_213: ; %.thread85.i410 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v29, v10 offset:64 | |
ds_read_b32 v30, v10 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v29, v29, v30 | |
ds_write_b32 v10, v29 | |
ds_read_b32 v27, v27 offset:64 | |
s_waitcnt lgkmcnt(0) | |
ds_read_b32 v29, v10 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v27, v27, v29 | |
ds_write_b32 v10, v27 offset:256 | |
s_waitcnt lgkmcnt(0) | |
ds_read_b32 v27, v28 offset:64 | |
ds_read_b32 v28, v10 offset:512 | |
v_mov_b32_e32 v29, -1 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v27, v27, v28 | |
ds_write_b32 v10, v27 offset:512 | |
s_waitcnt lgkmcnt(0) | |
BB6_214: ; %Flow1200 | |
s_or_b64 exec, exec, s[6:7] | |
v_cmp_ne_u32_e32 vcc, 0, v29 | |
s_and_saveexec_b64 s[6:7], vcc | |
s_xor_b64 s[6:7], exec, s[6:7] | |
; mask branch BB6_219 | |
s_cbranch_execz BB6_219 | |
BB6_215: | |
v_or_b32_e32 v27, 24, v9 | |
v_add_i32_e32 v27, vcc, v27, v2 | |
v_mul_lo_i32 v28, v27, 3 | |
v_mov_b32_e32 v27, 0xe0 | |
v_mad_i32_i24 v27, v27, v1, v10 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v29, v27 | |
ds_read_b32 v27, v27 offset:32 | |
v_add_i32_e32 v28, vcc, v1, v28 | |
s_mov_b32 s11, 0xf000 | |
s_mov_b32 s10, 0 | |
s_mov_b64 s[8:9], s[28:29] | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v27, v29, v27 | |
v_ashrrev_i32_e32 v29, 31, v28 | |
v_lshl_b64 v[30:31], v[28:29], 2 | |
v_add_i32_e32 v28, vcc, s28, v30 | |
v_mov_b32_e32 v29, s29 | |
v_addc_u32_e32 v29, vcc, v31, v29, vcc | |
buffer_load_dword v31, v[30:31], s[8:11], 0 addr64 | |
s_mov_b64 s[8:9], 0 | |
s_mov_b64 s[12:13], s[8:9] | |
s_waitcnt vmcnt(0) | |
BB6_216: ; =>This Inner Loop Header: Depth=1 | |
v_mov_b32_e32 v30, -1 | |
v_add_f32_e32 v30, v27, v31 | |
v_mov_b32_e32 v33, v31 | |
v_mov_b32_e32 v32, v30 | |
buffer_atomic_cmpswap v[32:33], v[28:29], s[8:11], 0 addr64 glc | |
v_mov_b32_e32 v30, 0xf000 | |
s_waitcnt vmcnt(0) expcnt(0) | |
v_cmp_eq_u32_e32 vcc, v32, v31 | |
s_or_b64 s[12:13], vcc, s[12:13] | |
v_mov_b32_e32 v31, v32 | |
s_andn2_b64 exec, exec, s[12:13] | |
s_cbranch_execnz BB6_216 | |
; BB#217: ; %atomicAdd_g_f.exit.i398 | |
s_or_b64 exec, exec, s[12:13] | |
s_and_b64 s[8:9], exec, s[2:3] | |
v_cndmask_b32_e64 v28, 0, 1, s[8:9] | |
v_cmp_ne_u32_e32 vcc, 1, v28 | |
s_and_b64 vcc, exec, vcc | |
s_cbranch_vccnz BB6_219 | |
; BB#218: | |
v_add_f32_e32 v3, v27, v3 | |
BB6_219: ; %Flow1201 | |
s_or_b64 exec, exec, s[6:7] | |
BB6_220: ; %reduce_force_i_pow2.exit412 | |
s_or_b64 exec, exec, s[4:5] | |
s_mov_b32 m0, -1 | |
s_barrier | |
ds_write_b32 v6, v24 | |
ds_write_b32 v7, v25 | |
ds_write_b32 v12, v26 | |
s_waitcnt lgkmcnt(0) | |
s_barrier | |
s_and_saveexec_b64 s[4:5], s[0:1] | |
s_xor_b64 s[4:5], exec, s[4:5] | |
; mask branch BB6_231 | |
s_cbranch_execz BB6_231 | |
BB6_221: | |
s_mov_b32 m0, -1 | |
ds_read_b32 v24, v10 offset:128 | |
ds_read_b32 v25, v10 | |
v_add_i32_e32 v26, vcc, v11, v23 | |
v_lshlrev_b32_e32 v26, 2, v26 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v24, v24, v25 | |
ds_write_b32 v10, v24 | |
s_waitcnt lgkmcnt(0) | |
v_add_i32_e32 v24, vcc, s15, v26 | |
ds_read_b32 v25, v24 offset:128 | |
ds_read_b32 v26, v10 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v25, v25, v26 | |
ds_write_b32 v10, v25 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_i32_e32 v25, vcc, v11, v19 | |
v_lshlrev_b32_e32 v25, 2, v25 | |
v_add_i32_e32 v25, vcc, s15, v25 | |
ds_read_b32 v26, v25 offset:128 | |
ds_read_b32 v27, v10 offset:512 | |
v_cmp_lt_i32_e32 vcc, 1, v1 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v26, v26, v27 | |
ds_write_b32 v10, v26 offset:512 | |
s_waitcnt lgkmcnt(0) | |
v_mov_b32_e32 v26, 0 | |
s_and_saveexec_b64 s[6:7], vcc | |
s_xor_b64 s[6:7], exec, s[6:7] | |
; mask branch BB6_223 | |
BB6_222: | |
v_cmp_eq_u32_e32 vcc, 2, v1 | |
v_cndmask_b32_e64 v26, 0, -1, vcc | |
BB6_223: ; %Flow1196 | |
s_or_saveexec_b64 s[6:7], s[6:7] | |
s_xor_b64 exec, exec, s[6:7] | |
; mask branch BB6_225 | |
s_cbranch_execz BB6_225 | |
BB6_224: ; %.thread85.i361 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v26, v10 offset:64 | |
ds_read_b32 v27, v10 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v26, v26, v27 | |
ds_write_b32 v10, v26 | |
ds_read_b32 v24, v24 offset:64 | |
s_waitcnt lgkmcnt(0) | |
ds_read_b32 v26, v10 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v24, v24, v26 | |
ds_write_b32 v10, v24 offset:256 | |
s_waitcnt lgkmcnt(0) | |
ds_read_b32 v24, v25 offset:64 | |
ds_read_b32 v25, v10 offset:512 | |
v_mov_b32_e32 v26, -1 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v24, v24, v25 | |
ds_write_b32 v10, v24 offset:512 | |
s_waitcnt lgkmcnt(0) | |
BB6_225: ; %Flow1197 | |
s_or_b64 exec, exec, s[6:7] | |
v_cmp_ne_u32_e32 vcc, 0, v26 | |
s_and_saveexec_b64 s[6:7], vcc | |
s_xor_b64 s[6:7], exec, s[6:7] | |
; mask branch BB6_230 | |
s_cbranch_execz BB6_230 | |
BB6_226: | |
v_or_b32_e32 v24, 32, v9 | |
v_add_i32_e32 v24, vcc, v24, v2 | |
v_mul_lo_i32 v25, v24, 3 | |
v_mov_b32_e32 v24, 0xe0 | |
v_mad_i32_i24 v24, v24, v1, v10 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v26, v24 | |
ds_read_b32 v24, v24 offset:32 | |
v_add_i32_e32 v25, vcc, v1, v25 | |
s_mov_b32 s11, 0xf000 | |
s_mov_b32 s10, 0 | |
s_mov_b64 s[8:9], s[28:29] | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v24, v26, v24 | |
v_ashrrev_i32_e32 v26, 31, v25 | |
v_lshl_b64 v[27:28], v[25:26], 2 | |
v_add_i32_e32 v25, vcc, s28, v27 | |
v_mov_b32_e32 v26, s29 | |
v_addc_u32_e32 v26, vcc, v28, v26, vcc | |
buffer_load_dword v28, v[27:28], s[8:11], 0 addr64 | |
s_mov_b64 s[8:9], 0 | |
s_mov_b64 s[12:13], s[8:9] | |
s_waitcnt vmcnt(0) | |
BB6_227: ; =>This Inner Loop Header: Depth=1 | |
v_mov_b32_e32 v27, -1 | |
v_add_f32_e32 v27, v24, v28 | |
v_mov_b32_e32 v30, v28 | |
v_mov_b32_e32 v29, v27 | |
buffer_atomic_cmpswap v[29:30], v[25:26], s[8:11], 0 addr64 glc | |
v_mov_b32_e32 v27, 0xf000 | |
s_waitcnt vmcnt(0) expcnt(0) | |
v_cmp_eq_u32_e32 vcc, v29, v28 | |
s_or_b64 s[12:13], vcc, s[12:13] | |
v_mov_b32_e32 v28, v29 | |
s_andn2_b64 exec, exec, s[12:13] | |
s_cbranch_execnz BB6_227 | |
; BB#228: ; %atomicAdd_g_f.exit.i349 | |
s_or_b64 exec, exec, s[12:13] | |
s_and_b64 s[8:9], exec, s[2:3] | |
v_cndmask_b32_e64 v25, 0, 1, s[8:9] | |
v_cmp_ne_u32_e32 vcc, 1, v25 | |
s_and_b64 vcc, exec, vcc | |
s_cbranch_vccnz BB6_230 | |
; BB#229: | |
v_add_f32_e32 v3, v24, v3 | |
BB6_230: ; %Flow1198 | |
s_or_b64 exec, exec, s[6:7] | |
BB6_231: ; %reduce_force_i_pow2.exit363 | |
s_or_b64 exec, exec, s[4:5] | |
s_mov_b32 m0, -1 | |
s_barrier | |
ds_write_b32 v6, v20 | |
ds_write_b32 v7, v21 | |
ds_write_b32 v12, v22 | |
s_waitcnt lgkmcnt(0) | |
s_barrier | |
s_and_saveexec_b64 s[4:5], s[0:1] | |
s_xor_b64 s[4:5], exec, s[4:5] | |
; mask branch BB6_242 | |
s_cbranch_execz BB6_242 | |
BB6_232: | |
s_mov_b32 m0, -1 | |
ds_read_b32 v20, v10 offset:128 | |
ds_read_b32 v21, v10 | |
v_add_i32_e32 v22, vcc, v11, v23 | |
v_lshlrev_b32_e32 v22, 2, v22 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v20, v20, v21 | |
ds_write_b32 v10, v20 | |
s_waitcnt lgkmcnt(0) | |
v_add_i32_e32 v20, vcc, s15, v22 | |
ds_read_b32 v21, v20 offset:128 | |
ds_read_b32 v22, v10 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v21, v21, v22 | |
ds_write_b32 v10, v21 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_i32_e32 v21, vcc, v11, v19 | |
v_lshlrev_b32_e32 v21, 2, v21 | |
v_add_i32_e32 v21, vcc, s15, v21 | |
ds_read_b32 v22, v21 offset:128 | |
ds_read_b32 v24, v10 offset:512 | |
v_cmp_lt_i32_e32 vcc, 1, v1 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v22, v22, v24 | |
ds_write_b32 v10, v22 offset:512 | |
s_waitcnt lgkmcnt(0) | |
v_mov_b32_e32 v22, 0 | |
s_and_saveexec_b64 s[6:7], vcc | |
s_xor_b64 s[6:7], exec, s[6:7] | |
; mask branch BB6_234 | |
BB6_233: | |
v_cmp_eq_u32_e32 vcc, 2, v1 | |
v_cndmask_b32_e64 v22, 0, -1, vcc | |
BB6_234: ; %Flow1193 | |
s_or_saveexec_b64 s[6:7], s[6:7] | |
s_xor_b64 exec, exec, s[6:7] | |
; mask branch BB6_236 | |
s_cbranch_execz BB6_236 | |
BB6_235: ; %.thread85.i312 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v22, v10 offset:64 | |
ds_read_b32 v24, v10 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v22, v22, v24 | |
ds_write_b32 v10, v22 | |
ds_read_b32 v20, v20 offset:64 | |
s_waitcnt lgkmcnt(0) | |
ds_read_b32 v22, v10 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v20, v20, v22 | |
ds_write_b32 v10, v20 offset:256 | |
s_waitcnt lgkmcnt(0) | |
ds_read_b32 v20, v21 offset:64 | |
ds_read_b32 v21, v10 offset:512 | |
v_mov_b32_e32 v22, -1 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v20, v20, v21 | |
ds_write_b32 v10, v20 offset:512 | |
s_waitcnt lgkmcnt(0) | |
BB6_236: ; %Flow1194 | |
s_or_b64 exec, exec, s[6:7] | |
v_cmp_ne_u32_e32 vcc, 0, v22 | |
s_and_saveexec_b64 s[6:7], vcc | |
s_xor_b64 s[6:7], exec, s[6:7] | |
; mask branch BB6_241 | |
s_cbranch_execz BB6_241 | |
BB6_237: | |
v_or_b32_e32 v20, 40, v9 | |
v_add_i32_e32 v20, vcc, v20, v2 | |
v_mul_lo_i32 v21, v20, 3 | |
v_mov_b32_e32 v20, 0xe0 | |
v_mad_i32_i24 v20, v20, v1, v10 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v22, v20 | |
ds_read_b32 v20, v20 offset:32 | |
v_add_i32_e32 v21, vcc, v1, v21 | |
v_mov_b32_e32 v25, s29 | |
s_mov_b32 s11, 0xf000 | |
s_mov_b32 s10, 0 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v20, v22, v20 | |
v_ashrrev_i32_e32 v22, 31, v21 | |
v_lshl_b64 v[21:22], v[21:22], 2 | |
v_add_i32_e32 v24, vcc, s28, v21 | |
s_mov_b64 s[8:9], s[28:29] | |
v_addc_u32_e32 v25, vcc, v22, v25, vcc | |
buffer_load_dword v22, v[21:22], s[8:11], 0 addr64 | |
s_mov_b64 s[8:9], 0 | |
s_mov_b64 s[12:13], s[8:9] | |
s_waitcnt vmcnt(0) | |
BB6_238: ; =>This Inner Loop Header: Depth=1 | |
v_mov_b32_e32 v21, -1 | |
v_add_f32_e32 v21, v20, v22 | |
v_mov_b32_e32 v27, v22 | |
v_mov_b32_e32 v26, v21 | |
buffer_atomic_cmpswap v[26:27], v[24:25], s[8:11], 0 addr64 glc | |
v_mov_b32_e32 v21, 0xf000 | |
s_waitcnt vmcnt(0) expcnt(0) | |
v_cmp_eq_u32_e32 vcc, v26, v22 | |
s_or_b64 s[12:13], vcc, s[12:13] | |
v_mov_b32_e32 v22, v26 | |
s_andn2_b64 exec, exec, s[12:13] | |
s_cbranch_execnz BB6_238 | |
; BB#239: ; %atomicAdd_g_f.exit.i300 | |
s_or_b64 exec, exec, s[12:13] | |
s_and_b64 s[8:9], exec, s[2:3] | |
v_cndmask_b32_e64 v21, 0, 1, s[8:9] | |
v_cmp_ne_u32_e32 vcc, 1, v21 | |
s_and_b64 vcc, exec, vcc | |
s_cbranch_vccnz BB6_241 | |
; BB#240: | |
v_add_f32_e32 v3, v20, v3 | |
BB6_241: ; %Flow1195 | |
s_or_b64 exec, exec, s[6:7] | |
BB6_242: ; %reduce_force_i_pow2.exit314 | |
s_or_b64 exec, exec, s[4:5] | |
s_mov_b32 m0, -1 | |
s_barrier | |
ds_write_b32 v6, v16 | |
ds_write_b32 v7, v17 | |
ds_write_b32 v12, v18 | |
s_waitcnt lgkmcnt(0) | |
s_barrier | |
s_and_saveexec_b64 s[4:5], s[0:1] | |
s_xor_b64 s[4:5], exec, s[4:5] | |
; mask branch BB6_253 | |
s_cbranch_execz BB6_253 | |
BB6_243: | |
s_mov_b32 m0, -1 | |
ds_read_b32 v16, v10 offset:128 | |
ds_read_b32 v17, v10 | |
v_add_i32_e32 v18, vcc, v11, v23 | |
v_lshlrev_b32_e32 v18, 2, v18 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v16, v16, v17 | |
ds_write_b32 v10, v16 | |
s_waitcnt lgkmcnt(0) | |
v_add_i32_e32 v16, vcc, s15, v18 | |
ds_read_b32 v17, v16 offset:128 | |
ds_read_b32 v18, v10 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v17, v17, v18 | |
ds_write_b32 v10, v17 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_i32_e32 v17, vcc, v11, v19 | |
v_lshlrev_b32_e32 v17, 2, v17 | |
v_add_i32_e32 v17, vcc, s15, v17 | |
ds_read_b32 v18, v17 offset:128 | |
ds_read_b32 v20, v10 offset:512 | |
v_cmp_lt_i32_e32 vcc, 1, v1 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v18, v18, v20 | |
ds_write_b32 v10, v18 offset:512 | |
s_waitcnt lgkmcnt(0) | |
v_mov_b32_e32 v18, 0 | |
s_and_saveexec_b64 s[6:7], vcc | |
s_xor_b64 s[6:7], exec, s[6:7] | |
; mask branch BB6_245 | |
BB6_244: | |
v_cmp_eq_u32_e32 vcc, 2, v1 | |
v_cndmask_b32_e64 v18, 0, -1, vcc | |
BB6_245: ; %Flow1190 | |
s_or_saveexec_b64 s[6:7], s[6:7] | |
s_xor_b64 exec, exec, s[6:7] | |
; mask branch BB6_247 | |
s_cbranch_execz BB6_247 | |
BB6_246: ; %.thread85.i263 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v18, v10 offset:64 | |
ds_read_b32 v20, v10 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v18, v18, v20 | |
ds_write_b32 v10, v18 | |
ds_read_b32 v16, v16 offset:64 | |
s_waitcnt lgkmcnt(0) | |
ds_read_b32 v18, v10 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v16, v16, v18 | |
ds_write_b32 v10, v16 offset:256 | |
s_waitcnt lgkmcnt(0) | |
ds_read_b32 v16, v17 offset:64 | |
ds_read_b32 v17, v10 offset:512 | |
v_mov_b32_e32 v18, -1 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v16, v16, v17 | |
ds_write_b32 v10, v16 offset:512 | |
s_waitcnt lgkmcnt(0) | |
BB6_247: ; %Flow1191 | |
s_or_b64 exec, exec, s[6:7] | |
v_cmp_ne_u32_e32 vcc, 0, v18 | |
s_and_saveexec_b64 s[6:7], vcc | |
s_xor_b64 s[6:7], exec, s[6:7] | |
; mask branch BB6_252 | |
s_cbranch_execz BB6_252 | |
BB6_248: | |
v_or_b32_e32 v16, 48, v9 | |
v_add_i32_e32 v16, vcc, v16, v2 | |
v_mul_lo_i32 v17, v16, 3 | |
v_mov_b32_e32 v16, 0xe0 | |
v_mad_i32_i24 v16, v16, v1, v10 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v18, v16 | |
ds_read_b32 v16, v16 offset:32 | |
v_add_i32_e32 v17, vcc, v1, v17 | |
v_mov_b32_e32 v20, s29 | |
s_mov_b32 s11, 0xf000 | |
s_mov_b32 s10, 0 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v16, v18, v16 | |
v_ashrrev_i32_e32 v18, 31, v17 | |
v_lshl_b64 v[17:18], v[17:18], 2 | |
v_add_i32_e32 v24, vcc, s28, v17 | |
s_mov_b64 s[8:9], s[28:29] | |
v_addc_u32_e32 v25, vcc, v18, v20, vcc | |
buffer_load_dword v18, v[17:18], s[8:11], 0 addr64 | |
s_mov_b64 s[8:9], 0 | |
s_mov_b64 s[12:13], s[8:9] | |
s_waitcnt vmcnt(0) | |
BB6_249: ; =>This Inner Loop Header: Depth=1 | |
v_mov_b32_e32 v17, -1 | |
v_add_f32_e32 v17, v16, v18 | |
v_mov_b32_e32 v21, v18 | |
v_mov_b32_e32 v20, v17 | |
buffer_atomic_cmpswap v[20:21], v[24:25], s[8:11], 0 addr64 glc | |
v_mov_b32_e32 v17, 0xf000 | |
s_waitcnt vmcnt(0) expcnt(0) | |
v_cmp_eq_u32_e32 vcc, v20, v18 | |
s_or_b64 s[12:13], vcc, s[12:13] | |
v_mov_b32_e32 v18, v20 | |
s_andn2_b64 exec, exec, s[12:13] | |
s_cbranch_execnz BB6_249 | |
; BB#250: ; %atomicAdd_g_f.exit.i251 | |
s_or_b64 exec, exec, s[12:13] | |
s_and_b64 s[8:9], exec, s[2:3] | |
v_cndmask_b32_e64 v17, 0, 1, s[8:9] | |
v_cmp_ne_u32_e32 vcc, 1, v17 | |
s_and_b64 vcc, exec, vcc | |
s_cbranch_vccnz BB6_252 | |
; BB#251: | |
v_add_f32_e32 v3, v16, v3 | |
BB6_252: ; %Flow1192 | |
s_or_b64 exec, exec, s[6:7] | |
BB6_253: ; %reduce_force_i_pow2.exit265 | |
s_or_b64 exec, exec, s[4:5] | |
s_mov_b32 m0, -1 | |
s_barrier | |
ds_write_b32 v6, v13 | |
ds_write_b32 v7, v14 | |
ds_write_b32 v12, v15 | |
s_waitcnt lgkmcnt(0) | |
s_barrier | |
s_and_saveexec_b64 s[4:5], s[0:1] | |
s_xor_b64 s[0:1], exec, s[4:5] | |
; mask branch BB6_264 | |
s_cbranch_execz BB6_264 | |
BB6_254: | |
s_mov_b32 m0, -1 | |
ds_read_b32 v12, v10 offset:128 | |
ds_read_b32 v13, v10 | |
v_add_i32_e32 v14, vcc, v11, v23 | |
v_lshlrev_b32_e32 v14, 2, v14 | |
v_add_i32_e32 v11, vcc, v11, v19 | |
v_lshlrev_b32_e32 v11, 2, v11 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v12, v12, v13 | |
ds_write_b32 v10, v12 | |
s_waitcnt lgkmcnt(0) | |
v_add_i32_e32 v12, vcc, s15, v14 | |
ds_read_b32 v13, v12 offset:128 | |
ds_read_b32 v14, v10 offset:256 | |
v_add_i32_e32 v11, vcc, s15, v11 | |
v_cmp_lt_i32_e32 vcc, 1, v1 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v13, v13, v14 | |
ds_write_b32 v10, v13 offset:256 | |
s_waitcnt lgkmcnt(0) | |
ds_read_b32 v13, v11 offset:128 | |
ds_read_b32 v14, v10 offset:512 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v13, v13, v14 | |
ds_write_b32 v10, v13 offset:512 | |
s_waitcnt lgkmcnt(0) | |
v_mov_b32_e32 v13, 0 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[4:5], exec, s[4:5] | |
; mask branch BB6_256 | |
BB6_255: | |
v_cmp_eq_u32_e32 vcc, 2, v1 | |
v_cndmask_b32_e64 v13, 0, -1, vcc | |
BB6_256: ; %Flow1187 | |
s_or_saveexec_b64 s[4:5], s[4:5] | |
s_xor_b64 exec, exec, s[4:5] | |
; mask branch BB6_258 | |
s_cbranch_execz BB6_258 | |
BB6_257: ; %.thread85.i214 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v13, v10 offset:64 | |
ds_read_b32 v14, v10 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v13, v13, v14 | |
ds_write_b32 v10, v13 | |
ds_read_b32 v12, v12 offset:64 | |
s_waitcnt lgkmcnt(0) | |
ds_read_b32 v13, v10 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v12, v12, v13 | |
ds_write_b32 v10, v12 offset:256 | |
ds_read_b32 v11, v11 offset:64 | |
s_waitcnt lgkmcnt(0) | |
ds_read_b32 v12, v10 offset:512 | |
v_mov_b32_e32 v13, -1 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v11, v11, v12 | |
ds_write_b32 v10, v11 offset:512 | |
s_waitcnt lgkmcnt(0) | |
BB6_258: ; %Flow1188 | |
s_or_b64 exec, exec, s[4:5] | |
v_cmp_ne_u32_e32 vcc, 0, v13 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[4:5], exec, s[4:5] | |
; mask branch BB6_263 | |
s_cbranch_execz BB6_263 | |
BB6_259: | |
v_or_b32_e32 v9, 56, v9 | |
v_add_i32_e32 v2, vcc, v9, v2 | |
v_mul_lo_i32 v9, v2, 3 | |
v_mov_b32_e32 v2, 0xe0 | |
v_mad_i32_i24 v2, v2, v1, v10 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v10, v2 | |
ds_read_b32 v2, v2 offset:32 | |
v_add_i32_e32 v9, vcc, v1, v9 | |
s_mov_b32 s31, 0xf000 | |
s_mov_b32 s30, 0 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v2, v10, v2 | |
v_ashrrev_i32_e32 v10, 31, v9 | |
v_lshl_b64 v[11:12], v[9:10], 2 | |
v_add_i32_e32 v9, vcc, s28, v11 | |
v_mov_b32_e32 v10, s29 | |
v_addc_u32_e32 v10, vcc, v12, v10, vcc | |
buffer_load_dword v12, v[11:12], s[28:31], 0 addr64 | |
s_mov_b64 s[28:29], 0 | |
s_mov_b64 s[6:7], s[28:29] | |
s_waitcnt vmcnt(0) | |
BB6_260: ; =>This Inner Loop Header: Depth=1 | |
v_mov_b32_e32 v11, -1 | |
v_add_f32_e32 v11, v2, v12 | |
v_mov_b32_e32 v14, v12 | |
v_mov_b32_e32 v13, v11 | |
buffer_atomic_cmpswap v[13:14], v[9:10], s[28:31], 0 addr64 glc | |
v_mov_b32_e32 v11, 0xf000 | |
s_waitcnt vmcnt(0) expcnt(0) | |
v_cmp_eq_u32_e32 vcc, v13, v12 | |
s_or_b64 s[6:7], vcc, s[6:7] | |
v_mov_b32_e32 v12, v13 | |
s_andn2_b64 exec, exec, s[6:7] | |
s_cbranch_execnz BB6_260 | |
; BB#261: ; %atomicAdd_g_f.exit.i202 | |
s_or_b64 exec, exec, s[6:7] | |
s_and_b64 s[6:7], exec, s[2:3] | |
v_cndmask_b32_e64 v9, 0, 1, s[6:7] | |
v_cmp_ne_u32_e32 vcc, 1, v9 | |
s_and_b64 vcc, exec, vcc | |
s_cbranch_vccnz BB6_263 | |
; BB#262: | |
v_add_f32_e32 v3, v2, v3 | |
BB6_263: ; %Flow1189 | |
s_or_b64 exec, exec, s[4:5] | |
BB6_264: ; %reduce_force_i_pow2.exit216 | |
s_or_b64 exec, exec, s[0:1] | |
s_barrier | |
v_cmp_gt_u32_e32 vcc, 3, v1 | |
s_and_b64 s[0:1], exec, s[2:3] | |
s_and_b64 s[0:1], vcc, s[0:1] | |
s_and_saveexec_b64 s[2:3], s[0:1] | |
s_xor_b64 s[0:1], exec, s[2:3] | |
; mask branch BB6_268 | |
s_cbranch_execz BB6_268 | |
BB6_265: | |
v_add_i32_e32 v1, vcc, v4, v1 | |
v_mov_b32_e32 v2, 0 | |
v_lshl_b64 v[1:2], v[1:2], 2 | |
v_add_i32_e32 v9, vcc, s24, v1 | |
v_mov_b32_e32 v4, s25 | |
s_mov_b32 s27, 0xf000 | |
s_mov_b32 s26, 0 | |
v_addc_u32_e32 v10, vcc, v2, v4, vcc | |
buffer_load_dword v2, v[1:2], s[24:27], 0 addr64 | |
s_mov_b64 s[24:25], 0 | |
s_mov_b64 s[2:3], s[24:25] | |
s_waitcnt vmcnt(0) | |
BB6_266: ; =>This Inner Loop Header: Depth=1 | |
v_mov_b32_e32 v1, -1 | |
v_add_f32_e32 v1, v3, v2 | |
v_mov_b32_e32 v12, v2 | |
v_mov_b32_e32 v11, v1 | |
buffer_atomic_cmpswap v[11:12], v[9:10], s[24:27], 0 addr64 glc | |
v_mov_b32_e32 v1, 0xf000 | |
s_waitcnt vmcnt(0) expcnt(0) | |
v_cmp_eq_u32_e32 vcc, v11, v2 | |
s_or_b64 s[2:3], vcc, s[2:3] | |
v_mov_b32_e32 v2, v11 | |
s_andn2_b64 exec, exec, s[2:3] | |
s_cbranch_execnz BB6_266 | |
; BB#267: ; %Flow1185 | |
s_or_b64 exec, exec, s[2:3] | |
BB6_268: ; %Flow1186 | |
s_or_b64 exec, exec, s[0:1] | |
v_and_b32_e32 v1, 0x7ffffdf, v0 | |
s_mov_b32 m0, -1 | |
v_cmp_gt_u32_e32 vcc, 16, v1 | |
ds_write_b32 v6, v8 | |
ds_write_b32 v7, v5 | |
s_and_saveexec_b64 s[0:1], vcc | |
s_xor_b64 s[0:1], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
; mask branch BB6_282 | |
s_cbranch_execz BB6_282 | |
BB6_269: | |
v_and_b32_e32 v0, 32, v0 | |
v_lshlrev_b32_e32 v0, 2, v0 | |
v_add_i32_e32 v0, vcc, s14, v0 | |
v_add_i32_e32 v0, vcc, 0x620, v0 | |
v_lshlrev_b32_e32 v2, 2, v1 | |
v_add_i32_e32 v2, vcc, v2, v0 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v3, v2 offset:64 | |
ds_read_b32 v4, v2 | |
v_cmp_gt_u32_e32 vcc, 8, v1 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v3, v3, v4 | |
ds_write_b32 v2, v3 | |
s_waitcnt lgkmcnt(0) | |
ds_read_b32 v3, v2 offset:320 | |
ds_read_b32 v4, v2 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v3, v3, v4 | |
ds_write_b32 v2, v3 offset:256 | |
s_and_saveexec_b64 s[2:3], vcc | |
s_xor_b64 s[2:3], exec, s[2:3] | |
s_waitcnt lgkmcnt(0) | |
; mask branch BB6_281 | |
s_cbranch_execz BB6_281 | |
BB6_270: | |
s_mov_b32 m0, -1 | |
ds_read_b32 v3, v2 offset:32 | |
ds_read_b32 v4, v2 | |
v_cmp_gt_u32_e32 vcc, 4, v1 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v3, v3, v4 | |
ds_write_b32 v2, v3 | |
s_waitcnt lgkmcnt(0) | |
ds_read_b32 v3, v2 offset:288 | |
ds_read_b32 v4, v2 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v3, v3, v4 | |
ds_write_b32 v2, v3 offset:256 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[4:5], exec, s[4:5] | |
s_waitcnt lgkmcnt(0) | |
; mask branch BB6_280 | |
s_cbranch_execz BB6_280 | |
BB6_271: | |
s_mov_b32 m0, -1 | |
ds_read_b32 v3, v2 offset:16 | |
ds_read_b32 v4, v2 | |
v_cmp_gt_u32_e32 vcc, 2, v1 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v3, v3, v4 | |
ds_write_b32 v2, v3 | |
s_waitcnt lgkmcnt(0) | |
ds_read_b32 v3, v2 offset:272 | |
ds_read_b32 v4, v2 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v3, v3, v4 | |
ds_write_b32 v2, v3 offset:256 | |
s_and_saveexec_b64 s[6:7], vcc | |
s_xor_b64 s[6:7], exec, s[6:7] | |
s_waitcnt lgkmcnt(0) | |
; mask branch BB6_279 | |
s_cbranch_execz BB6_279 | |
BB6_272: | |
s_mov_b32 m0, -1 | |
ds_read_b32 v3, v2 offset:8 | |
ds_read_b32 v4, v2 | |
v_cmp_eq_u32_e32 vcc, 0, v1 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v3, v3, v4 | |
ds_write_b32 v2, v3 | |
s_waitcnt lgkmcnt(0) | |
ds_read_b32 v3, v2 offset:264 | |
ds_read_b32 v4, v2 offset:256 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v3, v3, v4 | |
ds_write_b32 v2, v3 offset:256 | |
s_and_saveexec_b64 s[8:9], vcc | |
s_xor_b64 s[8:9], exec, s[8:9] | |
s_waitcnt lgkmcnt(0) | |
; mask branch BB6_278 | |
s_cbranch_execz BB6_278 | |
BB6_273: | |
s_mov_b32 m0, -1 | |
s_mov_b32 s23, 0xf000 | |
s_mov_b32 s22, -1 | |
ds_read_b32 v2, v0 | |
ds_read_b32 v3, v0 offset:4 | |
ds_read_b32 v1, v0 offset:256 | |
ds_read_b32 v0, v0 offset:260 | |
buffer_load_dword v4, off, s[20:23], 0 | |
s_mov_b64 s[10:11], 0 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v2, v2, v3 | |
s_waitcnt vmcnt(0) | |
BB6_274: ; =>This Inner Loop Header: Depth=1 | |
v_add_f32_e32 v3, v4, v2 | |
v_mov_b32_e32 v6, v4 | |
v_mov_b32_e32 v5, v3 | |
buffer_atomic_cmpswap v[5:6], off, s[20:23], 0 glc | |
s_waitcnt vmcnt(0) expcnt(0) | |
v_cmp_eq_u32_e32 vcc, v5, v4 | |
s_or_b64 s[10:11], vcc, s[10:11] | |
v_mov_b32_e32 v4, v5 | |
s_andn2_b64 exec, exec, s[10:11] | |
s_cbranch_execnz BB6_274 | |
; BB#275: ; %atomicAdd_g_f.exit.i161 | |
s_or_b64 exec, exec, s[10:11] | |
s_mov_b32 s19, 0xf000 | |
s_mov_b32 s18, -1 | |
buffer_load_dword v2, off, s[16:19], 0 | |
v_add_f32_e32 v0, v1, v0 | |
s_mov_b64 s[10:11], 0 | |
s_waitcnt vmcnt(0) | |
BB6_276: ; =>This Inner Loop Header: Depth=1 | |
v_add_f32_e32 v1, v2, v0 | |
v_mov_b32_e32 v4, v2 | |
v_mov_b32_e32 v3, v1 | |
buffer_atomic_cmpswap v[3:4], off, s[16:19], 0 glc | |
s_waitcnt vmcnt(0) expcnt(0) | |
v_cmp_eq_u32_e32 vcc, v3, v2 | |
s_or_b64 s[10:11], vcc, s[10:11] | |
v_mov_b32_e32 v2, v3 | |
s_andn2_b64 exec, exec, s[10:11] | |
s_cbranch_execnz BB6_276 | |
; BB#277: ; %Flow | |
s_or_b64 exec, exec, s[10:11] | |
BB6_278: ; %Flow1181 | |
s_or_b64 exec, exec, s[8:9] | |
BB6_279: ; %Flow1182 | |
s_or_b64 exec, exec, s[6:7] | |
BB6_280: ; %Flow1183 | |
s_or_b64 exec, exec, s[4:5] | |
BB6_281: ; %Flow1184 | |
s_or_b64 exec, exec, s[2:3] | |
BB6_282: ; %reduce_energy_pow2.exit | |
s_or_b64 exec, exec, s[0:1] | |
s_endpgm | |
.Lfunc_end6: | |
.size nbnxn_kernel_ElecEw_VdwLJCombGeom_VF_opencl, .Lfunc_end6-nbnxn_kernel_ElecEw_VdwLJCombGeom_VF_opencl | |
.section .AMDGPU.csdata | |
; Kernel info: | |
; codeLenInByte = 62628 | |
; NumSgprs: 58 | |
; NumVgprs: 116 | |
; FloatMode: 192 | |
; IeeeMode: 1 | |
; ScratchSize: 0 | |
; LDSByteSize: 0 bytes/workgroup (compile time only) | |
; SGPRBlocks: 7 | |
; VGPRBlocks: 28 | |
; NumSGPRsForWavesPerEU: 58 | |
; NumVGPRsForWavesPerEU: 116 | |
; ReservedVGPRFirst: 0 | |
; ReservedVGPRCount: 0 | |
; COMPUTE_PGM_RSRC2:USER_SGPR: 8 | |
; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 | |
; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 | |
; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 | |
; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 1 | |
.section .AMDGPU.config | |
.long 47176 | |
.long 11272669 | |
.long 47180 | |
.long 2192 | |
.long 47200 | |
.long 0 | |
.long 4 | |
.long 0 | |
.long 8 | |
.long 0 | |
.text | |
.globl nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_VF_opencl | |
.p2align 8 | |
.type nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_VF_opencl,@function | |
.amdgpu_hsa_kernel nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_VF_opencl | |
nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_VF_opencl: ; @nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_VF_opencl | |
.amd_kernel_code_t | |
amd_code_version_major = 1 | |
amd_code_version_minor = 0 | |
amd_machine_kind = 1 | |
amd_machine_version_major = 7 | |
amd_machine_version_minor = 0 | |
amd_machine_version_stepping = 1 | |
kernel_code_entry_byte_offset = 256 | |
kernel_code_prefetch_byte_size = 0 | |
max_scratch_backing_memory_byte_size = 0 | |
granulated_workitem_vgpr_count = 29 | |
granulated_wavefront_sgpr_count = 7 | |
priority = 0 | |
float_mode = 192 | |
priv = 0 | |
enable_dx10_clamp = 1 | |
debug_mode = 0 | |
enable_ieee_mode = 1 | |
enable_sgpr_private_segment_wave_byte_offset = 0 | |
user_sgpr_count = 8 | |
enable_sgpr_workgroup_id_x = 1 | |
enable_sgpr_workgroup_id_y = 0 | |
enable_sgpr_workgroup_id_z = 0 | |
enable_sgpr_workgroup_info = 0 | |
enable_vgpr_workitem_id = 1 | |
enable_exception_msb = 0 | |
granulated_lds_size = 0 | |
enable_exception = 0 | |
enable_sgpr_private_segment_buffer = 1 | |
enable_sgpr_dispatch_ptr = 1 | |
enable_sgpr_queue_ptr = 0 | |
enable_sgpr_kernarg_segment_ptr = 1 | |
enable_sgpr_dispatch_id = 0 | |
enable_sgpr_flat_scratch_init = 0 | |
enable_sgpr_private_segment_size = 0 | |
enable_sgpr_grid_workgroup_count_x = 0 | |
enable_sgpr_grid_workgroup_count_y = 0 | |
enable_sgpr_grid_workgroup_count_z = 0 | |
enable_ordered_append_gds = 0 | |
private_element_size = 1 | |
is_ptr64 = 1 | |
is_dynamic_callstack = 0 | |
is_debug_enabled = 0 | |
is_xnack_enabled = 0 | |
workitem_private_segment_byte_size = 0 | |
workgroup_group_segment_byte_size = 0 | |
gds_segment_byte_size = 0 | |
kernarg_segment_byte_size = 232 | |
workgroup_fbarrier_count = 0 | |
wavefront_sgpr_count = 60 | |
workitem_vgpr_count = 117 | |
reserved_vgpr_first = 0 | |
reserved_vgpr_count = 0 | |
reserved_sgpr_first = 0 | |
reserved_sgpr_count = 0 | |
debug_wavefront_private_segment_offset_sgpr = 0 | |
debug_private_segment_buffer_sgpr = 0 | |
kernarg_segment_alignment = 4 | |
group_segment_alignment = 4 | |
private_segment_alignment = 4 | |
wavefront_size = 6 | |
call_convention = 0 | |
runtime_loader_kernel_symbol = 0 | |
.end_amd_kernel_code_t | |
; BB#0: | |
s_load_dwordx2 s[20:21], s[6:7], 0x2c | |
s_mov_b32 s9, 0 | |
s_lshl_b64 s[0:1], s[8:9], 4 | |
v_mov_b32_e32 v4, s1 | |
v_mov_b32_e32 v3, s0 | |
s_mov_b32 s23, 0xf000 | |
s_mov_b32 s22, s9 | |
s_waitcnt lgkmcnt(0) | |
buffer_load_dwordx4 v[36:39], v[3:4], s[20:23], 0 addr64 | |
v_mov_b32_e32 v2, v0 | |
s_load_dwordx2 s[0:1], s[6:7], 0x24 | |
s_load_dwordx2 s[32:33], s[6:7], 0x18 | |
s_mov_b64 s[34:35], s[22:23] | |
s_mov_b64 s[2:3], s[22:23] | |
s_load_dword s14, s[6:7], 0x33 | |
s_load_dwordx2 s[36:37], s[6:7], 0x22 | |
s_mov_b32 m0, -1 | |
s_mov_b64 s[38:39], s[22:23] | |
s_load_dword s18, s[6:7], 0x5 | |
s_waitcnt vmcnt(0) | |
v_lshlrev_b32_e32 v41, 3, v36 | |
v_mul_lo_i32 v4, v37, 3 | |
v_add_i32_e32 v0, vcc, v1, v41 | |
v_lshlrev_b32_e32 v0, 3, v0 | |
v_add_i32_e32 v9, vcc, v2, v0 | |
v_ashrrev_i32_e32 v10, 31, v9 | |
v_ashrrev_i32_e32 v5, 31, v4 | |
v_lshl_b64 v[11:12], v[4:5], 2 | |
v_lshl_b64 v[6:7], v[9:10], 4 | |
s_waitcnt lgkmcnt(0) | |
buffer_load_dwordx4 v[5:8], v[6:7], s[32:35], 0 addr64 | |
buffer_load_dwordx2 v[13:14], v[11:12], s[0:3], 0 addr64 | |
buffer_load_dword v0, v[11:12], s[0:3], 0 addr64 offset:8 | |
s_load_dword s2, s[6:7], 0x2 | |
v_lshlrev_b32_e32 v11, 3, v1 | |
v_add_i32_e32 v40, vcc, v2, v11 | |
s_load_dword s0, s[4:5], 0x1 | |
s_add_i32 s4, s14, 0x420 | |
s_waitcnt lgkmcnt(0) | |
s_and_b32 s0, s0, 0xffff | |
s_waitcnt vmcnt(1) | |
v_add_f32_e32 v15, v6, v14 | |
v_add_f32_e32 v14, v5, v13 | |
s_waitcnt vmcnt(0) | |
v_add_f32_e32 v5, v7, v0 | |
v_lshlrev_b32_e32 v0, 4, v40 | |
v_add_i32_e32 v3, vcc, s14, v0 | |
v_mul_f32_e32 v6, s2, v8 | |
ds_write2_b64 v3, v[14:15], v[5:6] offset1:1 | |
s_waitcnt lgkmcnt(0) | |
v_lshl_b64 v[5:6], v[9:10], 3 | |
buffer_load_dwordx2 v[5:6], v[5:6], s[36:39], 0 addr64 | |
v_mad_u32_u24 v0, s0, v1, v2 | |
v_lshlrev_b32_e32 v7, 3, v40 | |
v_add_i32_e32 v7, vcc, s4, v7 | |
v_or_b32_e32 v3, 32, v0 | |
v_lshrrev_b32_e32 v46, 5, v0 | |
v_cmp_eq_u32_e32 vcc, 32, v3 | |
s_waitcnt vmcnt(0) | |
ds_write_b64 v7, v[5:6] | |
s_and_saveexec_b64 s[0:1], vcc | |
s_xor_b64 s[0:1], exec, s[0:1] | |
s_waitcnt lgkmcnt(0) | |
; mask branch BB7_2 | |
BB7_1: | |
v_lshlrev_b32_e32 v3, 2, v46 | |
v_add_i32_e32 v3, vcc, s14, v3 | |
v_mov_b32_e32 v5, 0 | |
s_mov_b32 m0, -1 | |
ds_write_b32 v3, v5 offset:2336 | |
s_waitcnt lgkmcnt(0) | |
BB7_2: | |
s_or_b64 exec, exec, s[0:1] | |
s_barrier | |
s_load_dwordx2 s[40:41], s[6:7], 0x2e | |
v_cmp_ne_u32_e32 vcc, 22, v37 | |
s_and_b64 vcc, exec, vcc | |
v_mov_b32_e32 v3, 0 | |
v_cmp_eq_u32_e64 s[0:1], 22, v37 | |
s_waitcnt lgkmcnt(0) | |
s_mov_b64 vcc, vcc | |
s_cbranch_vccnz BB7_5 | |
; BB#3: | |
v_ashrrev_i32_e32 v6, 31, v38 | |
v_mov_b32_e32 v5, v38 | |
s_mov_b32 s43, 0xf000 | |
s_mov_b32 s42, 0 | |
v_lshl_b64 v[5:6], v[5:6], 5 | |
buffer_load_dword v5, v[5:6], s[40:43], 0 addr64 | |
s_waitcnt vmcnt(0) | |
v_cmp_ne_u32_e32 vcc, v5, v41 | |
s_and_b64 vcc, exec, vcc | |
v_mov_b32_e32 v5, v3 | |
s_cbranch_vccnz BB7_6 | |
; BB#4: ; %.preheader561.preheader | |
v_lshlrev_b32_e32 v5, 4, v2 | |
v_add_i32_e32 v9, vcc, s14, v5 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[5:8], v9 offset0:1 offset1:17 | |
s_waitcnt lgkmcnt(0) | |
v_mov_b32_e32 v5, 0x41000000 | |
ds_read2_b64 v[12:15], v9 offset0:33 offset1:49 | |
v_mul_f32_e32 v5, s2, v5 | |
ds_read2_b64 v[16:19], v9 offset0:65 offset1:81 | |
v_mad_f32 v10, v6, v6, 0 | |
v_mov_b32_e32 v6, 0x6f800000 | |
v_cmp_lt_f32_e64 vcc, v6, |v5| | |
v_mov_b32_e32 v6, 0x2f800000 | |
s_waitcnt lgkmcnt(0) | |
v_cndmask_b32_e32 v12, 1.0, v6, vcc | |
v_mac_f32_e32 v10, v8, v8 | |
v_mul_f32_e32 v5, v12, v5 | |
v_mac_f32_e32 v10, v13, v13 | |
v_rcp_f32_e32 v13, v5 | |
ds_read2_b64 v[5:8], v9 offset0:97 offset1:113 | |
v_mac_f32_e32 v10, v15, v15 | |
v_mac_f32_e32 v10, v17, v17 | |
v_mac_f32_e32 v10, v19, v19 | |
s_waitcnt lgkmcnt(0) | |
v_mac_f32_e32 v10, v6, v6 | |
v_mac_f32_e32 v10, v8, v8 | |
v_mul_f32_e32 v5, v13, v10 | |
v_mov_b32_e32 v6, 0xbf106ebb | |
v_mul_f32_e32 v5, v5, v12 | |
v_mul_f32_e32 v6, s18, v6 | |
v_mul_f32_e32 v5, v5, v6 | |
s_branch BB7_6 | |
BB7_5: | |
v_mov_b32_e32 v5, v3 | |
BB7_6: ; %.preheader560 | |
s_load_dwordx2 s[28:29], s[6:7], 0x1a | |
v_cmp_lt_i32_e32 vcc, v38, v39 | |
v_mov_b32_e32 v17, -1 | |
s_and_b64 vcc, exec, vcc | |
s_waitcnt lgkmcnt(0) | |
s_mov_b64 vcc, vcc | |
s_cbranch_vccnz BB7_8 | |
; BB#7: ; %.preheader560.._crit_edge_crit_edge | |
v_mov_b32_e32 v8, 0 | |
v_lshlrev_b32_e32 v6, 2, v0 | |
v_mov_b32_e32 v9, v8 | |
v_mov_b32_e32 v10, v8 | |
v_add_i32_e32 v7, vcc, s14, v6 | |
v_mov_b32_e32 v51, v11 | |
v_mov_b32_e32 v16, v11 | |
v_add_i32_e32 v6, vcc, 0x620, v7 | |
v_add_i32_e32 v12, vcc, 0x820, v7 | |
v_add_i32_e32 v7, vcc, 0x720, v7 | |
v_mov_b32_e32 v17, 0 | |
v_mov_b32_e32 v50, v10 | |
v_mov_b32_e32 v49, v9 | |
v_mov_b32_e32 v48, v8 | |
v_mov_b32_e32 v15, v10 | |
v_mov_b32_e32 v14, v9 | |
v_mov_b32_e32 v13, v8 | |
s_branch BB7_9 | |
BB7_8: | |
; implicit-def: %VGPR8 | |
; implicit-def: %VGPR48_VGPR49_VGPR50_VGPR51 | |
; implicit-def: %VGPR6 | |
; implicit-def: %VGPR12 | |
; implicit-def: %VGPR13_VGPR14_VGPR15_VGPR16 | |
; implicit-def: %VGPR7 | |
BB7_9: ; %Flow1256 | |
s_load_dwordx2 s[24:25], s[6:7], 0x20 | |
s_load_dwordx2 s[20:21], s[6:7], 0x1c | |
s_load_dwordx2 s[16:17], s[6:7], 0x1e | |
v_cmp_ne_u32_e32 vcc, 0, v17 | |
v_cndmask_b32_e64 v9, 0, 1, vcc | |
v_mov_b32_e32 v42, v48 | |
v_mov_b32_e32 v32, v48 | |
v_mov_b32_e32 v28, v48 | |
v_mov_b32_e32 v24, v48 | |
v_mov_b32_e32 v20, v48 | |
v_mov_b32_e32 v16, v48 | |
v_cmp_ne_u32_e32 vcc, 1, v9 | |
s_movk_i32 s5, 0x620 | |
s_add_i32 s15, s14, s5 | |
s_and_b64 vcc, exec, vcc | |
v_mov_b32_e32 v43, v49 | |
v_mov_b32_e32 v44, v50 | |
v_mov_b32_e32 v45, v51 | |
v_mov_b32_e32 v33, v49 | |
v_mov_b32_e32 v34, v50 | |
v_mov_b32_e32 v35, v51 | |
v_mov_b32_e32 v29, v49 | |
v_mov_b32_e32 v30, v50 | |
v_mov_b32_e32 v31, v51 | |
v_mov_b32_e32 v25, v49 | |
v_mov_b32_e32 v26, v50 | |
v_mov_b32_e32 v27, v51 | |
v_mov_b32_e32 v21, v49 | |
v_mov_b32_e32 v22, v50 | |
v_mov_b32_e32 v23, v51 | |
v_mov_b32_e32 v17, v49 | |
v_mov_b32_e32 v18, v50 | |
v_mov_b32_e32 v19, v51 | |
s_waitcnt lgkmcnt(0) | |
s_mov_b64 vcc, vcc | |
s_cbranch_vccnz BB7_176 | |
; BB#10: ; %.lr.ph | |
v_or_b32_e32 v6, 4, v1 | |
v_mov_b32_e32 v13, 0 | |
v_cmp_eq_u32_e32 vcc, 4, v6 | |
v_cmp_gt_u32_e64 s[2:3], 4, v2 | |
s_and_b64 s[48:49], s[2:3], vcc | |
v_add_i32_e32 v6, vcc, v1, v2 | |
v_and_b32_e32 v8, 4, v1 | |
s_load_dwordx2 s[22:23], s[6:7], 0x30 | |
s_load_dword s19, s[6:7], 0x6 | |
s_load_dword s26, s[6:7], 0x9 | |
s_load_dword s27, s[6:7], 0xa | |
s_load_dword s42, s[6:7], 0xf | |
s_load_dword s43, s[6:7], 0x12 | |
v_mov_b32_e32 v14, v13 | |
v_mov_b32_e32 v15, v13 | |
v_mov_b32_e32 v19, v16 | |
s_add_i32 s8, s14, 0x400 | |
v_lshlrev_b32_e32 v6, 2, v6 | |
v_lshlrev_b32_e32 v8, 2, v8 | |
v_add_i32_e32 v10, vcc, s8, v6 | |
v_lshlrev_b32_e32 v6, 2, v0 | |
v_add_i32_e32 v54, vcc, s8, v8 | |
v_lshlrev_b32_e32 v8, 4, v2 | |
v_mov_b32_e32 v18, v15 | |
v_mov_b32_e32 v17, v14 | |
v_mov_b32_e32 v16, v13 | |
v_add_i32_e32 v12, vcc, s14, v6 | |
v_mov_b32_e32 v23, v16 | |
v_mov_b32_e32 v27, v16 | |
v_mov_b32_e32 v31, v16 | |
v_mov_b32_e32 v35, v16 | |
v_mov_b32_e32 v45, v16 | |
v_mov_b32_e32 v51, v16 | |
v_mul_f32_e64 v9, s18, s18 | |
v_mov_b32_e32 v47, 0 | |
v_add_i32_e32 v55, vcc, s14, v8 | |
v_lshlrev_b32_e32 v8, 3, v2 | |
v_add_i32_e32 v56, vcc, s4, v8 | |
s_mov_b32 s46, 0 | |
v_and_b32_e32 v52, 31, v0 | |
v_mov_b32_e32 v53, v47 | |
v_cmp_gt_u32_e64 s[2:3], v1, v2 | |
v_mul_f32_e32 v37, s18, v9 | |
v_add_i32_e32 v6, vcc, s5, v12 | |
v_add_i32_e32 v7, vcc, 0x720, v12 | |
v_add_i32_e32 v12, vcc, 0x820, v12 | |
s_mov_b32 s47, 0xf000 | |
s_mov_b64 s[44:45], 0 | |
s_brev_b32 s50, -2 | |
s_mov_b32 s51, 0x7ffff000 | |
s_brev_b32 s52, 1 | |
v_ashrrev_i32_e32 v58, 31, v38 | |
v_mov_b32_e32 v57, v38 | |
v_or_b32_e32 v38, 7, v41 | |
v_or_b32_e32 v59, 6, v41 | |
v_or_b32_e32 v60, 5, v41 | |
v_or_b32_e32 v61, 4, v41 | |
v_or_b32_e32 v62, 3, v41 | |
v_or_b32_e32 v63, 2, v41 | |
v_or_b32_e32 v64, 1, v41 | |
v_mov_b32_e32 v22, v15 | |
v_mov_b32_e32 v21, v14 | |
v_mov_b32_e32 v20, v13 | |
v_mov_b32_e32 v26, v15 | |
v_mov_b32_e32 v25, v14 | |
v_mov_b32_e32 v24, v13 | |
v_mov_b32_e32 v30, v15 | |
v_mov_b32_e32 v29, v14 | |
v_mov_b32_e32 v28, v13 | |
v_mov_b32_e32 v34, v15 | |
v_mov_b32_e32 v33, v14 | |
v_mov_b32_e32 v32, v13 | |
v_mov_b32_e32 v44, v15 | |
v_mov_b32_e32 v43, v14 | |
v_mov_b32_e32 v42, v13 | |
v_mov_b32_e32 v50, v15 | |
v_mov_b32_e32 v49, v14 | |
v_mov_b32_e32 v48, v13 | |
v_mov_b32_e32 v8, v13 | |
; implicit-def: %VGPR65_VGPR66_VGPR67_VGPR68 | |
s_waitcnt lgkmcnt(0) | |
BB7_11: ; =>This Loop Header: Depth=1 | |
; Child Loop BB7_51 Depth 2 | |
; Child Loop BB7_91 Depth 2 | |
; Child Loop BB7_131 Depth 2 | |
; Child Loop BB7_171 Depth 2 | |
v_lshl_b64 v[65:66], v[57:58], 5 | |
v_add_i32_e32 v69, vcc, s40, v65 | |
v_mov_b32_e32 v65, s41 | |
v_addc_u32_e32 v66, vcc, v66, v65, vcc | |
v_lshl_b64 v[70:71], v[46:47], 3 | |
v_add_i32_e32 v69, vcc, v69, v70 | |
v_addc_u32_e32 v70, vcc, v66, v71, vcc | |
buffer_load_dwordx2 v[69:70], v[69:70], s[44:47], 0 addr64 offset:16 | |
s_waitcnt vmcnt(0) | |
v_cmp_ne_u32_e32 vcc, 0, v69 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[54:55], exec, s[4:5] | |
; mask branch BB7_175 | |
s_cbranch_execz BB7_175 | |
BB7_12: ; in Loop: Header=BB7_11 Depth=1 | |
v_ashrrev_i32_e32 v71, 31, v70 | |
v_lshl_b64 v[65:66], v[70:71], 7 | |
v_add_i32_e32 v70, vcc, s22, v65 | |
v_mov_b32_e32 v65, s23 | |
v_addc_u32_e32 v66, vcc, v66, v65, vcc | |
v_lshl_b64 v[71:72], v[52:53], 2 | |
v_add_i32_e32 v70, vcc, v70, v71 | |
v_addc_u32_e32 v71, vcc, v66, v72, vcc | |
buffer_load_dword v65, v[70:71], s[44:47], 0 addr64 | |
s_and_saveexec_b64 s[4:5], s[48:49] | |
s_xor_b64 s[4:5], exec, s[4:5] | |
s_waitcnt vmcnt(0) | |
; mask branch BB7_14 | |
s_cbranch_execz BB7_14 | |
BB7_13: ; in Loop: Header=BB7_11 Depth=1 | |
v_lshl_b64 v[66:67], v[57:58], 5 | |
v_add_i32_e32 v70, vcc, s40, v66 | |
v_mov_b32_e32 v66, s41 | |
v_addc_u32_e32 v67, vcc, v67, v66, vcc | |
v_lshl_b64 v[71:72], v[2:3], 2 | |
v_add_i32_e32 v70, vcc, v70, v71 | |
v_addc_u32_e32 v71, vcc, v67, v72, vcc | |
buffer_load_dword v66, v[70:71], s[44:47], 0 addr64 | |
s_mov_b32 m0, -1 | |
s_waitcnt vmcnt(0) | |
ds_write_b32 v10, v66 | |
s_waitcnt lgkmcnt(0) | |
BB7_14: ; %.preheader.preheader | |
; in Loop: Header=BB7_11 Depth=1 | |
s_or_b64 exec, exec, s[4:5] | |
v_and_b32_e32 v66, 0xff, v69 | |
v_cmp_ne_u32_e32 vcc, 0, v66 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[56:57], exec, s[4:5] | |
; mask branch BB7_54 | |
s_cbranch_execz BB7_54 | |
BB7_15: ; in Loop: Header=BB7_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v77, v54 | |
s_mov_b64 s[34:35], s[46:47] | |
s_mov_b64 s[38:39], s[46:47] | |
v_mov_b32_e32 v76, 0 | |
v_mov_b32_e32 v81, v76 | |
s_waitcnt lgkmcnt(0) | |
v_lshlrev_b32_e32 v66, 3, v77 | |
v_add_i32_e32 v66, vcc, v66, v1 | |
v_ashrrev_i32_e32 v67, 31, v66 | |
v_lshl_b64 v[70:71], v[66:67], 4 | |
v_lshl_b64 v[78:79], v[66:67], 3 | |
buffer_load_dwordx4 v[72:75], v[70:71], s[32:35], 0 addr64 | |
buffer_load_dwordx2 v[70:71], v[78:79], s[36:39], 0 addr64 | |
v_and_b32_e32 v67, 1, v69 | |
v_cmp_eq_u32_e32 vcc, 1, v67 | |
v_mov_b32_e32 v78, v76 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[30:31], exec, s[4:5] | |
; implicit-def: %VGPR82_VGPR83_VGPR84_VGPR85 | |
s_waitcnt vmcnt(0) | |
; mask branch BB7_19 | |
s_cbranch_execz BB7_19 | |
BB7_16: ; in Loop: Header=BB7_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[86:89], v55 offset1:1 | |
s_and_b64 s[4:5], exec, s[0:1] | |
v_cmp_ne_u32_e32 vcc, v41, v77 | |
s_xor_b64 s[4:5], s[4:5], -1 | |
s_or_b64 s[4:5], s[4:5], vcc | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v80, v73, v87 | |
s_and_b64 s[8:9], exec, s[2:3] | |
s_or_b64 s[4:5], s[8:9], s[4:5] | |
v_subrev_f32_e32 v82, v72, v86 | |
v_mul_f32_e32 v83, v80, v80 | |
v_cndmask_b32_e64 v76, 0, 1.0, s[4:5] | |
v_subrev_f32_e32 v79, v74, v88 | |
v_mac_f32_e32 v83, v82, v82 | |
v_mac_f32_e32 v83, v79, v79 | |
v_mul_f32_e32 v76, s26, v76 | |
v_cmp_lt_f32_e32 vcc, v83, v76 | |
v_mov_b32_e32 v76, 0 | |
v_subrev_f32_e32 v67, v75, v89 | |
v_mov_b32_e32 v78, v76 | |
v_mov_b32_e32 v81, v76 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[34:35], exec, s[4:5] | |
; implicit-def: %VGPR82_VGPR83_VGPR84_VGPR85 | |
; mask branch BB7_18 | |
s_cbranch_execz BB7_18 | |
BB7_17: ; in Loop: Header=BB7_11 Depth=1 | |
s_mov_b32 m0, -1 | |
v_max_f32_e32 v78, 0x34cd15ae, v83 | |
ds_read_b64 v[83:84], v56 | |
v_and_b32_e32 v81, 1, v65 | |
v_cmp_eq_u32_e64 s[4:5], 1, v81 | |
v_cndmask_b32_e64 v85, 0, 1.0, s[4:5] | |
v_mul_f32_e32 v76, v75, v89 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v81, v70, v83 | |
v_mul_f32_e32 v83, v71, v84 | |
v_rsq_f32_e32 v84, v78 | |
v_cmp_gt_f32_e32 vcc, s27, v78 | |
v_mov_b32_e32 v91, 0x3a92b707 | |
v_mov_b32_e32 v92, 0x3c739487 | |
v_mul_f32_e32 v86, v84, v84 | |
v_mul_f32_e32 v87, v86, v86 | |
v_mul_f32_e32 v87, v85, v87 | |
v_mul_f32_e32 v88, v86, v87 | |
v_mad_f32 v87, v87, v86, s42 | |
v_mad_f32 v89, v88, v88, s43 | |
v_mul_f32_e32 v87, 0xbe2aaaab, v87 | |
v_mul_f32_e32 v89, v89, v83 | |
v_mul_f32_e32 v87, v81, v87 | |
v_mac_f32_e32 v87, 0x3daaaaaa, v89 | |
v_cndmask_b32_e64 v89, 0, 1.0, vcc | |
v_mul_f32_e32 v90, v85, v89 | |
v_mac_f32_e32 v8, v87, v90 | |
v_mul_f32_e32 v87, v9, v78 | |
v_mul_f32_e32 v90, v87, v87 | |
v_madak_f32_e32 v91, v91, v90, 0x3ded3cb2 | |
v_madak_f32_e32 v92, v92, v90, 0x3f01e2bc | |
v_mad_f32 v91, v91, v90, 1.0 | |
v_mac_f32_e32 v91, v87, v92 | |
v_mov_b32_e32 v92, 0xb2951928 | |
v_madak_f32_e32 v92, v92, v90, 0xb85ffb93 | |
v_mov_b32_e32 v93, 0x35c55945 | |
v_mul_f32_e32 v78, s18, v78 | |
v_madak_f32_e32 v93, v93, v90, 0x3a83ca0c | |
v_madak_f32_e32 v92, v92, v90, 0xbc9ded90 | |
v_mul_f32_e32 v78, v84, v78 | |
v_madak_f32_e32 v93, v93, v90, 0x3d8eaf3b | |
v_madak_f32_e32 v90, v92, v90, 0xbf409397 | |
v_and_b32_e32 v92, s50, v78 | |
v_mov_b32_e32 v94, 0x3fa00000 | |
v_cmp_gt_f32_e64 s[4:5], v94, v92 | |
v_mul_f32_e32 v94, v92, v92 | |
v_rcp_f32_e32 v95, v94 | |
v_add_f32_e32 v96, -1.0, v92 | |
v_mov_b32_e32 v98, 0xbd777f97 | |
v_mov_b32_e32 v99, 0x4036db6e | |
v_cndmask_b32_e64 v95, v95, v96, s[4:5] | |
v_mov_b32_e32 v96, 0x3f580000 | |
v_cmp_gt_f32_e64 s[8:9], v96, v92 | |
v_cndmask_b32_e64 v94, v95, v94, s[8:9] | |
v_mov_b32_e32 v96, 0xc1b38712 | |
v_madak_f32_e32 v98, v98, v94, 0x40d23f7c | |
v_madak_f32_e32 v96, v96, v94, 0x43ed43a7 | |
v_madak_f32_e32 v98, v94, v98, 0x42d9451f | |
v_madak_f32_e32 v96, v94, v96, 0x451f90ce | |
v_madak_f32_e32 v98, v94, v98, 0x43d6810b | |
v_madak_f32_e32 v96, v94, v96, 0x4547fdbb | |
v_madak_f32_e32 v98, v94, v98, 0x442158c9 | |
v_madak_f32_e32 v96, v94, v96, 0x44c01759 | |
v_madak_f32_e32 v98, v94, v98, 0x43d9486f | |
v_madak_f32_e32 v96, v94, v96, 0x43a2e571 | |
v_madak_f32_e32 v98, v94, v98, 0x4309a863 | |
v_mov_b32_e32 v97, 0xc11d077e | |
v_cmp_gt_f32_e32 vcc, v99, v92 | |
v_madak_f32_e32 v96, v94, v96, 0x41f2b459 | |
v_madak_f32_e32 v98, v94, v98, 0x419d35ce | |
v_mov_b32_e32 v95, 0xc3f1c275 | |
v_madak_f32_e32 v97, v97, v94, 0xc2a2932b | |
v_cndmask_b32_e32 v96, v96, v98, vcc | |
v_mov_b32_e32 v98, 0x3c445aa3 | |
v_madak_f32_e32 v95, v95, v94, 0xc480230b | |
v_madak_f32_e32 v97, v94, v97, 0xc3389ae7 | |
v_madak_f32_e32 v98, v98, v94, 0x3c5f6e13 | |
v_madak_f32_e32 v95, v94, v95, 0xc41f6441 | |
v_madak_f32_e32 v97, v94, v97, 0xc322658c | |
v_madak_f32_e32 v98, v94, v98, 0x3e013307 | |
v_madak_f32_e32 v95, v94, v95, 0xc320a2ea | |
v_madak_f32_e32 v97, v94, v97, 0xc2798057 | |
v_madak_f32_e32 v98, v94, v98, 0x3d931ae7 | |
v_madak_f32_e32 v95, v94, v95, 0xc18e104b | |
v_madak_f32_e32 v97, v94, v97, 0xc128f022 | |
v_madak_f32_e32 v98, v94, v98, 0x3f0a5785 | |
v_madak_f32_e32 v95, v94, v95, 0xbf4c9dd4 | |
v_madak_f32_e32 v97, v94, v97, 0xbf31a0b7 | |
v_madak_f32_e32 v98, v94, v98, 0x3dd9f331 | |
v_cndmask_b32_e64 v96, v96, v98, s[4:5] | |
v_mov_b32_e32 v98, 0xb684e21a | |
v_madak_f32_e32 v95, v94, v95, 0xbc21a092 | |
v_madak_f32_e32 v97, v94, v97, 0xbc21a093 | |
v_madak_f32_e32 v98, v98, v94, 0x390aee49 | |
v_cndmask_b32_e32 v95, v95, v97, vcc | |
v_mov_b32_e32 v97, 0xbb0df9c0 | |
v_madak_f32_e32 v97, v97, v94, 0x3d1151b3 | |
v_madak_f32_e32 v98, v94, v98, 0x3ba68116 | |
v_madak_f32_e32 v97, v94, v97, 0xbde31cc2 | |
v_madak_f32_e32 v98, v94, v98, 0x3d852a63 | |
v_madak_f32_e32 v97, v94, v97, 0x3ea2fe54 | |
v_madak_f32_e32 v98, v94, v98, 0x3ecbbbce | |
v_cndmask_b32_e64 v96, v96, v98, s[8:9] | |
v_madak_f32_e32 v97, v94, v97, 0xbebe9208 | |
v_madak_f32_e32 v97, v94, v97, 0x3ed46805 | |
v_mad_f32 v96, v94, v96, 1.0 | |
v_mov_b32_e32 v98, 0x6f800000 | |
v_madak_f32_e32 v97, v94, v97, 0xbb1acdc6 | |
v_cmp_gt_f32_e64 vcc, |v96|, v98 | |
v_mov_b32_e32 v99, 0x2f800000 | |
v_cndmask_b32_e64 v95, v95, v97, s[4:5] | |
v_mov_b32_e32 v97, 0xb7c756b1 | |
v_cndmask_b32_e32 v100, 1.0, v99, vcc | |
v_madak_f32_e32 v97, v97, v94, 0xbbbd1489 | |
v_mul_f32_e32 v96, v100, v96 | |
v_madak_f32_e32 v97, v94, v97, 0xbce9528f | |
v_rcp_f32_e32 v96, v96 | |
v_madak_f32_e32 v97, v94, v97, 0xbea66beb | |
v_madak_f32_e32 v94, v94, v97, 0x3e0375d4 | |
v_cndmask_b32_e64 v94, v95, v94, s[8:9] | |
v_mul_f32_e32 v94, v96, v94 | |
v_and_b32_e32 v95, s51, v78 | |
v_mov_b32_e32 v96, 0xbf100000 | |
v_mad_f32 v96, v95, -v95, v96 | |
v_cmp_gt_f32_e32 vcc, 0, v96 | |
v_cndmask_b32_e64 v97, 0.5, -0.5, vcc | |
v_mov_b32_e32 v101, 0x3fb8aa3b | |
v_mac_f32_e32 v97, v101, v96 | |
v_cvt_i32_f32_e32 v97, v97 | |
v_subrev_f32_e32 v115, v92, v95 | |
v_mul_f32_e32 v103, v94, v100 | |
v_add_f32_e32 v95, v92, v95 | |
v_mad_f32 v95, v95, v115, v103 | |
v_cmp_gt_f32_e64 s[10:11], 0, v95 | |
v_cndmask_b32_e64 v115, 0.5, -0.5, s[10:11] | |
v_cvt_f32_i32_e32 v102, v97 | |
v_mac_f32_e32 v115, v101, v95 | |
v_cvt_i32_f32_e32 v101, v115 | |
v_mov_b32_e32 v104, 0xbf317180 | |
v_mad_f32 v105, v104, v102, v96 | |
v_mov_b32_e32 v106, 0xb717f7d1 | |
v_mad_f32 v107, v106, v102, v105 | |
v_mul_f32_e32 v108, v107, v107 | |
v_mov_b32_e32 v109, 0xb5ddea0e | |
v_mov_b32_e32 v110, 0x3331bb4c | |
v_cvt_f32_i32_e32 v116, v101 | |
v_mad_f32 v111, v110, v108, v109 | |
v_mov_b32_e32 v112, 0x388ab355 | |
v_mad_f32 v111, v111, v108, v112 | |
v_mov_b32_e32 v113, 0xbb360b61 | |
v_mad_f32 v111, v111, v108, v113 | |
v_mov_b32_e32 v114, 0x3e2aaaab | |
v_mad_f32 v111, v111, v108, v114 | |
v_mac_f32_e32 v90, v87, v93 | |
v_mad_f32 v93, v104, v116, v95 | |
v_mad_f32 v108, -v108, v111, v107 | |
v_mad_f32 v104, v106, v116, v93 | |
v_mul_f32_e32 v87, v108, v107 | |
v_mul_f32_e32 v107, v104, v104 | |
v_sub_f32_e32 v111, 2.0, v108 | |
v_mac_f32_e32 v109, v110, v107 | |
v_cmp_gt_f32_e64 vcc, |v111|, v98 | |
v_mac_f32_e32 v112, v109, v107 | |
v_cndmask_b32_e32 v115, 1.0, v99, vcc | |
v_mac_f32_e32 v113, v112, v107 | |
v_mul_f32_e64 v111, v111, -v115 | |
v_mac_f32_e32 v114, v113, v107 | |
v_rcp_f32_e32 v111, v111 | |
v_mad_f32 v107, -v107, v114, v104 | |
v_sub_f32_e32 v108, 2.0, v107 | |
v_rcp_f32_e32 v91, v91 | |
v_cmp_gt_f32_e64 vcc, |v108|, v98 | |
v_cndmask_b32_e32 v109, 1.0, v99, vcc | |
v_mul_f32_e32 v87, v111, v87 | |
v_mul_f32_e64 v108, v108, -v109 | |
v_mul_f32_e32 v87, v87, v115 | |
v_rcp_f32_e32 v108, v108 | |
v_mad_f32 v87, -v102, v106, v87 | |
v_mul_f32_e32 v91, v37, v91 | |
v_subrev_f32_e32 v87, v105, v87 | |
v_mul_f32_e32 v90, v90, v91 | |
v_mul_f32_e32 v91, v85, v86 | |
v_mul_f32_e32 v102, v107, v104 | |
v_mac_f32_e32 v90, v84, v91 | |
v_lshlrev_b32_e32 v91, 23, v97 | |
v_sub_f32_e32 v87, 1.0, v87 | |
v_mul_f32_e32 v102, v108, v102 | |
v_add_i32_e32 v87, vcc, v87, v91 | |
v_mov_b32_e32 v91, 0xc2aeac4f | |
v_mul_f32_e32 v102, v102, v109 | |
v_cmp_nlt_f32_e32 vcc, v96, v91 | |
v_mov_b32_e32 v97, 0x42b17218 | |
v_mad_f32 v102, -v116, v106, v102 | |
v_cndmask_b32_e32 v87, 0, v87, vcc | |
v_cmp_lt_f32_e32 vcc, v96, v97 | |
v_mov_b32_e32 v104, 0x7f800000 | |
v_subrev_f32_e32 v93, v93, v102 | |
v_cndmask_b32_e32 v87, v104, v87, vcc | |
v_cmp_u_f32_e32 vcc, v96, v96 | |
v_cndmask_b32_e32 v87, v87, v96, vcc | |
v_sub_f32_e32 v93, 1.0, v93 | |
v_lshlrev_b32_e32 v96, 23, v101 | |
v_add_i32_e32 v93, vcc, v93, v96 | |
v_cmp_nlt_f32_e32 vcc, v95, v91 | |
v_cndmask_b32_e32 v91, 0, v93, vcc | |
v_cmp_lt_f32_e32 vcc, v95, v97 | |
v_cndmask_b32_e32 v91, v104, v91, vcc | |
v_cmp_u_f32_e32 vcc, v95, v95 | |
v_cndmask_b32_e32 v91, v91, v95, vcc | |
v_mul_f32_e32 v87, v91, v87 | |
v_mov_b32_e32 v91, 0x40c00000 | |
v_cmp_gt_f32_e64 s[10:11], v91, v92 | |
v_mov_b32_e32 v91, 0x31800000 | |
v_cmp_gt_f32_e64 vcc, |v92|, v98 | |
v_cmp_gt_f32_e64 s[12:13], v91, v92 | |
v_cndmask_b32_e32 v91, 1.0, v99, vcc | |
v_mul_f32_e32 v92, v91, v92 | |
v_rcp_f32_e32 v92, v92 | |
v_cmp_u_f32_e32 vcc, v78, v78 | |
v_mac_f32_e32 v51, v0, v67 | |
v_mul_f32_e32 v87, v92, v87 | |
v_mad_f32 v87, -v91, v87, 1.0 | |
v_madak_f32_e32 v91, v100, v94, 0x3f58560b | |
v_cndmask_b32_e64 v87, 1.0, v87, s[10:11] | |
v_cndmask_b32_e64 v87, v87, v91, s[4:5] | |
v_and_b32_e32 v91, s52, v78 | |
v_or_b32_e32 v87, v91, v87 | |
v_mad_f32 v91, v103, v78, v78 | |
v_cndmask_b32_e64 v87, v87, v91, s[8:9] | |
v_mul_f32_e32 v91, 0x3f8375d4, v78 | |
v_mac_f32_e32 v91, 0x41000000, v78 | |
v_mul_f32_e32 v91, 0x3e000000, v91 | |
v_cndmask_b32_e64 v87, v87, v91, s[12:13] | |
v_cndmask_b32_e32 v78, v87, v78, vcc | |
v_subrev_f32_e32 v78, v78, v85 | |
v_mul_f32_e64 v85, s19, -v85 | |
v_mac_f32_e32 v85, v78, v84 | |
v_mad_f32 v78, v83, v88, -v81 | |
v_mul_f32_e32 v81, v89, v86 | |
v_mul_f32_e32 v81, v88, v81 | |
v_mul_f32_e32 v83, v78, v81 | |
v_mac_f32_e32 v83, v90, v76 | |
v_mac_f32_e32 v5, v85, v76 | |
v_mad_f32 v50, v79, v83, v50 | |
v_mad_f32 v49, v80, v83, v49 | |
v_mad_f32 v48, v82, v83, v48 | |
v_mul_f32_e64 v81, v83, -v79 | |
v_mul_f32_e64 v78, v83, -v80 | |
v_mul_f32_e64 v76, v83, -v82 | |
v_mul_f32_e64 v85, v67, -v0 | |
BB7_18: ; %Flow1253 | |
; in Loop: Header=BB7_11 Depth=1 | |
s_or_b64 exec, exec, s[34:35] | |
BB7_19: ; in Loop: Header=BB7_11 Depth=1 | |
s_or_b64 exec, exec, s[30:31] | |
v_lshrrev_b32_e32 v67, 1, v69 | |
v_and_b32_e32 v67, 1, v67 | |
v_cmp_eq_u32_e32 vcc, 1, v67 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[12:13], exec, s[4:5] | |
; mask branch BB7_23 | |
s_cbranch_execz BB7_23 | |
BB7_20: ; in Loop: Header=BB7_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[86:89], v55 offset0:16 offset1:17 | |
s_and_b64 s[4:5], exec, s[0:1] | |
v_cmp_ne_u32_e32 vcc, v64, v77 | |
s_xor_b64 s[4:5], s[4:5], -1 | |
s_or_b64 s[4:5], s[4:5], vcc | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v80, v73, v87 | |
s_and_b64 s[8:9], exec, s[2:3] | |
s_or_b64 s[4:5], s[8:9], s[4:5] | |
v_subrev_f32_e32 v79, v72, v86 | |
v_mul_f32_e32 v83, v80, v80 | |
v_cndmask_b32_e64 v84, 0, 1.0, s[4:5] | |
v_subrev_f32_e32 v82, v74, v88 | |
v_mac_f32_e32 v83, v79, v79 | |
v_mac_f32_e32 v83, v82, v82 | |
v_mul_f32_e32 v84, s26, v84 | |
v_subrev_f32_e32 v67, v75, v89 | |
v_cmp_lt_f32_e32 vcc, v83, v84 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[30:31], exec, s[4:5] | |
; mask branch BB7_22 | |
s_cbranch_execz BB7_22 | |
BB7_21: ; in Loop: Header=BB7_11 Depth=1 | |
v_max_f32_e32 v86, 0x34cd15ae, v83 | |
v_mul_f32_e32 v87, v75, v89 | |
v_rsq_f32_e32 v89, v86 | |
v_lshrrev_b32_e32 v83, 1, v65 | |
v_and_b32_e32 v83, 1, v83 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e64 s[4:5], 1, v83 | |
ds_read_b64 v[83:84], v56 offset:64 | |
v_mul_f32_e32 v93, v89, v89 | |
v_cndmask_b32_e64 v88, 0, 1.0, s[4:5] | |
v_mul_f32_e32 v94, v93, v93 | |
v_mul_f32_e32 v94, v88, v94 | |
v_mul_f32_e32 v95, v93, v94 | |
v_mad_f32 v94, v94, v93, s42 | |
v_cmp_gt_f32_e32 vcc, s27, v86 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v83, v70, v83 | |
v_mul_f32_e32 v84, v71, v84 | |
v_mad_f32 v97, v95, v95, s43 | |
v_mul_f32_e32 v94, 0xbe2aaaab, v94 | |
v_mad_f32 v96, v84, v95, -v83 | |
v_cndmask_b32_e64 v90, 0, 1.0, vcc | |
v_mul_f32_e32 v84, v97, v84 | |
v_mul_f32_e32 v83, v83, v94 | |
v_mac_f32_e32 v83, 0x3daaaaaa, v84 | |
v_mul_f32_e32 v91, v88, v90 | |
v_mul_f32_e32 v92, v9, v86 | |
v_mul_f32_e32 v84, v90, v93 | |
v_mac_f32_e32 v8, v83, v91 | |
v_mul_f32_e32 v83, v92, v92 | |
v_mov_b32_e32 v90, 0x3a92b707 | |
v_madak_f32_e32 v90, v90, v83, 0x3ded3cb2 | |
v_mov_b32_e32 v91, 0x3c739487 | |
v_madak_f32_e32 v91, v91, v83, 0x3f01e2bc | |
v_mad_f32 v90, v90, v83, 1.0 | |
v_mac_f32_e32 v90, v92, v91 | |
v_mov_b32_e32 v91, 0xb2951928 | |
v_rcp_f32_e32 v90, v90 | |
v_madak_f32_e32 v91, v91, v83, 0xb85ffb93 | |
v_mov_b32_e32 v94, 0x35c55945 | |
v_madak_f32_e32 v94, v94, v83, 0x3a83ca0c | |
v_madak_f32_e32 v91, v91, v83, 0xbc9ded90 | |
v_madak_f32_e32 v94, v94, v83, 0x3d8eaf3b | |
v_madak_f32_e32 v83, v91, v83, 0xbf409397 | |
v_mac_f32_e32 v83, v92, v94 | |
v_mul_f32_e32 v90, v37, v90 | |
v_mul_f32_e32 v84, v95, v84 | |
v_mul_f32_e32 v83, v83, v90 | |
v_mul_f32_e32 v90, v88, v93 | |
v_mac_f32_e32 v83, v89, v90 | |
v_mul_f32_e32 v84, v96, v84 | |
v_mac_f32_e32 v84, v83, v87 | |
v_mul_f32_e32 v83, s18, v86 | |
v_mul_f32_e32 v83, v89, v83 | |
v_and_b32_e32 v86, s50, v83 | |
v_mov_b32_e32 v90, 0x3fa00000 | |
v_cmp_gt_f32_e64 s[4:5], v90, v86 | |
v_mul_f32_e32 v90, v86, v86 | |
v_rcp_f32_e32 v91, v90 | |
v_add_f32_e32 v92, -1.0, v86 | |
v_mov_b32_e32 v94, 0xbd777f97 | |
v_mov_b32_e32 v95, 0x4036db6e | |
v_cndmask_b32_e64 v91, v91, v92, s[4:5] | |
v_mov_b32_e32 v92, 0x3f580000 | |
v_cmp_gt_f32_e64 s[8:9], v92, v86 | |
v_cndmask_b32_e64 v90, v91, v90, s[8:9] | |
v_mov_b32_e32 v92, 0xc1b38712 | |
v_madak_f32_e32 v94, v94, v90, 0x40d23f7c | |
v_madak_f32_e32 v92, v92, v90, 0x43ed43a7 | |
v_madak_f32_e32 v94, v90, v94, 0x42d9451f | |
v_madak_f32_e32 v92, v90, v92, 0x451f90ce | |
v_madak_f32_e32 v94, v90, v94, 0x43d6810b | |
v_madak_f32_e32 v92, v90, v92, 0x4547fdbb | |
v_madak_f32_e32 v94, v90, v94, 0x442158c9 | |
v_madak_f32_e32 v92, v90, v92, 0x44c01759 | |
v_madak_f32_e32 v94, v90, v94, 0x43d9486f | |
v_madak_f32_e32 v92, v90, v92, 0x43a2e571 | |
v_madak_f32_e32 v94, v90, v94, 0x4309a863 | |
v_cmp_gt_f32_e32 vcc, v95, v86 | |
v_madak_f32_e32 v92, v90, v92, 0x41f2b459 | |
v_madak_f32_e32 v94, v90, v94, 0x419d35ce | |
v_cndmask_b32_e32 v92, v92, v94, vcc | |
v_mov_b32_e32 v94, 0x3c445aa3 | |
v_madak_f32_e32 v94, v94, v90, 0x3c5f6e13 | |
v_madak_f32_e32 v94, v90, v94, 0x3e013307 | |
v_madak_f32_e32 v94, v90, v94, 0x3d931ae7 | |
v_madak_f32_e32 v94, v90, v94, 0x3f0a5785 | |
v_mov_b32_e32 v93, 0xc11d077e | |
v_madak_f32_e32 v94, v90, v94, 0x3dd9f331 | |
v_mov_b32_e32 v91, 0xc3f1c275 | |
v_madak_f32_e32 v93, v93, v90, 0xc2a2932b | |
v_cndmask_b32_e64 v92, v92, v94, s[4:5] | |
v_mov_b32_e32 v94, 0xb684e21a | |
v_madak_f32_e32 v91, v91, v90, 0xc480230b | |
v_madak_f32_e32 v93, v90, v93, 0xc3389ae7 | |
v_madak_f32_e32 v94, v94, v90, 0x390aee49 | |
v_madak_f32_e32 v91, v90, v91, 0xc41f6441 | |
v_madak_f32_e32 v93, v90, v93, 0xc322658c | |
v_madak_f32_e32 v94, v90, v94, 0x3ba68116 | |
v_madak_f32_e32 v91, v90, v91, 0xc320a2ea | |
v_madak_f32_e32 v93, v90, v93, 0xc2798057 | |
v_madak_f32_e32 v94, v90, v94, 0x3d852a63 | |
v_madak_f32_e32 v91, v90, v91, 0xc18e104b | |
v_madak_f32_e32 v93, v90, v93, 0xc128f022 | |
v_madak_f32_e32 v94, v90, v94, 0x3ecbbbce | |
v_madak_f32_e32 v91, v90, v91, 0xbf4c9dd4 | |
v_madak_f32_e32 v93, v90, v93, 0xbf31a0b7 | |
v_cndmask_b32_e64 v92, v92, v94, s[8:9] | |
v_and_b32_e32 v94, s51, v83 | |
v_mov_b32_e32 v95, 0xbf100000 | |
v_madak_f32_e32 v91, v90, v91, 0xbc21a092 | |
v_madak_f32_e32 v93, v90, v93, 0xbc21a093 | |
v_mad_f32 v95, v94, -v94, v95 | |
v_cndmask_b32_e32 v91, v91, v93, vcc | |
v_cmp_gt_f32_e32 vcc, 0, v95 | |
v_cndmask_b32_e64 v96, 0.5, -0.5, vcc | |
v_mov_b32_e32 v97, 0x3fb8aa3b | |
v_mac_f32_e32 v96, v97, v95 | |
v_cvt_i32_f32_e32 v96, v96 | |
v_mov_b32_e32 v93, 0xbb0df9c0 | |
v_madak_f32_e32 v93, v93, v90, 0x3d1151b3 | |
v_madak_f32_e32 v93, v90, v93, 0xbde31cc2 | |
v_cvt_f32_i32_e32 v100, v96 | |
v_madak_f32_e32 v93, v90, v93, 0x3ea2fe54 | |
v_mov_b32_e32 v101, 0xbf317180 | |
v_madak_f32_e32 v93, v90, v93, 0xbebe9208 | |
v_mad_f32 v102, v101, v100, v95 | |
v_mov_b32_e32 v103, 0xb717f7d1 | |
v_madak_f32_e32 v93, v90, v93, 0x3ed46805 | |
v_mad_f32 v104, v103, v100, v102 | |
v_madak_f32_e32 v93, v90, v93, 0xbb1acdc6 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mov_b32_e32 v106, 0xb5ddea0e | |
v_mov_b32_e32 v107, 0x3331bb4c | |
v_cndmask_b32_e64 v91, v91, v93, s[4:5] | |
v_mov_b32_e32 v93, 0xb7c756b1 | |
v_mad_f32 v108, v107, v105, v106 | |
v_mov_b32_e32 v109, 0x388ab355 | |
v_madak_f32_e32 v93, v93, v90, 0xbbbd1489 | |
v_mad_f32 v108, v108, v105, v109 | |
v_mov_b32_e32 v110, 0xbb360b61 | |
v_madak_f32_e32 v93, v90, v93, 0xbce9528f | |
v_mad_f32 v108, v108, v105, v110 | |
v_mov_b32_e32 v111, 0x3e2aaaab | |
v_madak_f32_e32 v93, v90, v93, 0xbea66beb | |
v_mad_f32 v108, v108, v105, v111 | |
v_madak_f32_e32 v93, v90, v93, 0x3e0375d4 | |
v_mad_f32 v105, -v105, v108, v104 | |
v_mad_f32 v90, v90, v92, 1.0 | |
v_mov_b32_e32 v92, 0x6f800000 | |
v_cmp_gt_f32_e64 vcc, |v90|, v92 | |
v_mov_b32_e32 v98, 0x2f800000 | |
v_sub_f32_e32 v108, 2.0, v105 | |
v_cndmask_b32_e32 v99, 1.0, v98, vcc | |
v_cmp_gt_f32_e64 vcc, |v108|, v92 | |
v_mul_f32_e32 v90, v99, v90 | |
v_cndmask_b32_e32 v112, 1.0, v98, vcc | |
v_mul_f32_e64 v108, v108, -v112 | |
v_rcp_f32_e32 v90, v90 | |
v_rcp_f32_e32 v108, v108 | |
v_cndmask_b32_e64 v91, v91, v93, s[8:9] | |
v_lshlrev_b32_e32 v93, 23, v96 | |
v_mul_f32_e32 v90, v90, v91 | |
v_mul_f32_e32 v91, v105, v104 | |
v_mul_f32_e32 v91, v108, v91 | |
v_mul_f32_e32 v91, v91, v112 | |
v_mad_f32 v91, -v100, v103, v91 | |
v_subrev_f32_e32 v91, v102, v91 | |
v_sub_f32_e32 v91, 1.0, v91 | |
v_add_i32_e32 v91, vcc, v91, v93 | |
v_mov_b32_e32 v93, 0xc2aeac4f | |
v_cmp_nlt_f32_e32 vcc, v95, v93 | |
v_mov_b32_e32 v96, 0x42b17218 | |
v_cndmask_b32_e32 v91, 0, v91, vcc | |
v_cmp_lt_f32_e32 vcc, v95, v96 | |
v_mov_b32_e32 v100, 0x7f800000 | |
v_cndmask_b32_e32 v91, v100, v91, vcc | |
v_cmp_u_f32_e32 vcc, v95, v95 | |
v_cndmask_b32_e32 v91, v91, v95, vcc | |
v_subrev_f32_e32 v95, v86, v94 | |
v_mul_f32_e32 v102, v90, v99 | |
v_add_f32_e32 v94, v86, v94 | |
v_mad_f32 v94, v94, v95, v102 | |
v_cmp_gt_f32_e32 vcc, 0, v94 | |
v_cndmask_b32_e64 v95, 0.5, -0.5, vcc | |
v_mac_f32_e32 v95, v97, v94 | |
v_cvt_i32_f32_e32 v95, v95 | |
v_madak_f32_e32 v90, v99, v90, 0x3f58560b | |
v_mad_f32 v81, -v84, v82, v81 | |
v_mad_f32 v44, v82, v84, v44 | |
v_cvt_f32_i32_e32 v97, v95 | |
v_lshlrev_b32_e32 v95, 23, v95 | |
v_mad_f32 v78, -v84, v80, v78 | |
v_mad_f32 v43, v80, v84, v43 | |
v_mad_f32 v101, v101, v97, v94 | |
v_mad_f32 v104, v103, v97, v101 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mac_f32_e32 v106, v107, v105 | |
v_mac_f32_e32 v109, v106, v105 | |
v_mac_f32_e32 v110, v109, v105 | |
v_mac_f32_e32 v111, v110, v105 | |
v_mad_f32 v105, -v105, v111, v104 | |
v_mul_f32_e32 v104, v105, v104 | |
v_sub_f32_e32 v105, 2.0, v105 | |
v_cmp_gt_f32_e64 vcc, |v105|, v92 | |
v_cndmask_b32_e32 v106, 1.0, v98, vcc | |
v_mul_f32_e64 v105, v105, -v106 | |
v_rcp_f32_e32 v105, v105 | |
v_mad_f32 v76, -v84, v79, v76 | |
v_mad_f32 v42, v79, v84, v42 | |
v_mad_f32 v85, -v67, v0, v85 | |
v_mul_f32_e32 v104, v105, v104 | |
v_mul_f32_e32 v104, v104, v106 | |
v_mad_f32 v97, -v97, v103, v104 | |
v_subrev_f32_e32 v97, v101, v97 | |
v_sub_f32_e32 v97, 1.0, v97 | |
v_add_i32_e32 v95, vcc, v97, v95 | |
v_cmp_nlt_f32_e32 vcc, v94, v93 | |
v_cndmask_b32_e32 v93, 0, v95, vcc | |
v_cmp_lt_f32_e32 vcc, v94, v96 | |
v_cndmask_b32_e32 v93, v100, v93, vcc | |
v_cmp_gt_f32_e64 vcc, |v86|, v92 | |
v_cndmask_b32_e32 v92, 1.0, v98, vcc | |
v_mul_f32_e32 v95, v92, v86 | |
v_rcp_f32_e32 v95, v95 | |
v_cmp_u_f32_e32 vcc, v94, v94 | |
v_cndmask_b32_e32 v93, v93, v94, vcc | |
v_mul_f32_e32 v91, v93, v91 | |
v_mov_b32_e32 v93, 0x40c00000 | |
v_cmp_gt_f32_e32 vcc, v93, v86 | |
v_mov_b32_e32 v93, 0x31800000 | |
v_mul_f32_e32 v91, v95, v91 | |
v_cmp_gt_f32_e64 s[10:11], v93, v86 | |
v_mad_f32 v86, -v92, v91, 1.0 | |
v_cndmask_b32_e32 v86, 1.0, v86, vcc | |
v_cndmask_b32_e64 v86, v86, v90, s[4:5] | |
v_and_b32_e32 v90, s52, v83 | |
v_or_b32_e32 v86, v90, v86 | |
v_mad_f32 v90, v102, v83, v83 | |
v_cndmask_b32_e64 v86, v86, v90, s[8:9] | |
v_mul_f32_e32 v90, 0x3f8375d4, v83 | |
v_mac_f32_e32 v90, 0x41000000, v83 | |
v_mul_f32_e32 v90, 0x3e000000, v90 | |
v_cndmask_b32_e64 v86, v86, v90, s[10:11] | |
v_cmp_u_f32_e32 vcc, v83, v83 | |
v_cndmask_b32_e32 v83, v86, v83, vcc | |
v_subrev_f32_e32 v83, v83, v88 | |
v_mul_f32_e64 v86, s19, -v88 | |
v_mac_f32_e32 v86, v83, v89 | |
v_mac_f32_e32 v5, v86, v87 | |
v_mac_f32_e32 v45, v0, v67 | |
BB7_22: ; %Flow1252 | |
; in Loop: Header=BB7_11 Depth=1 | |
s_or_b64 exec, exec, s[30:31] | |
BB7_23: ; in Loop: Header=BB7_11 Depth=1 | |
s_or_b64 exec, exec, s[12:13] | |
v_lshrrev_b32_e32 v67, 2, v69 | |
v_and_b32_e32 v67, 1, v67 | |
v_cmp_eq_u32_e32 vcc, 1, v67 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[12:13], exec, s[4:5] | |
; mask branch BB7_27 | |
s_cbranch_execz BB7_27 | |
BB7_24: ; in Loop: Header=BB7_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[86:89], v55 offset0:32 offset1:33 | |
s_and_b64 s[4:5], exec, s[0:1] | |
v_cmp_ne_u32_e32 vcc, v63, v77 | |
s_xor_b64 s[4:5], s[4:5], -1 | |
s_or_b64 s[4:5], s[4:5], vcc | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v80, v73, v87 | |
s_and_b64 s[8:9], exec, s[2:3] | |
s_or_b64 s[4:5], s[8:9], s[4:5] | |
v_subrev_f32_e32 v79, v72, v86 | |
v_mul_f32_e32 v83, v80, v80 | |
v_cndmask_b32_e64 v84, 0, 1.0, s[4:5] | |
v_subrev_f32_e32 v82, v74, v88 | |
v_mac_f32_e32 v83, v79, v79 | |
v_mac_f32_e32 v83, v82, v82 | |
v_mul_f32_e32 v84, s26, v84 | |
v_subrev_f32_e32 v67, v75, v89 | |
v_cmp_lt_f32_e32 vcc, v83, v84 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[30:31], exec, s[4:5] | |
; mask branch BB7_26 | |
s_cbranch_execz BB7_26 | |
BB7_25: ; in Loop: Header=BB7_11 Depth=1 | |
v_max_f32_e32 v86, 0x34cd15ae, v83 | |
v_mul_f32_e32 v87, v75, v89 | |
v_rsq_f32_e32 v89, v86 | |
v_lshrrev_b32_e32 v83, 2, v65 | |
v_and_b32_e32 v83, 1, v83 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e64 s[4:5], 1, v83 | |
ds_read_b64 v[83:84], v56 offset:128 | |
v_mul_f32_e32 v93, v89, v89 | |
v_cndmask_b32_e64 v88, 0, 1.0, s[4:5] | |
v_mul_f32_e32 v94, v93, v93 | |
v_mul_f32_e32 v94, v88, v94 | |
v_mul_f32_e32 v95, v93, v94 | |
v_mad_f32 v94, v94, v93, s42 | |
v_cmp_gt_f32_e32 vcc, s27, v86 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v83, v70, v83 | |
v_mul_f32_e32 v84, v71, v84 | |
v_mad_f32 v97, v95, v95, s43 | |
v_mul_f32_e32 v94, 0xbe2aaaab, v94 | |
v_mad_f32 v96, v84, v95, -v83 | |
v_cndmask_b32_e64 v90, 0, 1.0, vcc | |
v_mul_f32_e32 v84, v97, v84 | |
v_mul_f32_e32 v83, v83, v94 | |
v_mac_f32_e32 v83, 0x3daaaaaa, v84 | |
v_mul_f32_e32 v91, v88, v90 | |
v_mul_f32_e32 v92, v9, v86 | |
v_mul_f32_e32 v84, v90, v93 | |
v_mac_f32_e32 v8, v83, v91 | |
v_mul_f32_e32 v83, v92, v92 | |
v_mov_b32_e32 v90, 0x3a92b707 | |
v_madak_f32_e32 v90, v90, v83, 0x3ded3cb2 | |
v_mov_b32_e32 v91, 0x3c739487 | |
v_madak_f32_e32 v91, v91, v83, 0x3f01e2bc | |
v_mad_f32 v90, v90, v83, 1.0 | |
v_mac_f32_e32 v90, v92, v91 | |
v_mov_b32_e32 v91, 0xb2951928 | |
v_rcp_f32_e32 v90, v90 | |
v_madak_f32_e32 v91, v91, v83, 0xb85ffb93 | |
v_mov_b32_e32 v94, 0x35c55945 | |
v_madak_f32_e32 v94, v94, v83, 0x3a83ca0c | |
v_madak_f32_e32 v91, v91, v83, 0xbc9ded90 | |
v_madak_f32_e32 v94, v94, v83, 0x3d8eaf3b | |
v_madak_f32_e32 v83, v91, v83, 0xbf409397 | |
v_mac_f32_e32 v83, v92, v94 | |
v_mul_f32_e32 v90, v37, v90 | |
v_mul_f32_e32 v84, v95, v84 | |
v_mul_f32_e32 v83, v83, v90 | |
v_mul_f32_e32 v90, v88, v93 | |
v_mac_f32_e32 v83, v89, v90 | |
v_mul_f32_e32 v84, v96, v84 | |
v_mac_f32_e32 v84, v83, v87 | |
v_mul_f32_e32 v83, s18, v86 | |
v_mul_f32_e32 v83, v89, v83 | |
v_and_b32_e32 v86, s50, v83 | |
v_mov_b32_e32 v90, 0x3fa00000 | |
v_cmp_gt_f32_e64 s[4:5], v90, v86 | |
v_mul_f32_e32 v90, v86, v86 | |
v_rcp_f32_e32 v91, v90 | |
v_add_f32_e32 v92, -1.0, v86 | |
v_mov_b32_e32 v94, 0xbd777f97 | |
v_mov_b32_e32 v95, 0x4036db6e | |
v_cndmask_b32_e64 v91, v91, v92, s[4:5] | |
v_mov_b32_e32 v92, 0x3f580000 | |
v_cmp_gt_f32_e64 s[8:9], v92, v86 | |
v_cndmask_b32_e64 v90, v91, v90, s[8:9] | |
v_mov_b32_e32 v92, 0xc1b38712 | |
v_madak_f32_e32 v94, v94, v90, 0x40d23f7c | |
v_madak_f32_e32 v92, v92, v90, 0x43ed43a7 | |
v_madak_f32_e32 v94, v90, v94, 0x42d9451f | |
v_madak_f32_e32 v92, v90, v92, 0x451f90ce | |
v_madak_f32_e32 v94, v90, v94, 0x43d6810b | |
v_madak_f32_e32 v92, v90, v92, 0x4547fdbb | |
v_madak_f32_e32 v94, v90, v94, 0x442158c9 | |
v_madak_f32_e32 v92, v90, v92, 0x44c01759 | |
v_madak_f32_e32 v94, v90, v94, 0x43d9486f | |
v_madak_f32_e32 v92, v90, v92, 0x43a2e571 | |
v_madak_f32_e32 v94, v90, v94, 0x4309a863 | |
v_cmp_gt_f32_e32 vcc, v95, v86 | |
v_madak_f32_e32 v92, v90, v92, 0x41f2b459 | |
v_madak_f32_e32 v94, v90, v94, 0x419d35ce | |
v_cndmask_b32_e32 v92, v92, v94, vcc | |
v_mov_b32_e32 v94, 0x3c445aa3 | |
v_madak_f32_e32 v94, v94, v90, 0x3c5f6e13 | |
v_madak_f32_e32 v94, v90, v94, 0x3e013307 | |
v_madak_f32_e32 v94, v90, v94, 0x3d931ae7 | |
v_madak_f32_e32 v94, v90, v94, 0x3f0a5785 | |
v_mov_b32_e32 v93, 0xc11d077e | |
v_madak_f32_e32 v94, v90, v94, 0x3dd9f331 | |
v_mov_b32_e32 v91, 0xc3f1c275 | |
v_madak_f32_e32 v93, v93, v90, 0xc2a2932b | |
v_cndmask_b32_e64 v92, v92, v94, s[4:5] | |
v_mov_b32_e32 v94, 0xb684e21a | |
v_madak_f32_e32 v91, v91, v90, 0xc480230b | |
v_madak_f32_e32 v93, v90, v93, 0xc3389ae7 | |
v_madak_f32_e32 v94, v94, v90, 0x390aee49 | |
v_madak_f32_e32 v91, v90, v91, 0xc41f6441 | |
v_madak_f32_e32 v93, v90, v93, 0xc322658c | |
v_madak_f32_e32 v94, v90, v94, 0x3ba68116 | |
v_madak_f32_e32 v91, v90, v91, 0xc320a2ea | |
v_madak_f32_e32 v93, v90, v93, 0xc2798057 | |
v_madak_f32_e32 v94, v90, v94, 0x3d852a63 | |
v_madak_f32_e32 v91, v90, v91, 0xc18e104b | |
v_madak_f32_e32 v93, v90, v93, 0xc128f022 | |
v_madak_f32_e32 v94, v90, v94, 0x3ecbbbce | |
v_madak_f32_e32 v91, v90, v91, 0xbf4c9dd4 | |
v_madak_f32_e32 v93, v90, v93, 0xbf31a0b7 | |
v_cndmask_b32_e64 v92, v92, v94, s[8:9] | |
v_and_b32_e32 v94, s51, v83 | |
v_mov_b32_e32 v95, 0xbf100000 | |
v_madak_f32_e32 v91, v90, v91, 0xbc21a092 | |
v_madak_f32_e32 v93, v90, v93, 0xbc21a093 | |
v_mad_f32 v95, v94, -v94, v95 | |
v_cndmask_b32_e32 v91, v91, v93, vcc | |
v_cmp_gt_f32_e32 vcc, 0, v95 | |
v_cndmask_b32_e64 v96, 0.5, -0.5, vcc | |
v_mov_b32_e32 v97, 0x3fb8aa3b | |
v_mac_f32_e32 v96, v97, v95 | |
v_cvt_i32_f32_e32 v96, v96 | |
v_mov_b32_e32 v93, 0xbb0df9c0 | |
v_madak_f32_e32 v93, v93, v90, 0x3d1151b3 | |
v_madak_f32_e32 v93, v90, v93, 0xbde31cc2 | |
v_cvt_f32_i32_e32 v100, v96 | |
v_madak_f32_e32 v93, v90, v93, 0x3ea2fe54 | |
v_mov_b32_e32 v101, 0xbf317180 | |
v_madak_f32_e32 v93, v90, v93, 0xbebe9208 | |
v_mad_f32 v102, v101, v100, v95 | |
v_mov_b32_e32 v103, 0xb717f7d1 | |
v_madak_f32_e32 v93, v90, v93, 0x3ed46805 | |
v_mad_f32 v104, v103, v100, v102 | |
v_madak_f32_e32 v93, v90, v93, 0xbb1acdc6 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mov_b32_e32 v106, 0xb5ddea0e | |
v_mov_b32_e32 v107, 0x3331bb4c | |
v_cndmask_b32_e64 v91, v91, v93, s[4:5] | |
v_mov_b32_e32 v93, 0xb7c756b1 | |
v_mad_f32 v108, v107, v105, v106 | |
v_mov_b32_e32 v109, 0x388ab355 | |
v_madak_f32_e32 v93, v93, v90, 0xbbbd1489 | |
v_mad_f32 v108, v108, v105, v109 | |
v_mov_b32_e32 v110, 0xbb360b61 | |
v_madak_f32_e32 v93, v90, v93, 0xbce9528f | |
v_mad_f32 v108, v108, v105, v110 | |
v_mov_b32_e32 v111, 0x3e2aaaab | |
v_madak_f32_e32 v93, v90, v93, 0xbea66beb | |
v_mad_f32 v108, v108, v105, v111 | |
v_madak_f32_e32 v93, v90, v93, 0x3e0375d4 | |
v_mad_f32 v105, -v105, v108, v104 | |
v_mad_f32 v90, v90, v92, 1.0 | |
v_mov_b32_e32 v92, 0x6f800000 | |
v_cmp_gt_f32_e64 vcc, |v90|, v92 | |
v_mov_b32_e32 v98, 0x2f800000 | |
v_sub_f32_e32 v108, 2.0, v105 | |
v_cndmask_b32_e32 v99, 1.0, v98, vcc | |
v_cmp_gt_f32_e64 vcc, |v108|, v92 | |
v_mul_f32_e32 v90, v99, v90 | |
v_cndmask_b32_e32 v112, 1.0, v98, vcc | |
v_mul_f32_e64 v108, v108, -v112 | |
v_rcp_f32_e32 v90, v90 | |
v_rcp_f32_e32 v108, v108 | |
v_cndmask_b32_e64 v91, v91, v93, s[8:9] | |
v_lshlrev_b32_e32 v93, 23, v96 | |
v_mul_f32_e32 v90, v90, v91 | |
v_mul_f32_e32 v91, v105, v104 | |
v_mul_f32_e32 v91, v108, v91 | |
v_mul_f32_e32 v91, v91, v112 | |
v_mad_f32 v91, -v100, v103, v91 | |
v_subrev_f32_e32 v91, v102, v91 | |
v_sub_f32_e32 v91, 1.0, v91 | |
v_add_i32_e32 v91, vcc, v91, v93 | |
v_mov_b32_e32 v93, 0xc2aeac4f | |
v_cmp_nlt_f32_e32 vcc, v95, v93 | |
v_mov_b32_e32 v96, 0x42b17218 | |
v_cndmask_b32_e32 v91, 0, v91, vcc | |
v_cmp_lt_f32_e32 vcc, v95, v96 | |
v_mov_b32_e32 v100, 0x7f800000 | |
v_cndmask_b32_e32 v91, v100, v91, vcc | |
v_cmp_u_f32_e32 vcc, v95, v95 | |
v_cndmask_b32_e32 v91, v91, v95, vcc | |
v_subrev_f32_e32 v95, v86, v94 | |
v_mul_f32_e32 v102, v90, v99 | |
v_add_f32_e32 v94, v86, v94 | |
v_mad_f32 v94, v94, v95, v102 | |
v_cmp_gt_f32_e32 vcc, 0, v94 | |
v_cndmask_b32_e64 v95, 0.5, -0.5, vcc | |
v_mac_f32_e32 v95, v97, v94 | |
v_cvt_i32_f32_e32 v95, v95 | |
v_madak_f32_e32 v90, v99, v90, 0x3f58560b | |
v_mad_f32 v81, -v84, v82, v81 | |
v_mad_f32 v34, v82, v84, v34 | |
v_cvt_f32_i32_e32 v97, v95 | |
v_lshlrev_b32_e32 v95, 23, v95 | |
v_mad_f32 v78, -v84, v80, v78 | |
v_mad_f32 v33, v80, v84, v33 | |
v_mad_f32 v101, v101, v97, v94 | |
v_mad_f32 v104, v103, v97, v101 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mac_f32_e32 v106, v107, v105 | |
v_mac_f32_e32 v109, v106, v105 | |
v_mac_f32_e32 v110, v109, v105 | |
v_mac_f32_e32 v111, v110, v105 | |
v_mad_f32 v105, -v105, v111, v104 | |
v_mul_f32_e32 v104, v105, v104 | |
v_sub_f32_e32 v105, 2.0, v105 | |
v_cmp_gt_f32_e64 vcc, |v105|, v92 | |
v_cndmask_b32_e32 v106, 1.0, v98, vcc | |
v_mul_f32_e64 v105, v105, -v106 | |
v_rcp_f32_e32 v105, v105 | |
v_mad_f32 v76, -v84, v79, v76 | |
v_mad_f32 v32, v79, v84, v32 | |
v_mad_f32 v85, -v67, v0, v85 | |
v_mul_f32_e32 v104, v105, v104 | |
v_mul_f32_e32 v104, v104, v106 | |
v_mad_f32 v97, -v97, v103, v104 | |
v_subrev_f32_e32 v97, v101, v97 | |
v_sub_f32_e32 v97, 1.0, v97 | |
v_add_i32_e32 v95, vcc, v97, v95 | |
v_cmp_nlt_f32_e32 vcc, v94, v93 | |
v_cndmask_b32_e32 v93, 0, v95, vcc | |
v_cmp_lt_f32_e32 vcc, v94, v96 | |
v_cndmask_b32_e32 v93, v100, v93, vcc | |
v_cmp_gt_f32_e64 vcc, |v86|, v92 | |
v_cndmask_b32_e32 v92, 1.0, v98, vcc | |
v_mul_f32_e32 v95, v92, v86 | |
v_rcp_f32_e32 v95, v95 | |
v_cmp_u_f32_e32 vcc, v94, v94 | |
v_cndmask_b32_e32 v93, v93, v94, vcc | |
v_mul_f32_e32 v91, v93, v91 | |
v_mov_b32_e32 v93, 0x40c00000 | |
v_cmp_gt_f32_e32 vcc, v93, v86 | |
v_mov_b32_e32 v93, 0x31800000 | |
v_mul_f32_e32 v91, v95, v91 | |
v_cmp_gt_f32_e64 s[10:11], v93, v86 | |
v_mad_f32 v86, -v92, v91, 1.0 | |
v_cndmask_b32_e32 v86, 1.0, v86, vcc | |
v_cndmask_b32_e64 v86, v86, v90, s[4:5] | |
v_and_b32_e32 v90, s52, v83 | |
v_or_b32_e32 v86, v90, v86 | |
v_mad_f32 v90, v102, v83, v83 | |
v_cndmask_b32_e64 v86, v86, v90, s[8:9] | |
v_mul_f32_e32 v90, 0x3f8375d4, v83 | |
v_mac_f32_e32 v90, 0x41000000, v83 | |
v_mul_f32_e32 v90, 0x3e000000, v90 | |
v_cndmask_b32_e64 v86, v86, v90, s[10:11] | |
v_cmp_u_f32_e32 vcc, v83, v83 | |
v_cndmask_b32_e32 v83, v86, v83, vcc | |
v_subrev_f32_e32 v83, v83, v88 | |
v_mul_f32_e64 v86, s19, -v88 | |
v_mac_f32_e32 v86, v83, v89 | |
v_mac_f32_e32 v5, v86, v87 | |
v_mac_f32_e32 v35, v0, v67 | |
BB7_26: ; %Flow1251 | |
; in Loop: Header=BB7_11 Depth=1 | |
s_or_b64 exec, exec, s[30:31] | |
BB7_27: ; in Loop: Header=BB7_11 Depth=1 | |
s_or_b64 exec, exec, s[12:13] | |
v_lshrrev_b32_e32 v67, 3, v69 | |
v_and_b32_e32 v67, 1, v67 | |
v_cmp_eq_u32_e32 vcc, 1, v67 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[12:13], exec, s[4:5] | |
; mask branch BB7_31 | |
s_cbranch_execz BB7_31 | |
BB7_28: ; in Loop: Header=BB7_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[86:89], v55 offset0:48 offset1:49 | |
s_and_b64 s[4:5], exec, s[0:1] | |
v_cmp_ne_u32_e32 vcc, v62, v77 | |
s_xor_b64 s[4:5], s[4:5], -1 | |
s_or_b64 s[4:5], s[4:5], vcc | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v80, v73, v87 | |
s_and_b64 s[8:9], exec, s[2:3] | |
s_or_b64 s[4:5], s[8:9], s[4:5] | |
v_subrev_f32_e32 v79, v72, v86 | |
v_mul_f32_e32 v83, v80, v80 | |
v_cndmask_b32_e64 v84, 0, 1.0, s[4:5] | |
v_subrev_f32_e32 v82, v74, v88 | |
v_mac_f32_e32 v83, v79, v79 | |
v_mac_f32_e32 v83, v82, v82 | |
v_mul_f32_e32 v84, s26, v84 | |
v_subrev_f32_e32 v67, v75, v89 | |
v_cmp_lt_f32_e32 vcc, v83, v84 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[30:31], exec, s[4:5] | |
; mask branch BB7_30 | |
s_cbranch_execz BB7_30 | |
BB7_29: ; in Loop: Header=BB7_11 Depth=1 | |
v_max_f32_e32 v86, 0x34cd15ae, v83 | |
v_mul_f32_e32 v87, v75, v89 | |
v_rsq_f32_e32 v89, v86 | |
v_lshrrev_b32_e32 v83, 3, v65 | |
v_and_b32_e32 v83, 1, v83 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e64 s[4:5], 1, v83 | |
ds_read_b64 v[83:84], v56 offset:192 | |
v_mul_f32_e32 v93, v89, v89 | |
v_cndmask_b32_e64 v88, 0, 1.0, s[4:5] | |
v_mul_f32_e32 v94, v93, v93 | |
v_mul_f32_e32 v94, v88, v94 | |
v_mul_f32_e32 v95, v93, v94 | |
v_mad_f32 v94, v94, v93, s42 | |
v_cmp_gt_f32_e32 vcc, s27, v86 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v83, v70, v83 | |
v_mul_f32_e32 v84, v71, v84 | |
v_mad_f32 v97, v95, v95, s43 | |
v_mul_f32_e32 v94, 0xbe2aaaab, v94 | |
v_mad_f32 v96, v84, v95, -v83 | |
v_cndmask_b32_e64 v90, 0, 1.0, vcc | |
v_mul_f32_e32 v84, v97, v84 | |
v_mul_f32_e32 v83, v83, v94 | |
v_mac_f32_e32 v83, 0x3daaaaaa, v84 | |
v_mul_f32_e32 v91, v88, v90 | |
v_mul_f32_e32 v92, v9, v86 | |
v_mul_f32_e32 v84, v90, v93 | |
v_mac_f32_e32 v8, v83, v91 | |
v_mul_f32_e32 v83, v92, v92 | |
v_mov_b32_e32 v90, 0x3a92b707 | |
v_madak_f32_e32 v90, v90, v83, 0x3ded3cb2 | |
v_mov_b32_e32 v91, 0x3c739487 | |
v_madak_f32_e32 v91, v91, v83, 0x3f01e2bc | |
v_mad_f32 v90, v90, v83, 1.0 | |
v_mac_f32_e32 v90, v92, v91 | |
v_mov_b32_e32 v91, 0xb2951928 | |
v_rcp_f32_e32 v90, v90 | |
v_madak_f32_e32 v91, v91, v83, 0xb85ffb93 | |
v_mov_b32_e32 v94, 0x35c55945 | |
v_madak_f32_e32 v94, v94, v83, 0x3a83ca0c | |
v_madak_f32_e32 v91, v91, v83, 0xbc9ded90 | |
v_madak_f32_e32 v94, v94, v83, 0x3d8eaf3b | |
v_madak_f32_e32 v83, v91, v83, 0xbf409397 | |
v_mac_f32_e32 v83, v92, v94 | |
v_mul_f32_e32 v90, v37, v90 | |
v_mul_f32_e32 v84, v95, v84 | |
v_mul_f32_e32 v83, v83, v90 | |
v_mul_f32_e32 v90, v88, v93 | |
v_mac_f32_e32 v83, v89, v90 | |
v_mul_f32_e32 v84, v96, v84 | |
v_mac_f32_e32 v84, v83, v87 | |
v_mul_f32_e32 v83, s18, v86 | |
v_mul_f32_e32 v83, v89, v83 | |
v_and_b32_e32 v86, s50, v83 | |
v_mov_b32_e32 v90, 0x3fa00000 | |
v_cmp_gt_f32_e64 s[4:5], v90, v86 | |
v_mul_f32_e32 v90, v86, v86 | |
v_rcp_f32_e32 v91, v90 | |
v_add_f32_e32 v92, -1.0, v86 | |
v_mov_b32_e32 v94, 0xbd777f97 | |
v_mov_b32_e32 v95, 0x4036db6e | |
v_cndmask_b32_e64 v91, v91, v92, s[4:5] | |
v_mov_b32_e32 v92, 0x3f580000 | |
v_cmp_gt_f32_e64 s[8:9], v92, v86 | |
v_cndmask_b32_e64 v90, v91, v90, s[8:9] | |
v_mov_b32_e32 v92, 0xc1b38712 | |
v_madak_f32_e32 v94, v94, v90, 0x40d23f7c | |
v_madak_f32_e32 v92, v92, v90, 0x43ed43a7 | |
v_madak_f32_e32 v94, v90, v94, 0x42d9451f | |
v_madak_f32_e32 v92, v90, v92, 0x451f90ce | |
v_madak_f32_e32 v94, v90, v94, 0x43d6810b | |
v_madak_f32_e32 v92, v90, v92, 0x4547fdbb | |
v_madak_f32_e32 v94, v90, v94, 0x442158c9 | |
v_madak_f32_e32 v92, v90, v92, 0x44c01759 | |
v_madak_f32_e32 v94, v90, v94, 0x43d9486f | |
v_madak_f32_e32 v92, v90, v92, 0x43a2e571 | |
v_madak_f32_e32 v94, v90, v94, 0x4309a863 | |
v_cmp_gt_f32_e32 vcc, v95, v86 | |
v_madak_f32_e32 v92, v90, v92, 0x41f2b459 | |
v_madak_f32_e32 v94, v90, v94, 0x419d35ce | |
v_cndmask_b32_e32 v92, v92, v94, vcc | |
v_mov_b32_e32 v94, 0x3c445aa3 | |
v_madak_f32_e32 v94, v94, v90, 0x3c5f6e13 | |
v_madak_f32_e32 v94, v90, v94, 0x3e013307 | |
v_madak_f32_e32 v94, v90, v94, 0x3d931ae7 | |
v_madak_f32_e32 v94, v90, v94, 0x3f0a5785 | |
v_mov_b32_e32 v93, 0xc11d077e | |
v_madak_f32_e32 v94, v90, v94, 0x3dd9f331 | |
v_mov_b32_e32 v91, 0xc3f1c275 | |
v_madak_f32_e32 v93, v93, v90, 0xc2a2932b | |
v_cndmask_b32_e64 v92, v92, v94, s[4:5] | |
v_mov_b32_e32 v94, 0xb684e21a | |
v_madak_f32_e32 v91, v91, v90, 0xc480230b | |
v_madak_f32_e32 v93, v90, v93, 0xc3389ae7 | |
v_madak_f32_e32 v94, v94, v90, 0x390aee49 | |
v_madak_f32_e32 v91, v90, v91, 0xc41f6441 | |
v_madak_f32_e32 v93, v90, v93, 0xc322658c | |
v_madak_f32_e32 v94, v90, v94, 0x3ba68116 | |
v_madak_f32_e32 v91, v90, v91, 0xc320a2ea | |
v_madak_f32_e32 v93, v90, v93, 0xc2798057 | |
v_madak_f32_e32 v94, v90, v94, 0x3d852a63 | |
v_madak_f32_e32 v91, v90, v91, 0xc18e104b | |
v_madak_f32_e32 v93, v90, v93, 0xc128f022 | |
v_madak_f32_e32 v94, v90, v94, 0x3ecbbbce | |
v_madak_f32_e32 v91, v90, v91, 0xbf4c9dd4 | |
v_madak_f32_e32 v93, v90, v93, 0xbf31a0b7 | |
v_cndmask_b32_e64 v92, v92, v94, s[8:9] | |
v_and_b32_e32 v94, s51, v83 | |
v_mov_b32_e32 v95, 0xbf100000 | |
v_madak_f32_e32 v91, v90, v91, 0xbc21a092 | |
v_madak_f32_e32 v93, v90, v93, 0xbc21a093 | |
v_mad_f32 v95, v94, -v94, v95 | |
v_cndmask_b32_e32 v91, v91, v93, vcc | |
v_cmp_gt_f32_e32 vcc, 0, v95 | |
v_cndmask_b32_e64 v96, 0.5, -0.5, vcc | |
v_mov_b32_e32 v97, 0x3fb8aa3b | |
v_mac_f32_e32 v96, v97, v95 | |
v_cvt_i32_f32_e32 v96, v96 | |
v_mov_b32_e32 v93, 0xbb0df9c0 | |
v_madak_f32_e32 v93, v93, v90, 0x3d1151b3 | |
v_madak_f32_e32 v93, v90, v93, 0xbde31cc2 | |
v_cvt_f32_i32_e32 v100, v96 | |
v_madak_f32_e32 v93, v90, v93, 0x3ea2fe54 | |
v_mov_b32_e32 v101, 0xbf317180 | |
v_madak_f32_e32 v93, v90, v93, 0xbebe9208 | |
v_mad_f32 v102, v101, v100, v95 | |
v_mov_b32_e32 v103, 0xb717f7d1 | |
v_madak_f32_e32 v93, v90, v93, 0x3ed46805 | |
v_mad_f32 v104, v103, v100, v102 | |
v_madak_f32_e32 v93, v90, v93, 0xbb1acdc6 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mov_b32_e32 v106, 0xb5ddea0e | |
v_mov_b32_e32 v107, 0x3331bb4c | |
v_cndmask_b32_e64 v91, v91, v93, s[4:5] | |
v_mov_b32_e32 v93, 0xb7c756b1 | |
v_mad_f32 v108, v107, v105, v106 | |
v_mov_b32_e32 v109, 0x388ab355 | |
v_madak_f32_e32 v93, v93, v90, 0xbbbd1489 | |
v_mad_f32 v108, v108, v105, v109 | |
v_mov_b32_e32 v110, 0xbb360b61 | |
v_madak_f32_e32 v93, v90, v93, 0xbce9528f | |
v_mad_f32 v108, v108, v105, v110 | |
v_mov_b32_e32 v111, 0x3e2aaaab | |
v_madak_f32_e32 v93, v90, v93, 0xbea66beb | |
v_mad_f32 v108, v108, v105, v111 | |
v_madak_f32_e32 v93, v90, v93, 0x3e0375d4 | |
v_mad_f32 v105, -v105, v108, v104 | |
v_mad_f32 v90, v90, v92, 1.0 | |
v_mov_b32_e32 v92, 0x6f800000 | |
v_cmp_gt_f32_e64 vcc, |v90|, v92 | |
v_mov_b32_e32 v98, 0x2f800000 | |
v_sub_f32_e32 v108, 2.0, v105 | |
v_cndmask_b32_e32 v99, 1.0, v98, vcc | |
v_cmp_gt_f32_e64 vcc, |v108|, v92 | |
v_mul_f32_e32 v90, v99, v90 | |
v_cndmask_b32_e32 v112, 1.0, v98, vcc | |
v_mul_f32_e64 v108, v108, -v112 | |
v_rcp_f32_e32 v90, v90 | |
v_rcp_f32_e32 v108, v108 | |
v_cndmask_b32_e64 v91, v91, v93, s[8:9] | |
v_lshlrev_b32_e32 v93, 23, v96 | |
v_mul_f32_e32 v90, v90, v91 | |
v_mul_f32_e32 v91, v105, v104 | |
v_mul_f32_e32 v91, v108, v91 | |
v_mul_f32_e32 v91, v91, v112 | |
v_mad_f32 v91, -v100, v103, v91 | |
v_subrev_f32_e32 v91, v102, v91 | |
v_sub_f32_e32 v91, 1.0, v91 | |
v_add_i32_e32 v91, vcc, v91, v93 | |
v_mov_b32_e32 v93, 0xc2aeac4f | |
v_cmp_nlt_f32_e32 vcc, v95, v93 | |
v_mov_b32_e32 v96, 0x42b17218 | |
v_cndmask_b32_e32 v91, 0, v91, vcc | |
v_cmp_lt_f32_e32 vcc, v95, v96 | |
v_mov_b32_e32 v100, 0x7f800000 | |
v_cndmask_b32_e32 v91, v100, v91, vcc | |
v_cmp_u_f32_e32 vcc, v95, v95 | |
v_cndmask_b32_e32 v91, v91, v95, vcc | |
v_subrev_f32_e32 v95, v86, v94 | |
v_mul_f32_e32 v102, v90, v99 | |
v_add_f32_e32 v94, v86, v94 | |
v_mad_f32 v94, v94, v95, v102 | |
v_cmp_gt_f32_e32 vcc, 0, v94 | |
v_cndmask_b32_e64 v95, 0.5, -0.5, vcc | |
v_mac_f32_e32 v95, v97, v94 | |
v_cvt_i32_f32_e32 v95, v95 | |
v_madak_f32_e32 v90, v99, v90, 0x3f58560b | |
v_mad_f32 v81, -v84, v82, v81 | |
v_mad_f32 v30, v82, v84, v30 | |
v_cvt_f32_i32_e32 v97, v95 | |
v_lshlrev_b32_e32 v95, 23, v95 | |
v_mad_f32 v78, -v84, v80, v78 | |
v_mad_f32 v29, v80, v84, v29 | |
v_mad_f32 v101, v101, v97, v94 | |
v_mad_f32 v104, v103, v97, v101 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mac_f32_e32 v106, v107, v105 | |
v_mac_f32_e32 v109, v106, v105 | |
v_mac_f32_e32 v110, v109, v105 | |
v_mac_f32_e32 v111, v110, v105 | |
v_mad_f32 v105, -v105, v111, v104 | |
v_mul_f32_e32 v104, v105, v104 | |
v_sub_f32_e32 v105, 2.0, v105 | |
v_cmp_gt_f32_e64 vcc, |v105|, v92 | |
v_cndmask_b32_e32 v106, 1.0, v98, vcc | |
v_mul_f32_e64 v105, v105, -v106 | |
v_rcp_f32_e32 v105, v105 | |
v_mad_f32 v76, -v84, v79, v76 | |
v_mad_f32 v28, v79, v84, v28 | |
v_mad_f32 v85, -v67, v0, v85 | |
v_mul_f32_e32 v104, v105, v104 | |
v_mul_f32_e32 v104, v104, v106 | |
v_mad_f32 v97, -v97, v103, v104 | |
v_subrev_f32_e32 v97, v101, v97 | |
v_sub_f32_e32 v97, 1.0, v97 | |
v_add_i32_e32 v95, vcc, v97, v95 | |
v_cmp_nlt_f32_e32 vcc, v94, v93 | |
v_cndmask_b32_e32 v93, 0, v95, vcc | |
v_cmp_lt_f32_e32 vcc, v94, v96 | |
v_cndmask_b32_e32 v93, v100, v93, vcc | |
v_cmp_gt_f32_e64 vcc, |v86|, v92 | |
v_cndmask_b32_e32 v92, 1.0, v98, vcc | |
v_mul_f32_e32 v95, v92, v86 | |
v_rcp_f32_e32 v95, v95 | |
v_cmp_u_f32_e32 vcc, v94, v94 | |
v_cndmask_b32_e32 v93, v93, v94, vcc | |
v_mul_f32_e32 v91, v93, v91 | |
v_mov_b32_e32 v93, 0x40c00000 | |
v_cmp_gt_f32_e32 vcc, v93, v86 | |
v_mov_b32_e32 v93, 0x31800000 | |
v_mul_f32_e32 v91, v95, v91 | |
v_cmp_gt_f32_e64 s[10:11], v93, v86 | |
v_mad_f32 v86, -v92, v91, 1.0 | |
v_cndmask_b32_e32 v86, 1.0, v86, vcc | |
v_cndmask_b32_e64 v86, v86, v90, s[4:5] | |
v_and_b32_e32 v90, s52, v83 | |
v_or_b32_e32 v86, v90, v86 | |
v_mad_f32 v90, v102, v83, v83 | |
v_cndmask_b32_e64 v86, v86, v90, s[8:9] | |
v_mul_f32_e32 v90, 0x3f8375d4, v83 | |
v_mac_f32_e32 v90, 0x41000000, v83 | |
v_mul_f32_e32 v90, 0x3e000000, v90 | |
v_cndmask_b32_e64 v86, v86, v90, s[10:11] | |
v_cmp_u_f32_e32 vcc, v83, v83 | |
v_cndmask_b32_e32 v83, v86, v83, vcc | |
v_subrev_f32_e32 v83, v83, v88 | |
v_mul_f32_e64 v86, s19, -v88 | |
v_mac_f32_e32 v86, v83, v89 | |
v_mac_f32_e32 v5, v86, v87 | |
v_mac_f32_e32 v31, v0, v67 | |
BB7_30: ; %Flow1250 | |
; in Loop: Header=BB7_11 Depth=1 | |
s_or_b64 exec, exec, s[30:31] | |
BB7_31: ; in Loop: Header=BB7_11 Depth=1 | |
s_or_b64 exec, exec, s[12:13] | |
v_lshrrev_b32_e32 v67, 4, v69 | |
v_and_b32_e32 v67, 1, v67 | |
v_cmp_eq_u32_e32 vcc, 1, v67 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[12:13], exec, s[4:5] | |
; mask branch BB7_35 | |
s_cbranch_execz BB7_35 | |
BB7_32: ; in Loop: Header=BB7_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[86:89], v55 offset0:64 offset1:65 | |
s_and_b64 s[4:5], exec, s[0:1] | |
v_cmp_ne_u32_e32 vcc, v61, v77 | |
s_xor_b64 s[4:5], s[4:5], -1 | |
s_or_b64 s[4:5], s[4:5], vcc | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v80, v73, v87 | |
s_and_b64 s[8:9], exec, s[2:3] | |
s_or_b64 s[4:5], s[8:9], s[4:5] | |
v_subrev_f32_e32 v79, v72, v86 | |
v_mul_f32_e32 v83, v80, v80 | |
v_cndmask_b32_e64 v84, 0, 1.0, s[4:5] | |
v_subrev_f32_e32 v82, v74, v88 | |
v_mac_f32_e32 v83, v79, v79 | |
v_mac_f32_e32 v83, v82, v82 | |
v_mul_f32_e32 v84, s26, v84 | |
v_subrev_f32_e32 v67, v75, v89 | |
v_cmp_lt_f32_e32 vcc, v83, v84 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[30:31], exec, s[4:5] | |
; mask branch BB7_34 | |
s_cbranch_execz BB7_34 | |
BB7_33: ; in Loop: Header=BB7_11 Depth=1 | |
v_max_f32_e32 v86, 0x34cd15ae, v83 | |
v_mul_f32_e32 v87, v75, v89 | |
v_rsq_f32_e32 v89, v86 | |
v_lshrrev_b32_e32 v83, 4, v65 | |
v_and_b32_e32 v83, 1, v83 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e64 s[4:5], 1, v83 | |
ds_read_b64 v[83:84], v56 offset:256 | |
v_mul_f32_e32 v93, v89, v89 | |
v_cndmask_b32_e64 v88, 0, 1.0, s[4:5] | |
v_mul_f32_e32 v94, v93, v93 | |
v_mul_f32_e32 v94, v88, v94 | |
v_mul_f32_e32 v95, v93, v94 | |
v_mad_f32 v94, v94, v93, s42 | |
v_cmp_gt_f32_e32 vcc, s27, v86 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v83, v70, v83 | |
v_mul_f32_e32 v84, v71, v84 | |
v_mad_f32 v97, v95, v95, s43 | |
v_mul_f32_e32 v94, 0xbe2aaaab, v94 | |
v_mad_f32 v96, v84, v95, -v83 | |
v_cndmask_b32_e64 v90, 0, 1.0, vcc | |
v_mul_f32_e32 v84, v97, v84 | |
v_mul_f32_e32 v83, v83, v94 | |
v_mac_f32_e32 v83, 0x3daaaaaa, v84 | |
v_mul_f32_e32 v91, v88, v90 | |
v_mul_f32_e32 v92, v9, v86 | |
v_mul_f32_e32 v84, v90, v93 | |
v_mac_f32_e32 v8, v83, v91 | |
v_mul_f32_e32 v83, v92, v92 | |
v_mov_b32_e32 v90, 0x3a92b707 | |
v_madak_f32_e32 v90, v90, v83, 0x3ded3cb2 | |
v_mov_b32_e32 v91, 0x3c739487 | |
v_madak_f32_e32 v91, v91, v83, 0x3f01e2bc | |
v_mad_f32 v90, v90, v83, 1.0 | |
v_mac_f32_e32 v90, v92, v91 | |
v_mov_b32_e32 v91, 0xb2951928 | |
v_rcp_f32_e32 v90, v90 | |
v_madak_f32_e32 v91, v91, v83, 0xb85ffb93 | |
v_mov_b32_e32 v94, 0x35c55945 | |
v_madak_f32_e32 v94, v94, v83, 0x3a83ca0c | |
v_madak_f32_e32 v91, v91, v83, 0xbc9ded90 | |
v_madak_f32_e32 v94, v94, v83, 0x3d8eaf3b | |
v_madak_f32_e32 v83, v91, v83, 0xbf409397 | |
v_mac_f32_e32 v83, v92, v94 | |
v_mul_f32_e32 v90, v37, v90 | |
v_mul_f32_e32 v84, v95, v84 | |
v_mul_f32_e32 v83, v83, v90 | |
v_mul_f32_e32 v90, v88, v93 | |
v_mac_f32_e32 v83, v89, v90 | |
v_mul_f32_e32 v84, v96, v84 | |
v_mac_f32_e32 v84, v83, v87 | |
v_mul_f32_e32 v83, s18, v86 | |
v_mul_f32_e32 v83, v89, v83 | |
v_and_b32_e32 v86, s50, v83 | |
v_mov_b32_e32 v90, 0x3fa00000 | |
v_cmp_gt_f32_e64 s[4:5], v90, v86 | |
v_mul_f32_e32 v90, v86, v86 | |
v_rcp_f32_e32 v91, v90 | |
v_add_f32_e32 v92, -1.0, v86 | |
v_mov_b32_e32 v94, 0xbd777f97 | |
v_mov_b32_e32 v95, 0x4036db6e | |
v_cndmask_b32_e64 v91, v91, v92, s[4:5] | |
v_mov_b32_e32 v92, 0x3f580000 | |
v_cmp_gt_f32_e64 s[8:9], v92, v86 | |
v_cndmask_b32_e64 v90, v91, v90, s[8:9] | |
v_mov_b32_e32 v92, 0xc1b38712 | |
v_madak_f32_e32 v94, v94, v90, 0x40d23f7c | |
v_madak_f32_e32 v92, v92, v90, 0x43ed43a7 | |
v_madak_f32_e32 v94, v90, v94, 0x42d9451f | |
v_madak_f32_e32 v92, v90, v92, 0x451f90ce | |
v_madak_f32_e32 v94, v90, v94, 0x43d6810b | |
v_madak_f32_e32 v92, v90, v92, 0x4547fdbb | |
v_madak_f32_e32 v94, v90, v94, 0x442158c9 | |
v_madak_f32_e32 v92, v90, v92, 0x44c01759 | |
v_madak_f32_e32 v94, v90, v94, 0x43d9486f | |
v_madak_f32_e32 v92, v90, v92, 0x43a2e571 | |
v_madak_f32_e32 v94, v90, v94, 0x4309a863 | |
v_cmp_gt_f32_e32 vcc, v95, v86 | |
v_madak_f32_e32 v92, v90, v92, 0x41f2b459 | |
v_madak_f32_e32 v94, v90, v94, 0x419d35ce | |
v_cndmask_b32_e32 v92, v92, v94, vcc | |
v_mov_b32_e32 v94, 0x3c445aa3 | |
v_madak_f32_e32 v94, v94, v90, 0x3c5f6e13 | |
v_madak_f32_e32 v94, v90, v94, 0x3e013307 | |
v_madak_f32_e32 v94, v90, v94, 0x3d931ae7 | |
v_madak_f32_e32 v94, v90, v94, 0x3f0a5785 | |
v_mov_b32_e32 v93, 0xc11d077e | |
v_madak_f32_e32 v94, v90, v94, 0x3dd9f331 | |
v_mov_b32_e32 v91, 0xc3f1c275 | |
v_madak_f32_e32 v93, v93, v90, 0xc2a2932b | |
v_cndmask_b32_e64 v92, v92, v94, s[4:5] | |
v_mov_b32_e32 v94, 0xb684e21a | |
v_madak_f32_e32 v91, v91, v90, 0xc480230b | |
v_madak_f32_e32 v93, v90, v93, 0xc3389ae7 | |
v_madak_f32_e32 v94, v94, v90, 0x390aee49 | |
v_madak_f32_e32 v91, v90, v91, 0xc41f6441 | |
v_madak_f32_e32 v93, v90, v93, 0xc322658c | |
v_madak_f32_e32 v94, v90, v94, 0x3ba68116 | |
v_madak_f32_e32 v91, v90, v91, 0xc320a2ea | |
v_madak_f32_e32 v93, v90, v93, 0xc2798057 | |
v_madak_f32_e32 v94, v90, v94, 0x3d852a63 | |
v_madak_f32_e32 v91, v90, v91, 0xc18e104b | |
v_madak_f32_e32 v93, v90, v93, 0xc128f022 | |
v_madak_f32_e32 v94, v90, v94, 0x3ecbbbce | |
v_madak_f32_e32 v91, v90, v91, 0xbf4c9dd4 | |
v_madak_f32_e32 v93, v90, v93, 0xbf31a0b7 | |
v_cndmask_b32_e64 v92, v92, v94, s[8:9] | |
v_and_b32_e32 v94, s51, v83 | |
v_mov_b32_e32 v95, 0xbf100000 | |
v_madak_f32_e32 v91, v90, v91, 0xbc21a092 | |
v_madak_f32_e32 v93, v90, v93, 0xbc21a093 | |
v_mad_f32 v95, v94, -v94, v95 | |
v_cndmask_b32_e32 v91, v91, v93, vcc | |
v_cmp_gt_f32_e32 vcc, 0, v95 | |
v_cndmask_b32_e64 v96, 0.5, -0.5, vcc | |
v_mov_b32_e32 v97, 0x3fb8aa3b | |
v_mac_f32_e32 v96, v97, v95 | |
v_cvt_i32_f32_e32 v96, v96 | |
v_mov_b32_e32 v93, 0xbb0df9c0 | |
v_madak_f32_e32 v93, v93, v90, 0x3d1151b3 | |
v_madak_f32_e32 v93, v90, v93, 0xbde31cc2 | |
v_cvt_f32_i32_e32 v100, v96 | |
v_madak_f32_e32 v93, v90, v93, 0x3ea2fe54 | |
v_mov_b32_e32 v101, 0xbf317180 | |
v_madak_f32_e32 v93, v90, v93, 0xbebe9208 | |
v_mad_f32 v102, v101, v100, v95 | |
v_mov_b32_e32 v103, 0xb717f7d1 | |
v_madak_f32_e32 v93, v90, v93, 0x3ed46805 | |
v_mad_f32 v104, v103, v100, v102 | |
v_madak_f32_e32 v93, v90, v93, 0xbb1acdc6 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mov_b32_e32 v106, 0xb5ddea0e | |
v_mov_b32_e32 v107, 0x3331bb4c | |
v_cndmask_b32_e64 v91, v91, v93, s[4:5] | |
v_mov_b32_e32 v93, 0xb7c756b1 | |
v_mad_f32 v108, v107, v105, v106 | |
v_mov_b32_e32 v109, 0x388ab355 | |
v_madak_f32_e32 v93, v93, v90, 0xbbbd1489 | |
v_mad_f32 v108, v108, v105, v109 | |
v_mov_b32_e32 v110, 0xbb360b61 | |
v_madak_f32_e32 v93, v90, v93, 0xbce9528f | |
v_mad_f32 v108, v108, v105, v110 | |
v_mov_b32_e32 v111, 0x3e2aaaab | |
v_madak_f32_e32 v93, v90, v93, 0xbea66beb | |
v_mad_f32 v108, v108, v105, v111 | |
v_madak_f32_e32 v93, v90, v93, 0x3e0375d4 | |
v_mad_f32 v105, -v105, v108, v104 | |
v_mad_f32 v90, v90, v92, 1.0 | |
v_mov_b32_e32 v92, 0x6f800000 | |
v_cmp_gt_f32_e64 vcc, |v90|, v92 | |
v_mov_b32_e32 v98, 0x2f800000 | |
v_sub_f32_e32 v108, 2.0, v105 | |
v_cndmask_b32_e32 v99, 1.0, v98, vcc | |
v_cmp_gt_f32_e64 vcc, |v108|, v92 | |
v_mul_f32_e32 v90, v99, v90 | |
v_cndmask_b32_e32 v112, 1.0, v98, vcc | |
v_mul_f32_e64 v108, v108, -v112 | |
v_rcp_f32_e32 v90, v90 | |
v_rcp_f32_e32 v108, v108 | |
v_cndmask_b32_e64 v91, v91, v93, s[8:9] | |
v_lshlrev_b32_e32 v93, 23, v96 | |
v_mul_f32_e32 v90, v90, v91 | |
v_mul_f32_e32 v91, v105, v104 | |
v_mul_f32_e32 v91, v108, v91 | |
v_mul_f32_e32 v91, v91, v112 | |
v_mad_f32 v91, -v100, v103, v91 | |
v_subrev_f32_e32 v91, v102, v91 | |
v_sub_f32_e32 v91, 1.0, v91 | |
v_add_i32_e32 v91, vcc, v91, v93 | |
v_mov_b32_e32 v93, 0xc2aeac4f | |
v_cmp_nlt_f32_e32 vcc, v95, v93 | |
v_mov_b32_e32 v96, 0x42b17218 | |
v_cndmask_b32_e32 v91, 0, v91, vcc | |
v_cmp_lt_f32_e32 vcc, v95, v96 | |
v_mov_b32_e32 v100, 0x7f800000 | |
v_cndmask_b32_e32 v91, v100, v91, vcc | |
v_cmp_u_f32_e32 vcc, v95, v95 | |
v_cndmask_b32_e32 v91, v91, v95, vcc | |
v_subrev_f32_e32 v95, v86, v94 | |
v_mul_f32_e32 v102, v90, v99 | |
v_add_f32_e32 v94, v86, v94 | |
v_mad_f32 v94, v94, v95, v102 | |
v_cmp_gt_f32_e32 vcc, 0, v94 | |
v_cndmask_b32_e64 v95, 0.5, -0.5, vcc | |
v_mac_f32_e32 v95, v97, v94 | |
v_cvt_i32_f32_e32 v95, v95 | |
v_madak_f32_e32 v90, v99, v90, 0x3f58560b | |
v_mad_f32 v81, -v84, v82, v81 | |
v_mad_f32 v26, v82, v84, v26 | |
v_cvt_f32_i32_e32 v97, v95 | |
v_lshlrev_b32_e32 v95, 23, v95 | |
v_mad_f32 v78, -v84, v80, v78 | |
v_mad_f32 v25, v80, v84, v25 | |
v_mad_f32 v101, v101, v97, v94 | |
v_mad_f32 v104, v103, v97, v101 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mac_f32_e32 v106, v107, v105 | |
v_mac_f32_e32 v109, v106, v105 | |
v_mac_f32_e32 v110, v109, v105 | |
v_mac_f32_e32 v111, v110, v105 | |
v_mad_f32 v105, -v105, v111, v104 | |
v_mul_f32_e32 v104, v105, v104 | |
v_sub_f32_e32 v105, 2.0, v105 | |
v_cmp_gt_f32_e64 vcc, |v105|, v92 | |
v_cndmask_b32_e32 v106, 1.0, v98, vcc | |
v_mul_f32_e64 v105, v105, -v106 | |
v_rcp_f32_e32 v105, v105 | |
v_mad_f32 v76, -v84, v79, v76 | |
v_mad_f32 v24, v79, v84, v24 | |
v_mad_f32 v85, -v67, v0, v85 | |
v_mul_f32_e32 v104, v105, v104 | |
v_mul_f32_e32 v104, v104, v106 | |
v_mad_f32 v97, -v97, v103, v104 | |
v_subrev_f32_e32 v97, v101, v97 | |
v_sub_f32_e32 v97, 1.0, v97 | |
v_add_i32_e32 v95, vcc, v97, v95 | |
v_cmp_nlt_f32_e32 vcc, v94, v93 | |
v_cndmask_b32_e32 v93, 0, v95, vcc | |
v_cmp_lt_f32_e32 vcc, v94, v96 | |
v_cndmask_b32_e32 v93, v100, v93, vcc | |
v_cmp_gt_f32_e64 vcc, |v86|, v92 | |
v_cndmask_b32_e32 v92, 1.0, v98, vcc | |
v_mul_f32_e32 v95, v92, v86 | |
v_rcp_f32_e32 v95, v95 | |
v_cmp_u_f32_e32 vcc, v94, v94 | |
v_cndmask_b32_e32 v93, v93, v94, vcc | |
v_mul_f32_e32 v91, v93, v91 | |
v_mov_b32_e32 v93, 0x40c00000 | |
v_cmp_gt_f32_e32 vcc, v93, v86 | |
v_mov_b32_e32 v93, 0x31800000 | |
v_mul_f32_e32 v91, v95, v91 | |
v_cmp_gt_f32_e64 s[10:11], v93, v86 | |
v_mad_f32 v86, -v92, v91, 1.0 | |
v_cndmask_b32_e32 v86, 1.0, v86, vcc | |
v_cndmask_b32_e64 v86, v86, v90, s[4:5] | |
v_and_b32_e32 v90, s52, v83 | |
v_or_b32_e32 v86, v90, v86 | |
v_mad_f32 v90, v102, v83, v83 | |
v_cndmask_b32_e64 v86, v86, v90, s[8:9] | |
v_mul_f32_e32 v90, 0x3f8375d4, v83 | |
v_mac_f32_e32 v90, 0x41000000, v83 | |
v_mul_f32_e32 v90, 0x3e000000, v90 | |
v_cndmask_b32_e64 v86, v86, v90, s[10:11] | |
v_cmp_u_f32_e32 vcc, v83, v83 | |
v_cndmask_b32_e32 v83, v86, v83, vcc | |
v_subrev_f32_e32 v83, v83, v88 | |
v_mul_f32_e64 v86, s19, -v88 | |
v_mac_f32_e32 v86, v83, v89 | |
v_mac_f32_e32 v5, v86, v87 | |
v_mac_f32_e32 v27, v0, v67 | |
BB7_34: ; %Flow1249 | |
; in Loop: Header=BB7_11 Depth=1 | |
s_or_b64 exec, exec, s[30:31] | |
BB7_35: ; in Loop: Header=BB7_11 Depth=1 | |
s_or_b64 exec, exec, s[12:13] | |
v_lshrrev_b32_e32 v67, 5, v69 | |
v_and_b32_e32 v67, 1, v67 | |
v_cmp_eq_u32_e32 vcc, 1, v67 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[12:13], exec, s[4:5] | |
; mask branch BB7_39 | |
s_cbranch_execz BB7_39 | |
BB7_36: ; in Loop: Header=BB7_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[86:89], v55 offset0:80 offset1:81 | |
s_and_b64 s[4:5], exec, s[0:1] | |
v_cmp_ne_u32_e32 vcc, v60, v77 | |
s_xor_b64 s[4:5], s[4:5], -1 | |
s_or_b64 s[4:5], s[4:5], vcc | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v80, v73, v87 | |
s_and_b64 s[8:9], exec, s[2:3] | |
s_or_b64 s[4:5], s[8:9], s[4:5] | |
v_subrev_f32_e32 v79, v72, v86 | |
v_mul_f32_e32 v83, v80, v80 | |
v_cndmask_b32_e64 v84, 0, 1.0, s[4:5] | |
v_subrev_f32_e32 v82, v74, v88 | |
v_mac_f32_e32 v83, v79, v79 | |
v_mac_f32_e32 v83, v82, v82 | |
v_mul_f32_e32 v84, s26, v84 | |
v_subrev_f32_e32 v67, v75, v89 | |
v_cmp_lt_f32_e32 vcc, v83, v84 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[30:31], exec, s[4:5] | |
; mask branch BB7_38 | |
s_cbranch_execz BB7_38 | |
BB7_37: ; in Loop: Header=BB7_11 Depth=1 | |
v_max_f32_e32 v86, 0x34cd15ae, v83 | |
v_mul_f32_e32 v87, v75, v89 | |
v_rsq_f32_e32 v89, v86 | |
v_lshrrev_b32_e32 v83, 5, v65 | |
v_and_b32_e32 v83, 1, v83 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e64 s[4:5], 1, v83 | |
ds_read_b64 v[83:84], v56 offset:320 | |
v_mul_f32_e32 v93, v89, v89 | |
v_cndmask_b32_e64 v88, 0, 1.0, s[4:5] | |
v_mul_f32_e32 v94, v93, v93 | |
v_mul_f32_e32 v94, v88, v94 | |
v_mul_f32_e32 v95, v93, v94 | |
v_mad_f32 v94, v94, v93, s42 | |
v_cmp_gt_f32_e32 vcc, s27, v86 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v83, v70, v83 | |
v_mul_f32_e32 v84, v71, v84 | |
v_mad_f32 v97, v95, v95, s43 | |
v_mul_f32_e32 v94, 0xbe2aaaab, v94 | |
v_mad_f32 v96, v84, v95, -v83 | |
v_cndmask_b32_e64 v90, 0, 1.0, vcc | |
v_mul_f32_e32 v84, v97, v84 | |
v_mul_f32_e32 v83, v83, v94 | |
v_mac_f32_e32 v83, 0x3daaaaaa, v84 | |
v_mul_f32_e32 v91, v88, v90 | |
v_mul_f32_e32 v92, v9, v86 | |
v_mul_f32_e32 v84, v90, v93 | |
v_mac_f32_e32 v8, v83, v91 | |
v_mul_f32_e32 v83, v92, v92 | |
v_mov_b32_e32 v90, 0x3a92b707 | |
v_madak_f32_e32 v90, v90, v83, 0x3ded3cb2 | |
v_mov_b32_e32 v91, 0x3c739487 | |
v_madak_f32_e32 v91, v91, v83, 0x3f01e2bc | |
v_mad_f32 v90, v90, v83, 1.0 | |
v_mac_f32_e32 v90, v92, v91 | |
v_mov_b32_e32 v91, 0xb2951928 | |
v_rcp_f32_e32 v90, v90 | |
v_madak_f32_e32 v91, v91, v83, 0xb85ffb93 | |
v_mov_b32_e32 v94, 0x35c55945 | |
v_madak_f32_e32 v94, v94, v83, 0x3a83ca0c | |
v_madak_f32_e32 v91, v91, v83, 0xbc9ded90 | |
v_madak_f32_e32 v94, v94, v83, 0x3d8eaf3b | |
v_madak_f32_e32 v83, v91, v83, 0xbf409397 | |
v_mac_f32_e32 v83, v92, v94 | |
v_mul_f32_e32 v90, v37, v90 | |
v_mul_f32_e32 v84, v95, v84 | |
v_mul_f32_e32 v83, v83, v90 | |
v_mul_f32_e32 v90, v88, v93 | |
v_mac_f32_e32 v83, v89, v90 | |
v_mul_f32_e32 v84, v96, v84 | |
v_mac_f32_e32 v84, v83, v87 | |
v_mul_f32_e32 v83, s18, v86 | |
v_mul_f32_e32 v83, v89, v83 | |
v_and_b32_e32 v86, s50, v83 | |
v_mov_b32_e32 v90, 0x3fa00000 | |
v_cmp_gt_f32_e64 s[4:5], v90, v86 | |
v_mul_f32_e32 v90, v86, v86 | |
v_rcp_f32_e32 v91, v90 | |
v_add_f32_e32 v92, -1.0, v86 | |
v_mov_b32_e32 v94, 0xbd777f97 | |
v_mov_b32_e32 v95, 0x4036db6e | |
v_cndmask_b32_e64 v91, v91, v92, s[4:5] | |
v_mov_b32_e32 v92, 0x3f580000 | |
v_cmp_gt_f32_e64 s[8:9], v92, v86 | |
v_cndmask_b32_e64 v90, v91, v90, s[8:9] | |
v_mov_b32_e32 v92, 0xc1b38712 | |
v_madak_f32_e32 v94, v94, v90, 0x40d23f7c | |
v_madak_f32_e32 v92, v92, v90, 0x43ed43a7 | |
v_madak_f32_e32 v94, v90, v94, 0x42d9451f | |
v_madak_f32_e32 v92, v90, v92, 0x451f90ce | |
v_madak_f32_e32 v94, v90, v94, 0x43d6810b | |
v_madak_f32_e32 v92, v90, v92, 0x4547fdbb | |
v_madak_f32_e32 v94, v90, v94, 0x442158c9 | |
v_madak_f32_e32 v92, v90, v92, 0x44c01759 | |
v_madak_f32_e32 v94, v90, v94, 0x43d9486f | |
v_madak_f32_e32 v92, v90, v92, 0x43a2e571 | |
v_madak_f32_e32 v94, v90, v94, 0x4309a863 | |
v_cmp_gt_f32_e32 vcc, v95, v86 | |
v_madak_f32_e32 v92, v90, v92, 0x41f2b459 | |
v_madak_f32_e32 v94, v90, v94, 0x419d35ce | |
v_cndmask_b32_e32 v92, v92, v94, vcc | |
v_mov_b32_e32 v94, 0x3c445aa3 | |
v_madak_f32_e32 v94, v94, v90, 0x3c5f6e13 | |
v_madak_f32_e32 v94, v90, v94, 0x3e013307 | |
v_madak_f32_e32 v94, v90, v94, 0x3d931ae7 | |
v_madak_f32_e32 v94, v90, v94, 0x3f0a5785 | |
v_mov_b32_e32 v93, 0xc11d077e | |
v_madak_f32_e32 v94, v90, v94, 0x3dd9f331 | |
v_mov_b32_e32 v91, 0xc3f1c275 | |
v_madak_f32_e32 v93, v93, v90, 0xc2a2932b | |
v_cndmask_b32_e64 v92, v92, v94, s[4:5] | |
v_mov_b32_e32 v94, 0xb684e21a | |
v_madak_f32_e32 v91, v91, v90, 0xc480230b | |
v_madak_f32_e32 v93, v90, v93, 0xc3389ae7 | |
v_madak_f32_e32 v94, v94, v90, 0x390aee49 | |
v_madak_f32_e32 v91, v90, v91, 0xc41f6441 | |
v_madak_f32_e32 v93, v90, v93, 0xc322658c | |
v_madak_f32_e32 v94, v90, v94, 0x3ba68116 | |
v_madak_f32_e32 v91, v90, v91, 0xc320a2ea | |
v_madak_f32_e32 v93, v90, v93, 0xc2798057 | |
v_madak_f32_e32 v94, v90, v94, 0x3d852a63 | |
v_madak_f32_e32 v91, v90, v91, 0xc18e104b | |
v_madak_f32_e32 v93, v90, v93, 0xc128f022 | |
v_madak_f32_e32 v94, v90, v94, 0x3ecbbbce | |
v_madak_f32_e32 v91, v90, v91, 0xbf4c9dd4 | |
v_madak_f32_e32 v93, v90, v93, 0xbf31a0b7 | |
v_cndmask_b32_e64 v92, v92, v94, s[8:9] | |
v_and_b32_e32 v94, s51, v83 | |
v_mov_b32_e32 v95, 0xbf100000 | |
v_madak_f32_e32 v91, v90, v91, 0xbc21a092 | |
v_madak_f32_e32 v93, v90, v93, 0xbc21a093 | |
v_mad_f32 v95, v94, -v94, v95 | |
v_cndmask_b32_e32 v91, v91, v93, vcc | |
v_cmp_gt_f32_e32 vcc, 0, v95 | |
v_cndmask_b32_e64 v96, 0.5, -0.5, vcc | |
v_mov_b32_e32 v97, 0x3fb8aa3b | |
v_mac_f32_e32 v96, v97, v95 | |
v_cvt_i32_f32_e32 v96, v96 | |
v_mov_b32_e32 v93, 0xbb0df9c0 | |
v_madak_f32_e32 v93, v93, v90, 0x3d1151b3 | |
v_madak_f32_e32 v93, v90, v93, 0xbde31cc2 | |
v_cvt_f32_i32_e32 v100, v96 | |
v_madak_f32_e32 v93, v90, v93, 0x3ea2fe54 | |
v_mov_b32_e32 v101, 0xbf317180 | |
v_madak_f32_e32 v93, v90, v93, 0xbebe9208 | |
v_mad_f32 v102, v101, v100, v95 | |
v_mov_b32_e32 v103, 0xb717f7d1 | |
v_madak_f32_e32 v93, v90, v93, 0x3ed46805 | |
v_mad_f32 v104, v103, v100, v102 | |
v_madak_f32_e32 v93, v90, v93, 0xbb1acdc6 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mov_b32_e32 v106, 0xb5ddea0e | |
v_mov_b32_e32 v107, 0x3331bb4c | |
v_cndmask_b32_e64 v91, v91, v93, s[4:5] | |
v_mov_b32_e32 v93, 0xb7c756b1 | |
v_mad_f32 v108, v107, v105, v106 | |
v_mov_b32_e32 v109, 0x388ab355 | |
v_madak_f32_e32 v93, v93, v90, 0xbbbd1489 | |
v_mad_f32 v108, v108, v105, v109 | |
v_mov_b32_e32 v110, 0xbb360b61 | |
v_madak_f32_e32 v93, v90, v93, 0xbce9528f | |
v_mad_f32 v108, v108, v105, v110 | |
v_mov_b32_e32 v111, 0x3e2aaaab | |
v_madak_f32_e32 v93, v90, v93, 0xbea66beb | |
v_mad_f32 v108, v108, v105, v111 | |
v_madak_f32_e32 v93, v90, v93, 0x3e0375d4 | |
v_mad_f32 v105, -v105, v108, v104 | |
v_mad_f32 v90, v90, v92, 1.0 | |
v_mov_b32_e32 v92, 0x6f800000 | |
v_cmp_gt_f32_e64 vcc, |v90|, v92 | |
v_mov_b32_e32 v98, 0x2f800000 | |
v_sub_f32_e32 v108, 2.0, v105 | |
v_cndmask_b32_e32 v99, 1.0, v98, vcc | |
v_cmp_gt_f32_e64 vcc, |v108|, v92 | |
v_mul_f32_e32 v90, v99, v90 | |
v_cndmask_b32_e32 v112, 1.0, v98, vcc | |
v_mul_f32_e64 v108, v108, -v112 | |
v_rcp_f32_e32 v90, v90 | |
v_rcp_f32_e32 v108, v108 | |
v_cndmask_b32_e64 v91, v91, v93, s[8:9] | |
v_lshlrev_b32_e32 v93, 23, v96 | |
v_mul_f32_e32 v90, v90, v91 | |
v_mul_f32_e32 v91, v105, v104 | |
v_mul_f32_e32 v91, v108, v91 | |
v_mul_f32_e32 v91, v91, v112 | |
v_mad_f32 v91, -v100, v103, v91 | |
v_subrev_f32_e32 v91, v102, v91 | |
v_sub_f32_e32 v91, 1.0, v91 | |
v_add_i32_e32 v91, vcc, v91, v93 | |
v_mov_b32_e32 v93, 0xc2aeac4f | |
v_cmp_nlt_f32_e32 vcc, v95, v93 | |
v_mov_b32_e32 v96, 0x42b17218 | |
v_cndmask_b32_e32 v91, 0, v91, vcc | |
v_cmp_lt_f32_e32 vcc, v95, v96 | |
v_mov_b32_e32 v100, 0x7f800000 | |
v_cndmask_b32_e32 v91, v100, v91, vcc | |
v_cmp_u_f32_e32 vcc, v95, v95 | |
v_cndmask_b32_e32 v91, v91, v95, vcc | |
v_subrev_f32_e32 v95, v86, v94 | |
v_mul_f32_e32 v102, v90, v99 | |
v_add_f32_e32 v94, v86, v94 | |
v_mad_f32 v94, v94, v95, v102 | |
v_cmp_gt_f32_e32 vcc, 0, v94 | |
v_cndmask_b32_e64 v95, 0.5, -0.5, vcc | |
v_mac_f32_e32 v95, v97, v94 | |
v_cvt_i32_f32_e32 v95, v95 | |
v_madak_f32_e32 v90, v99, v90, 0x3f58560b | |
v_mad_f32 v81, -v84, v82, v81 | |
v_mad_f32 v22, v82, v84, v22 | |
v_cvt_f32_i32_e32 v97, v95 | |
v_lshlrev_b32_e32 v95, 23, v95 | |
v_mad_f32 v78, -v84, v80, v78 | |
v_mad_f32 v21, v80, v84, v21 | |
v_mad_f32 v101, v101, v97, v94 | |
v_mad_f32 v104, v103, v97, v101 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mac_f32_e32 v106, v107, v105 | |
v_mac_f32_e32 v109, v106, v105 | |
v_mac_f32_e32 v110, v109, v105 | |
v_mac_f32_e32 v111, v110, v105 | |
v_mad_f32 v105, -v105, v111, v104 | |
v_mul_f32_e32 v104, v105, v104 | |
v_sub_f32_e32 v105, 2.0, v105 | |
v_cmp_gt_f32_e64 vcc, |v105|, v92 | |
v_cndmask_b32_e32 v106, 1.0, v98, vcc | |
v_mul_f32_e64 v105, v105, -v106 | |
v_rcp_f32_e32 v105, v105 | |
v_mad_f32 v76, -v84, v79, v76 | |
v_mad_f32 v20, v79, v84, v20 | |
v_mad_f32 v85, -v67, v0, v85 | |
v_mul_f32_e32 v104, v105, v104 | |
v_mul_f32_e32 v104, v104, v106 | |
v_mad_f32 v97, -v97, v103, v104 | |
v_subrev_f32_e32 v97, v101, v97 | |
v_sub_f32_e32 v97, 1.0, v97 | |
v_add_i32_e32 v95, vcc, v97, v95 | |
v_cmp_nlt_f32_e32 vcc, v94, v93 | |
v_cndmask_b32_e32 v93, 0, v95, vcc | |
v_cmp_lt_f32_e32 vcc, v94, v96 | |
v_cndmask_b32_e32 v93, v100, v93, vcc | |
v_cmp_gt_f32_e64 vcc, |v86|, v92 | |
v_cndmask_b32_e32 v92, 1.0, v98, vcc | |
v_mul_f32_e32 v95, v92, v86 | |
v_rcp_f32_e32 v95, v95 | |
v_cmp_u_f32_e32 vcc, v94, v94 | |
v_cndmask_b32_e32 v93, v93, v94, vcc | |
v_mul_f32_e32 v91, v93, v91 | |
v_mov_b32_e32 v93, 0x40c00000 | |
v_cmp_gt_f32_e32 vcc, v93, v86 | |
v_mov_b32_e32 v93, 0x31800000 | |
v_mul_f32_e32 v91, v95, v91 | |
v_cmp_gt_f32_e64 s[10:11], v93, v86 | |
v_mad_f32 v86, -v92, v91, 1.0 | |
v_cndmask_b32_e32 v86, 1.0, v86, vcc | |
v_cndmask_b32_e64 v86, v86, v90, s[4:5] | |
v_and_b32_e32 v90, s52, v83 | |
v_or_b32_e32 v86, v90, v86 | |
v_mad_f32 v90, v102, v83, v83 | |
v_cndmask_b32_e64 v86, v86, v90, s[8:9] | |
v_mul_f32_e32 v90, 0x3f8375d4, v83 | |
v_mac_f32_e32 v90, 0x41000000, v83 | |
v_mul_f32_e32 v90, 0x3e000000, v90 | |
v_cndmask_b32_e64 v86, v86, v90, s[10:11] | |
v_cmp_u_f32_e32 vcc, v83, v83 | |
v_cndmask_b32_e32 v83, v86, v83, vcc | |
v_subrev_f32_e32 v83, v83, v88 | |
v_mul_f32_e64 v86, s19, -v88 | |
v_mac_f32_e32 v86, v83, v89 | |
v_mac_f32_e32 v5, v86, v87 | |
v_mac_f32_e32 v23, v0, v67 | |
BB7_38: ; %Flow1248 | |
; in Loop: Header=BB7_11 Depth=1 | |
s_or_b64 exec, exec, s[30:31] | |
BB7_39: ; in Loop: Header=BB7_11 Depth=1 | |
s_or_b64 exec, exec, s[12:13] | |
v_lshrrev_b32_e32 v67, 6, v69 | |
v_and_b32_e32 v67, 1, v67 | |
v_cmp_eq_u32_e32 vcc, 1, v67 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[12:13], exec, s[4:5] | |
; mask branch BB7_43 | |
s_cbranch_execz BB7_43 | |
BB7_40: ; in Loop: Header=BB7_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[86:89], v55 offset0:96 offset1:97 | |
s_and_b64 s[4:5], exec, s[0:1] | |
v_cmp_ne_u32_e32 vcc, v59, v77 | |
s_xor_b64 s[4:5], s[4:5], -1 | |
s_or_b64 s[4:5], s[4:5], vcc | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v80, v73, v87 | |
s_and_b64 s[8:9], exec, s[2:3] | |
s_or_b64 s[4:5], s[8:9], s[4:5] | |
v_subrev_f32_e32 v79, v72, v86 | |
v_mul_f32_e32 v83, v80, v80 | |
v_cndmask_b32_e64 v84, 0, 1.0, s[4:5] | |
v_subrev_f32_e32 v82, v74, v88 | |
v_mac_f32_e32 v83, v79, v79 | |
v_mac_f32_e32 v83, v82, v82 | |
v_mul_f32_e32 v84, s26, v84 | |
v_subrev_f32_e32 v67, v75, v89 | |
v_cmp_lt_f32_e32 vcc, v83, v84 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[30:31], exec, s[4:5] | |
; mask branch BB7_42 | |
s_cbranch_execz BB7_42 | |
BB7_41: ; in Loop: Header=BB7_11 Depth=1 | |
v_max_f32_e32 v86, 0x34cd15ae, v83 | |
v_mul_f32_e32 v87, v75, v89 | |
v_rsq_f32_e32 v89, v86 | |
v_lshrrev_b32_e32 v83, 6, v65 | |
v_and_b32_e32 v83, 1, v83 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e64 s[4:5], 1, v83 | |
ds_read_b64 v[83:84], v56 offset:384 | |
v_mul_f32_e32 v93, v89, v89 | |
v_cndmask_b32_e64 v88, 0, 1.0, s[4:5] | |
v_mul_f32_e32 v94, v93, v93 | |
v_mul_f32_e32 v94, v88, v94 | |
v_mul_f32_e32 v95, v93, v94 | |
v_mad_f32 v94, v94, v93, s42 | |
v_cmp_gt_f32_e32 vcc, s27, v86 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v83, v70, v83 | |
v_mul_f32_e32 v84, v71, v84 | |
v_mad_f32 v97, v95, v95, s43 | |
v_mul_f32_e32 v94, 0xbe2aaaab, v94 | |
v_mad_f32 v96, v84, v95, -v83 | |
v_cndmask_b32_e64 v90, 0, 1.0, vcc | |
v_mul_f32_e32 v84, v97, v84 | |
v_mul_f32_e32 v83, v83, v94 | |
v_mac_f32_e32 v83, 0x3daaaaaa, v84 | |
v_mul_f32_e32 v91, v88, v90 | |
v_mul_f32_e32 v92, v9, v86 | |
v_mul_f32_e32 v84, v90, v93 | |
v_mac_f32_e32 v8, v83, v91 | |
v_mul_f32_e32 v83, v92, v92 | |
v_mov_b32_e32 v90, 0x3a92b707 | |
v_madak_f32_e32 v90, v90, v83, 0x3ded3cb2 | |
v_mov_b32_e32 v91, 0x3c739487 | |
v_madak_f32_e32 v91, v91, v83, 0x3f01e2bc | |
v_mad_f32 v90, v90, v83, 1.0 | |
v_mac_f32_e32 v90, v92, v91 | |
v_mov_b32_e32 v91, 0xb2951928 | |
v_rcp_f32_e32 v90, v90 | |
v_madak_f32_e32 v91, v91, v83, 0xb85ffb93 | |
v_mov_b32_e32 v94, 0x35c55945 | |
v_madak_f32_e32 v94, v94, v83, 0x3a83ca0c | |
v_madak_f32_e32 v91, v91, v83, 0xbc9ded90 | |
v_madak_f32_e32 v94, v94, v83, 0x3d8eaf3b | |
v_madak_f32_e32 v83, v91, v83, 0xbf409397 | |
v_mac_f32_e32 v83, v92, v94 | |
v_mul_f32_e32 v90, v37, v90 | |
v_mul_f32_e32 v84, v95, v84 | |
v_mul_f32_e32 v83, v83, v90 | |
v_mul_f32_e32 v90, v88, v93 | |
v_mac_f32_e32 v83, v89, v90 | |
v_mul_f32_e32 v84, v96, v84 | |
v_mac_f32_e32 v84, v83, v87 | |
v_mul_f32_e32 v83, s18, v86 | |
v_mul_f32_e32 v83, v89, v83 | |
v_and_b32_e32 v86, s50, v83 | |
v_mov_b32_e32 v90, 0x3fa00000 | |
v_cmp_gt_f32_e64 s[4:5], v90, v86 | |
v_mul_f32_e32 v90, v86, v86 | |
v_rcp_f32_e32 v91, v90 | |
v_add_f32_e32 v92, -1.0, v86 | |
v_mov_b32_e32 v94, 0xbd777f97 | |
v_mov_b32_e32 v95, 0x4036db6e | |
v_cndmask_b32_e64 v91, v91, v92, s[4:5] | |
v_mov_b32_e32 v92, 0x3f580000 | |
v_cmp_gt_f32_e64 s[8:9], v92, v86 | |
v_cndmask_b32_e64 v90, v91, v90, s[8:9] | |
v_mov_b32_e32 v92, 0xc1b38712 | |
v_madak_f32_e32 v94, v94, v90, 0x40d23f7c | |
v_madak_f32_e32 v92, v92, v90, 0x43ed43a7 | |
v_madak_f32_e32 v94, v90, v94, 0x42d9451f | |
v_madak_f32_e32 v92, v90, v92, 0x451f90ce | |
v_madak_f32_e32 v94, v90, v94, 0x43d6810b | |
v_madak_f32_e32 v92, v90, v92, 0x4547fdbb | |
v_madak_f32_e32 v94, v90, v94, 0x442158c9 | |
v_madak_f32_e32 v92, v90, v92, 0x44c01759 | |
v_madak_f32_e32 v94, v90, v94, 0x43d9486f | |
v_madak_f32_e32 v92, v90, v92, 0x43a2e571 | |
v_madak_f32_e32 v94, v90, v94, 0x4309a863 | |
v_cmp_gt_f32_e32 vcc, v95, v86 | |
v_madak_f32_e32 v92, v90, v92, 0x41f2b459 | |
v_madak_f32_e32 v94, v90, v94, 0x419d35ce | |
v_cndmask_b32_e32 v92, v92, v94, vcc | |
v_mov_b32_e32 v94, 0x3c445aa3 | |
v_madak_f32_e32 v94, v94, v90, 0x3c5f6e13 | |
v_madak_f32_e32 v94, v90, v94, 0x3e013307 | |
v_madak_f32_e32 v94, v90, v94, 0x3d931ae7 | |
v_madak_f32_e32 v94, v90, v94, 0x3f0a5785 | |
v_mov_b32_e32 v93, 0xc11d077e | |
v_madak_f32_e32 v94, v90, v94, 0x3dd9f331 | |
v_mov_b32_e32 v91, 0xc3f1c275 | |
v_madak_f32_e32 v93, v93, v90, 0xc2a2932b | |
v_cndmask_b32_e64 v92, v92, v94, s[4:5] | |
v_mov_b32_e32 v94, 0xb684e21a | |
v_madak_f32_e32 v91, v91, v90, 0xc480230b | |
v_madak_f32_e32 v93, v90, v93, 0xc3389ae7 | |
v_madak_f32_e32 v94, v94, v90, 0x390aee49 | |
v_madak_f32_e32 v91, v90, v91, 0xc41f6441 | |
v_madak_f32_e32 v93, v90, v93, 0xc322658c | |
v_madak_f32_e32 v94, v90, v94, 0x3ba68116 | |
v_madak_f32_e32 v91, v90, v91, 0xc320a2ea | |
v_madak_f32_e32 v93, v90, v93, 0xc2798057 | |
v_madak_f32_e32 v94, v90, v94, 0x3d852a63 | |
v_madak_f32_e32 v91, v90, v91, 0xc18e104b | |
v_madak_f32_e32 v93, v90, v93, 0xc128f022 | |
v_madak_f32_e32 v94, v90, v94, 0x3ecbbbce | |
v_madak_f32_e32 v91, v90, v91, 0xbf4c9dd4 | |
v_madak_f32_e32 v93, v90, v93, 0xbf31a0b7 | |
v_cndmask_b32_e64 v92, v92, v94, s[8:9] | |
v_and_b32_e32 v94, s51, v83 | |
v_mov_b32_e32 v95, 0xbf100000 | |
v_madak_f32_e32 v91, v90, v91, 0xbc21a092 | |
v_madak_f32_e32 v93, v90, v93, 0xbc21a093 | |
v_mad_f32 v95, v94, -v94, v95 | |
v_cndmask_b32_e32 v91, v91, v93, vcc | |
v_cmp_gt_f32_e32 vcc, 0, v95 | |
v_cndmask_b32_e64 v96, 0.5, -0.5, vcc | |
v_mov_b32_e32 v97, 0x3fb8aa3b | |
v_mac_f32_e32 v96, v97, v95 | |
v_cvt_i32_f32_e32 v96, v96 | |
v_mov_b32_e32 v93, 0xbb0df9c0 | |
v_madak_f32_e32 v93, v93, v90, 0x3d1151b3 | |
v_madak_f32_e32 v93, v90, v93, 0xbde31cc2 | |
v_cvt_f32_i32_e32 v100, v96 | |
v_madak_f32_e32 v93, v90, v93, 0x3ea2fe54 | |
v_mov_b32_e32 v101, 0xbf317180 | |
v_madak_f32_e32 v93, v90, v93, 0xbebe9208 | |
v_mad_f32 v102, v101, v100, v95 | |
v_mov_b32_e32 v103, 0xb717f7d1 | |
v_madak_f32_e32 v93, v90, v93, 0x3ed46805 | |
v_mad_f32 v104, v103, v100, v102 | |
v_madak_f32_e32 v93, v90, v93, 0xbb1acdc6 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mov_b32_e32 v106, 0xb5ddea0e | |
v_mov_b32_e32 v107, 0x3331bb4c | |
v_cndmask_b32_e64 v91, v91, v93, s[4:5] | |
v_mov_b32_e32 v93, 0xb7c756b1 | |
v_mad_f32 v108, v107, v105, v106 | |
v_mov_b32_e32 v109, 0x388ab355 | |
v_madak_f32_e32 v93, v93, v90, 0xbbbd1489 | |
v_mad_f32 v108, v108, v105, v109 | |
v_mov_b32_e32 v110, 0xbb360b61 | |
v_madak_f32_e32 v93, v90, v93, 0xbce9528f | |
v_mad_f32 v108, v108, v105, v110 | |
v_mov_b32_e32 v111, 0x3e2aaaab | |
v_madak_f32_e32 v93, v90, v93, 0xbea66beb | |
v_mad_f32 v108, v108, v105, v111 | |
v_madak_f32_e32 v93, v90, v93, 0x3e0375d4 | |
v_mad_f32 v105, -v105, v108, v104 | |
v_mad_f32 v90, v90, v92, 1.0 | |
v_mov_b32_e32 v92, 0x6f800000 | |
v_cmp_gt_f32_e64 vcc, |v90|, v92 | |
v_mov_b32_e32 v98, 0x2f800000 | |
v_sub_f32_e32 v108, 2.0, v105 | |
v_cndmask_b32_e32 v99, 1.0, v98, vcc | |
v_cmp_gt_f32_e64 vcc, |v108|, v92 | |
v_mul_f32_e32 v90, v99, v90 | |
v_cndmask_b32_e32 v112, 1.0, v98, vcc | |
v_mul_f32_e64 v108, v108, -v112 | |
v_rcp_f32_e32 v90, v90 | |
v_rcp_f32_e32 v108, v108 | |
v_cndmask_b32_e64 v91, v91, v93, s[8:9] | |
v_lshlrev_b32_e32 v93, 23, v96 | |
v_mul_f32_e32 v90, v90, v91 | |
v_mul_f32_e32 v91, v105, v104 | |
v_mul_f32_e32 v91, v108, v91 | |
v_mul_f32_e32 v91, v91, v112 | |
v_mad_f32 v91, -v100, v103, v91 | |
v_subrev_f32_e32 v91, v102, v91 | |
v_sub_f32_e32 v91, 1.0, v91 | |
v_add_i32_e32 v91, vcc, v91, v93 | |
v_mov_b32_e32 v93, 0xc2aeac4f | |
v_cmp_nlt_f32_e32 vcc, v95, v93 | |
v_mov_b32_e32 v96, 0x42b17218 | |
v_cndmask_b32_e32 v91, 0, v91, vcc | |
v_cmp_lt_f32_e32 vcc, v95, v96 | |
v_mov_b32_e32 v100, 0x7f800000 | |
v_cndmask_b32_e32 v91, v100, v91, vcc | |
v_cmp_u_f32_e32 vcc, v95, v95 | |
v_cndmask_b32_e32 v91, v91, v95, vcc | |
v_subrev_f32_e32 v95, v86, v94 | |
v_mul_f32_e32 v102, v90, v99 | |
v_add_f32_e32 v94, v86, v94 | |
v_mad_f32 v94, v94, v95, v102 | |
v_cmp_gt_f32_e32 vcc, 0, v94 | |
v_cndmask_b32_e64 v95, 0.5, -0.5, vcc | |
v_mac_f32_e32 v95, v97, v94 | |
v_cvt_i32_f32_e32 v95, v95 | |
v_madak_f32_e32 v90, v99, v90, 0x3f58560b | |
v_mad_f32 v81, -v84, v82, v81 | |
v_mad_f32 v18, v82, v84, v18 | |
v_cvt_f32_i32_e32 v97, v95 | |
v_lshlrev_b32_e32 v95, 23, v95 | |
v_mad_f32 v78, -v84, v80, v78 | |
v_mad_f32 v17, v80, v84, v17 | |
v_mad_f32 v101, v101, v97, v94 | |
v_mad_f32 v104, v103, v97, v101 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mac_f32_e32 v106, v107, v105 | |
v_mac_f32_e32 v109, v106, v105 | |
v_mac_f32_e32 v110, v109, v105 | |
v_mac_f32_e32 v111, v110, v105 | |
v_mad_f32 v105, -v105, v111, v104 | |
v_mul_f32_e32 v104, v105, v104 | |
v_sub_f32_e32 v105, 2.0, v105 | |
v_cmp_gt_f32_e64 vcc, |v105|, v92 | |
v_cndmask_b32_e32 v106, 1.0, v98, vcc | |
v_mul_f32_e64 v105, v105, -v106 | |
v_rcp_f32_e32 v105, v105 | |
v_mad_f32 v76, -v84, v79, v76 | |
v_mad_f32 v16, v79, v84, v16 | |
v_mad_f32 v85, -v67, v0, v85 | |
v_mul_f32_e32 v104, v105, v104 | |
v_mul_f32_e32 v104, v104, v106 | |
v_mad_f32 v97, -v97, v103, v104 | |
v_subrev_f32_e32 v97, v101, v97 | |
v_sub_f32_e32 v97, 1.0, v97 | |
v_add_i32_e32 v95, vcc, v97, v95 | |
v_cmp_nlt_f32_e32 vcc, v94, v93 | |
v_cndmask_b32_e32 v93, 0, v95, vcc | |
v_cmp_lt_f32_e32 vcc, v94, v96 | |
v_cndmask_b32_e32 v93, v100, v93, vcc | |
v_cmp_gt_f32_e64 vcc, |v86|, v92 | |
v_cndmask_b32_e32 v92, 1.0, v98, vcc | |
v_mul_f32_e32 v95, v92, v86 | |
v_rcp_f32_e32 v95, v95 | |
v_cmp_u_f32_e32 vcc, v94, v94 | |
v_cndmask_b32_e32 v93, v93, v94, vcc | |
v_mul_f32_e32 v91, v93, v91 | |
v_mov_b32_e32 v93, 0x40c00000 | |
v_cmp_gt_f32_e32 vcc, v93, v86 | |
v_mov_b32_e32 v93, 0x31800000 | |
v_mul_f32_e32 v91, v95, v91 | |
v_cmp_gt_f32_e64 s[10:11], v93, v86 | |
v_mad_f32 v86, -v92, v91, 1.0 | |
v_cndmask_b32_e32 v86, 1.0, v86, vcc | |
v_cndmask_b32_e64 v86, v86, v90, s[4:5] | |
v_and_b32_e32 v90, s52, v83 | |
v_or_b32_e32 v86, v90, v86 | |
v_mad_f32 v90, v102, v83, v83 | |
v_cndmask_b32_e64 v86, v86, v90, s[8:9] | |
v_mul_f32_e32 v90, 0x3f8375d4, v83 | |
v_mac_f32_e32 v90, 0x41000000, v83 | |
v_mul_f32_e32 v90, 0x3e000000, v90 | |
v_cndmask_b32_e64 v86, v86, v90, s[10:11] | |
v_cmp_u_f32_e32 vcc, v83, v83 | |
v_cndmask_b32_e32 v83, v86, v83, vcc | |
v_subrev_f32_e32 v83, v83, v88 | |
v_mul_f32_e64 v86, s19, -v88 | |
v_mac_f32_e32 v86, v83, v89 | |
v_mac_f32_e32 v5, v86, v87 | |
v_mac_f32_e32 v19, v0, v67 | |
BB7_42: ; %Flow1247 | |
; in Loop: Header=BB7_11 Depth=1 | |
s_or_b64 exec, exec, s[30:31] | |
BB7_43: ; in Loop: Header=BB7_11 Depth=1 | |
s_or_b64 exec, exec, s[12:13] | |
v_lshrrev_b32_e32 v67, 7, v69 | |
v_and_b32_e32 v67, 1, v67 | |
v_cmp_eq_u32_e32 vcc, 1, v67 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[12:13], exec, s[4:5] | |
; mask branch BB7_47 | |
s_cbranch_execz BB7_47 | |
BB7_44: ; in Loop: Header=BB7_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[86:89], v55 offset0:112 offset1:113 | |
s_and_b64 s[4:5], exec, s[0:1] | |
v_cmp_ne_u32_e32 vcc, v38, v77 | |
s_xor_b64 s[4:5], s[4:5], -1 | |
s_or_b64 s[4:5], s[4:5], vcc | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v73, v73, v87 | |
s_and_b64 s[8:9], exec, s[2:3] | |
s_or_b64 s[4:5], s[8:9], s[4:5] | |
v_subrev_f32_e32 v72, v72, v86 | |
v_mul_f32_e32 v77, v73, v73 | |
v_cndmask_b32_e64 v79, 0, 1.0, s[4:5] | |
v_subrev_f32_e32 v74, v74, v88 | |
v_mac_f32_e32 v77, v72, v72 | |
v_mac_f32_e32 v77, v74, v74 | |
v_mul_f32_e32 v79, s26, v79 | |
v_subrev_f32_e32 v67, v75, v89 | |
v_cmp_lt_f32_e32 vcc, v77, v79 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[30:31], exec, s[4:5] | |
; mask branch BB7_46 | |
s_cbranch_execz BB7_46 | |
BB7_45: ; in Loop: Header=BB7_11 Depth=1 | |
v_lshrrev_b32_e32 v79, 7, v65 | |
v_and_b32_e32 v79, 1, v79 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e32 vcc, 1, v79 | |
ds_read_b64 v[79:80], v56 offset:448 | |
v_max_f32_e32 v77, 0x34cd15ae, v77 | |
v_mad_f32 v83, -v67, v0, v85 | |
v_rsq_f32_e32 v83, v77 | |
v_cndmask_b32_e64 v82, 0, 1.0, vcc | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v71, v71, v80 | |
v_cmp_gt_f32_e32 vcc, s27, v77 | |
v_mul_f32_e32 v80, v83, v83 | |
v_mul_f32_e32 v84, v80, v80 | |
v_mul_f32_e32 v84, v82, v84 | |
v_mul_f32_e32 v85, v80, v84 | |
v_mad_f32 v84, v84, v80, s42 | |
v_mad_f32 v86, v85, v85, s43 | |
v_mul_f32_e32 v70, v70, v79 | |
v_mul_f32_e32 v84, 0xbe2aaaab, v84 | |
v_mul_f32_e32 v84, v70, v84 | |
v_mul_f32_e32 v86, v86, v71 | |
v_cndmask_b32_e64 v79, 0, 1.0, vcc | |
v_mac_f32_e32 v84, 0x3daaaaaa, v86 | |
v_mul_f32_e32 v86, v82, v79 | |
v_mac_f32_e32 v8, v84, v86 | |
v_mul_f32_e32 v86, v9, v77 | |
v_mul_f32_e32 v87, v86, v86 | |
v_mov_b32_e32 v88, 0x3a92b707 | |
v_madak_f32_e32 v88, v88, v87, 0x3ded3cb2 | |
v_mul_f32_e32 v75, v75, v89 | |
v_mov_b32_e32 v89, 0x3c739487 | |
v_madak_f32_e32 v89, v89, v87, 0x3f01e2bc | |
v_mad_f32 v88, v88, v87, 1.0 | |
v_mac_f32_e32 v88, v86, v89 | |
v_mov_b32_e32 v89, 0xb2951928 | |
v_madak_f32_e32 v89, v89, v87, 0xb85ffb93 | |
v_mov_b32_e32 v90, 0x35c55945 | |
v_madak_f32_e32 v90, v90, v87, 0x3a83ca0c | |
v_madak_f32_e32 v89, v89, v87, 0xbc9ded90 | |
v_madak_f32_e32 v90, v90, v87, 0x3d8eaf3b | |
v_madak_f32_e32 v87, v89, v87, 0xbf409397 | |
v_mac_f32_e32 v87, v86, v90 | |
v_rcp_f32_e32 v86, v88 | |
v_mul_f32_e32 v77, s18, v77 | |
v_mul_f32_e32 v77, v83, v77 | |
v_mul_f32_e32 v84, v82, v80 | |
v_mul_f32_e32 v86, v37, v86 | |
v_mul_f32_e32 v86, v87, v86 | |
v_mac_f32_e32 v86, v83, v84 | |
v_and_b32_e32 v84, s50, v77 | |
v_mov_b32_e32 v87, 0x3fa00000 | |
v_cmp_gt_f32_e64 s[4:5], v87, v84 | |
v_mul_f32_e32 v87, v84, v84 | |
v_rcp_f32_e32 v88, v87 | |
v_add_f32_e32 v89, -1.0, v84 | |
v_mov_b32_e32 v90, 0xbd777f97 | |
v_mov_b32_e32 v91, 0x6f800000 | |
v_cndmask_b32_e64 v88, v88, v89, s[4:5] | |
v_mov_b32_e32 v89, 0x3f580000 | |
v_cmp_gt_f32_e64 s[8:9], v89, v84 | |
v_cndmask_b32_e64 v87, v88, v87, s[8:9] | |
v_mov_b32_e32 v89, 0xc11d077e | |
v_mov_b32_e32 v88, 0x4036db6e | |
v_madak_f32_e32 v89, v89, v87, 0xc2a2932b | |
v_cmp_gt_f32_e32 vcc, v88, v84 | |
v_mov_b32_e32 v88, 0xc3f1c275 | |
v_madak_f32_e32 v88, v88, v87, 0xc480230b | |
v_madak_f32_e32 v89, v87, v89, 0xc3389ae7 | |
v_madak_f32_e32 v88, v87, v88, 0xc41f6441 | |
v_madak_f32_e32 v89, v87, v89, 0xc322658c | |
v_madak_f32_e32 v88, v87, v88, 0xc320a2ea | |
v_madak_f32_e32 v89, v87, v89, 0xc2798057 | |
v_madak_f32_e32 v88, v87, v88, 0xc18e104b | |
v_madak_f32_e32 v89, v87, v89, 0xc128f022 | |
v_madak_f32_e32 v88, v87, v88, 0xbf4c9dd4 | |
v_madak_f32_e32 v89, v87, v89, 0xbf31a0b7 | |
v_madak_f32_e32 v88, v87, v88, 0xbc21a092 | |
v_madak_f32_e32 v89, v87, v89, 0xbc21a093 | |
v_madak_f32_e32 v90, v90, v87, 0x40d23f7c | |
v_cndmask_b32_e32 v88, v88, v89, vcc | |
v_mov_b32_e32 v89, 0xc1b38712 | |
v_madak_f32_e32 v89, v89, v87, 0x43ed43a7 | |
v_madak_f32_e32 v90, v87, v90, 0x42d9451f | |
v_madak_f32_e32 v89, v87, v89, 0x451f90ce | |
v_madak_f32_e32 v90, v87, v90, 0x43d6810b | |
v_madak_f32_e32 v89, v87, v89, 0x4547fdbb | |
v_madak_f32_e32 v90, v87, v90, 0x442158c9 | |
v_madak_f32_e32 v89, v87, v89, 0x44c01759 | |
v_madak_f32_e32 v90, v87, v90, 0x43d9486f | |
v_madak_f32_e32 v89, v87, v89, 0x43a2e571 | |
v_madak_f32_e32 v90, v87, v90, 0x4309a863 | |
v_madak_f32_e32 v89, v87, v89, 0x41f2b459 | |
v_madak_f32_e32 v90, v87, v90, 0x419d35ce | |
v_cndmask_b32_e32 v89, v89, v90, vcc | |
v_mov_b32_e32 v90, 0xbb0df9c0 | |
v_madak_f32_e32 v90, v90, v87, 0x3d1151b3 | |
v_madak_f32_e32 v90, v87, v90, 0xbde31cc2 | |
v_madak_f32_e32 v90, v87, v90, 0x3ea2fe54 | |
v_madak_f32_e32 v90, v87, v90, 0xbebe9208 | |
v_madak_f32_e32 v90, v87, v90, 0x3ed46805 | |
v_madak_f32_e32 v90, v87, v90, 0xbb1acdc6 | |
v_cndmask_b32_e64 v88, v88, v90, s[4:5] | |
v_mov_b32_e32 v90, 0x3c445aa3 | |
v_madak_f32_e32 v90, v90, v87, 0x3c5f6e13 | |
v_madak_f32_e32 v90, v87, v90, 0x3e013307 | |
v_madak_f32_e32 v90, v87, v90, 0x3d931ae7 | |
v_madak_f32_e32 v90, v87, v90, 0x3f0a5785 | |
v_madak_f32_e32 v90, v87, v90, 0x3dd9f331 | |
v_cndmask_b32_e64 v89, v89, v90, s[4:5] | |
v_mov_b32_e32 v90, 0xb684e21a | |
v_madak_f32_e32 v90, v90, v87, 0x390aee49 | |
v_madak_f32_e32 v90, v87, v90, 0x3ba68116 | |
v_madak_f32_e32 v90, v87, v90, 0x3d852a63 | |
v_madak_f32_e32 v90, v87, v90, 0x3ecbbbce | |
v_cndmask_b32_e64 v89, v89, v90, s[8:9] | |
v_mad_f32 v89, v87, v89, 1.0 | |
v_and_b32_e32 v94, s51, v77 | |
v_mov_b32_e32 v95, 0xbf100000 | |
v_cmp_gt_f32_e64 vcc, |v89|, v91 | |
v_mov_b32_e32 v92, 0x2f800000 | |
v_mad_f32 v95, v94, -v94, v95 | |
v_cndmask_b32_e32 v93, 1.0, v92, vcc | |
v_cmp_gt_f32_e32 vcc, 0, v95 | |
v_cndmask_b32_e64 v96, 0.5, -0.5, vcc | |
v_mov_b32_e32 v97, 0x3fb8aa3b | |
v_mac_f32_e32 v96, v97, v95 | |
v_mov_b32_e32 v90, 0xb7c756b1 | |
v_cvt_i32_f32_e32 v96, v96 | |
v_madak_f32_e32 v90, v90, v87, 0xbbbd1489 | |
v_madak_f32_e32 v90, v87, v90, 0xbce9528f | |
v_madak_f32_e32 v90, v87, v90, 0xbea66beb | |
v_mul_f32_e32 v89, v93, v89 | |
v_madak_f32_e32 v87, v87, v90, 0x3e0375d4 | |
v_rcp_f32_e32 v89, v89 | |
v_cvt_f32_i32_e32 v90, v96 | |
v_cndmask_b32_e64 v87, v88, v87, s[8:9] | |
v_mov_b32_e32 v88, 0xbf317180 | |
v_mul_f32_e32 v87, v89, v87 | |
v_mad_f32 v89, v88, v90, v95 | |
v_mov_b32_e32 v98, 0xb717f7d1 | |
v_mad_f32 v99, v98, v90, v89 | |
v_mul_f32_e32 v100, v99, v99 | |
v_mov_b32_e32 v101, 0xb5ddea0e | |
v_mov_b32_e32 v102, 0x3331bb4c | |
v_mad_f32 v103, v102, v100, v101 | |
v_mov_b32_e32 v104, 0x388ab355 | |
v_mad_f32 v103, v103, v100, v104 | |
v_mov_b32_e32 v105, 0xbb360b61 | |
v_mad_f32 v103, v103, v100, v105 | |
v_mov_b32_e32 v106, 0x3e2aaaab | |
v_mad_f32 v103, v103, v100, v106 | |
v_mad_f32 v100, -v100, v103, v99 | |
v_mul_f32_e32 v99, v100, v99 | |
v_sub_f32_e32 v100, 2.0, v100 | |
v_cmp_gt_f32_e64 vcc, |v100|, v91 | |
v_cndmask_b32_e32 v103, 1.0, v92, vcc | |
v_mul_f32_e64 v100, v100, -v103 | |
v_rcp_f32_e32 v100, v100 | |
v_mad_f32 v70, v71, v85, -v70 | |
v_mul_f32_e32 v71, v79, v80 | |
v_mul_f32_e32 v71, v85, v71 | |
v_mul_f32_e32 v99, v100, v99 | |
v_mul_f32_e32 v99, v99, v103 | |
v_mad_f32 v90, -v90, v98, v99 | |
v_subrev_f32_e32 v89, v89, v90 | |
v_lshlrev_b32_e32 v90, 23, v96 | |
v_sub_f32_e32 v89, 1.0, v89 | |
v_add_i32_e32 v89, vcc, v89, v90 | |
v_mov_b32_e32 v90, 0xc2aeac4f | |
v_cmp_nlt_f32_e32 vcc, v95, v90 | |
v_mov_b32_e32 v96, 0x42b17218 | |
v_cndmask_b32_e32 v89, 0, v89, vcc | |
v_cmp_lt_f32_e32 vcc, v95, v96 | |
v_mov_b32_e32 v99, 0x7f800000 | |
v_cndmask_b32_e32 v89, v99, v89, vcc | |
v_cmp_u_f32_e32 vcc, v95, v95 | |
v_cndmask_b32_e32 v89, v89, v95, vcc | |
v_subrev_f32_e32 v95, v84, v94 | |
v_mul_f32_e32 v100, v87, v93 | |
v_add_f32_e32 v94, v84, v94 | |
v_mad_f32 v94, v94, v95, v100 | |
v_cmp_gt_f32_e32 vcc, 0, v94 | |
v_cndmask_b32_e64 v95, 0.5, -0.5, vcc | |
v_mac_f32_e32 v95, v97, v94 | |
v_cvt_i32_f32_e32 v95, v95 | |
v_madak_f32_e32 v87, v93, v87, 0x3f58560b | |
v_mul_f32_e32 v70, v70, v71 | |
v_mac_f32_e32 v70, v86, v75 | |
v_cvt_f32_i32_e32 v97, v95 | |
v_lshlrev_b32_e32 v95, 23, v95 | |
v_mad_f32 v81, -v70, v74, v81 | |
v_mad_f32 v15, v74, v70, v15 | |
v_mad_f32 v88, v88, v97, v94 | |
v_mad_f32 v103, v98, v97, v88 | |
v_mul_f32_e32 v107, v103, v103 | |
v_mac_f32_e32 v101, v102, v107 | |
v_mac_f32_e32 v104, v101, v107 | |
v_mac_f32_e32 v105, v104, v107 | |
v_mac_f32_e32 v106, v105, v107 | |
v_mad_f32 v101, -v107, v106, v103 | |
v_mul_f32_e32 v102, v101, v103 | |
v_sub_f32_e32 v101, 2.0, v101 | |
v_cmp_gt_f32_e64 vcc, |v101|, v91 | |
v_cndmask_b32_e32 v103, 1.0, v92, vcc | |
v_mul_f32_e64 v101, v101, -v103 | |
v_rcp_f32_e32 v101, v101 | |
v_mad_f32 v78, -v70, v73, v78 | |
v_mad_f32 v14, v73, v70, v14 | |
v_mad_f32 v76, -v70, v72, v76 | |
v_mul_f32_e32 v101, v101, v102 | |
v_mul_f32_e32 v101, v101, v103 | |
v_mad_f32 v97, -v97, v98, v101 | |
v_subrev_f32_e32 v88, v88, v97 | |
v_sub_f32_e32 v88, 1.0, v88 | |
v_add_i32_e32 v88, vcc, v88, v95 | |
v_cmp_nlt_f32_e32 vcc, v94, v90 | |
v_cndmask_b32_e32 v88, 0, v88, vcc | |
v_cmp_lt_f32_e32 vcc, v94, v96 | |
v_cndmask_b32_e32 v88, v99, v88, vcc | |
v_cmp_u_f32_e32 vcc, v94, v94 | |
v_cndmask_b32_e32 v88, v88, v94, vcc | |
v_cmp_gt_f32_e64 vcc, |v84|, v91 | |
v_mul_f32_e32 v88, v88, v89 | |
v_cndmask_b32_e32 v89, 1.0, v92, vcc | |
v_mul_f32_e32 v90, v89, v84 | |
v_rcp_f32_e32 v90, v90 | |
v_mov_b32_e32 v91, 0x40c00000 | |
v_cmp_gt_f32_e32 vcc, v91, v84 | |
v_mov_b32_e32 v91, 0x31800000 | |
v_cmp_gt_f32_e64 s[10:11], v91, v84 | |
v_mul_f32_e32 v84, v90, v88 | |
v_mad_f32 v84, -v89, v84, 1.0 | |
v_cndmask_b32_e32 v84, 1.0, v84, vcc | |
v_cndmask_b32_e64 v84, v84, v87, s[4:5] | |
v_and_b32_e32 v87, s52, v77 | |
v_or_b32_e32 v84, v87, v84 | |
v_mad_f32 v87, v100, v77, v77 | |
v_cndmask_b32_e64 v84, v84, v87, s[8:9] | |
v_mul_f32_e32 v87, 0x3f8375d4, v77 | |
v_mac_f32_e32 v87, 0x41000000, v77 | |
v_mul_f32_e32 v87, 0x3e000000, v87 | |
v_cndmask_b32_e64 v84, v84, v87, s[10:11] | |
v_cmp_u_f32_e32 vcc, v77, v77 | |
v_cndmask_b32_e32 v77, v84, v77, vcc | |
v_subrev_f32_e32 v77, v77, v82 | |
v_mul_f32_e64 v82, s19, -v82 | |
v_mac_f32_e32 v82, v77, v83 | |
v_mac_f32_e32 v5, v82, v75 | |
v_mad_f32 v13, v72, v70, v13 | |
v_mac_f32_e32 v68, v0, v67 | |
BB7_46: ; %Flow1246 | |
; in Loop: Header=BB7_11 Depth=1 | |
s_or_b64 exec, exec, s[30:31] | |
BB7_47: ; in Loop: Header=BB7_11 Depth=1 | |
s_or_b64 exec, exec, s[12:13] | |
s_mov_b32 m0, -1 | |
v_cmp_gt_i32_e32 vcc, 3, v2 | |
ds_write_b32 v6, v76 | |
ds_write_b32 v7, v78 | |
ds_write_b32 v12, v81 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[4:5], exec, s[4:5] | |
s_waitcnt lgkmcnt(0) | |
; mask branch BB7_53 | |
s_cbranch_execz BB7_53 | |
BB7_48: ; in Loop: Header=BB7_11 Depth=1 | |
v_lshlrev_b32_e32 v70, 6, v2 | |
v_add_i32_e32 v67, vcc, v11, v70 | |
v_lshlrev_b32_e32 v67, 2, v67 | |
v_add_i32_e32 v71, vcc, s15, v67 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v67, v71 | |
v_add_i32_e32 v72, vcc, 8, v11 | |
v_or_b32_e32 v73, 1, v11 | |
v_cmp_lt_i32_e32 vcc, v73, v72 | |
s_and_saveexec_b64 s[8:9], vcc | |
s_xor_b64 s[8:9], exec, s[8:9] | |
s_waitcnt lgkmcnt(0) | |
; mask branch BB7_50 | |
s_cbranch_execz BB7_50 | |
BB7_49: ; in Loop: Header=BB7_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b32 v[72:73], v71 offset0:1 offset1:2 | |
v_or_b32_e32 v76, 3, v11 | |
v_add_i32_e32 v70, vcc, v76, v70 | |
v_lshlrev_b32_e32 v70, 2, v70 | |
ds_read2_b32 v[74:75], v71 offset0:3 offset1:4 | |
v_add_i32_e32 v70, vcc, s15, v70 | |
ds_read_b32 v77, v71 offset:28 | |
ds_read2_b32 v[70:71], v70 offset0:2 offset1:3 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v67, v67, v72 | |
v_add_f32_e32 v67, v73, v67 | |
v_add_f32_e32 v67, v74, v67 | |
v_add_f32_e32 v67, v75, v67 | |
v_add_f32_e32 v67, v70, v67 | |
v_add_f32_e32 v67, v71, v67 | |
v_add_f32_e32 v67, v77, v67 | |
BB7_50: ; %._crit_edge.i118 | |
; in Loop: Header=BB7_11 Depth=1 | |
s_or_b64 exec, exec, s[8:9] | |
v_mul_lo_i32 v66, v66, 3 | |
v_mov_b32_e32 v74, s29 | |
s_mov_b64 s[30:31], s[46:47] | |
s_mov_b64 s[8:9], 0 | |
v_add_i32_e32 v70, vcc, v66, v2 | |
v_ashrrev_i32_e32 v71, 31, v70 | |
v_lshl_b64 v[72:73], v[70:71], 2 | |
v_add_i32_e32 v70, vcc, s28, v72 | |
v_addc_u32_e32 v71, vcc, v73, v74, vcc | |
buffer_load_dword v73, v[72:73], s[28:31], 0 addr64 | |
s_waitcnt vmcnt(0) | |
BB7_51: ; Parent Loop BB7_11 Depth=1 | |
; => This Inner Loop Header: Depth=2 | |
v_add_f32_e32 v72, v67, v73 | |
v_mov_b32_e32 v75, v73 | |
v_mov_b32_e32 v74, v72 | |
buffer_atomic_cmpswap v[74:75], v[70:71], s[44:47], 0 addr64 glc | |
v_mov_b32_e32 v66, -1 | |
v_mov_b32_e32 v66, 0xf000 | |
s_waitcnt vmcnt(0) expcnt(0) | |
v_cmp_eq_u32_e32 vcc, v74, v73 | |
s_or_b64 s[8:9], vcc, s[8:9] | |
v_mov_b32_e32 v73, v74 | |
s_andn2_b64 exec, exec, s[8:9] | |
s_cbranch_execnz BB7_51 | |
; BB#52: ; %Flow1244 | |
; in Loop: Header=BB7_11 Depth=1 | |
s_or_b64 exec, exec, s[8:9] | |
BB7_53: ; %Flow1245 | |
; in Loop: Header=BB7_11 Depth=1 | |
s_or_b64 exec, exec, s[4:5] | |
BB7_54: ; %Flow1254 | |
; in Loop: Header=BB7_11 Depth=1 | |
s_or_b64 exec, exec, s[56:57] | |
v_and_b32_e32 v66, 0xff00, v69 | |
v_cmp_ne_u32_e32 vcc, 0, v66 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[30:31], exec, s[4:5] | |
; mask branch BB7_94 | |
s_cbranch_execz BB7_94 | |
BB7_55: ; in Loop: Header=BB7_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v77, v54 offset:4 | |
s_mov_b64 s[8:9], s[32:33] | |
s_mov_b64 s[10:11], s[46:47] | |
v_mov_b32_e32 v76, 0 | |
v_mov_b32_e32 v82, v76 | |
s_waitcnt lgkmcnt(0) | |
v_lshlrev_b32_e32 v66, 3, v77 | |
v_add_i32_e32 v66, vcc, v66, v1 | |
v_ashrrev_i32_e32 v67, 31, v66 | |
v_lshl_b64 v[70:71], v[66:67], 4 | |
v_lshl_b64 v[78:79], v[66:67], 3 | |
buffer_load_dwordx4 v[72:75], v[70:71], s[8:11], 0 addr64 | |
s_mov_b64 s[8:9], s[36:37] | |
buffer_load_dwordx2 v[70:71], v[78:79], s[8:11], 0 addr64 | |
v_lshrrev_b32_e32 v67, 8, v69 | |
v_and_b32_e32 v67, 1, v67 | |
v_cmp_eq_u32_e32 vcc, 1, v67 | |
v_mov_b32_e32 v79, v76 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[34:35], exec, s[4:5] | |
; implicit-def: %VGPR83_VGPR84_VGPR85_VGPR86 | |
s_waitcnt vmcnt(0) | |
; mask branch BB7_59 | |
s_cbranch_execz BB7_59 | |
BB7_56: ; in Loop: Header=BB7_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[87:90], v55 offset1:1 | |
s_and_b64 s[4:5], exec, s[0:1] | |
v_cmp_ne_u32_e32 vcc, v41, v77 | |
s_xor_b64 s[4:5], s[4:5], -1 | |
s_or_b64 s[4:5], s[4:5], vcc | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v80, v73, v88 | |
s_and_b64 s[8:9], exec, s[2:3] | |
s_or_b64 s[4:5], s[8:9], s[4:5] | |
v_subrev_f32_e32 v81, v72, v87 | |
v_mul_f32_e32 v83, v80, v80 | |
v_cndmask_b32_e64 v76, 0, 1.0, s[4:5] | |
v_subrev_f32_e32 v78, v74, v89 | |
v_mac_f32_e32 v83, v81, v81 | |
v_mac_f32_e32 v83, v78, v78 | |
v_mul_f32_e32 v76, s26, v76 | |
v_cmp_lt_f32_e32 vcc, v83, v76 | |
v_mov_b32_e32 v76, 0 | |
v_subrev_f32_e32 v67, v75, v90 | |
v_mov_b32_e32 v79, v76 | |
v_mov_b32_e32 v82, v76 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[38:39], exec, s[4:5] | |
; implicit-def: %VGPR83_VGPR84_VGPR85_VGPR86 | |
; mask branch BB7_58 | |
s_cbranch_execz BB7_58 | |
BB7_57: ; in Loop: Header=BB7_11 Depth=1 | |
v_max_f32_e32 v79, 0x34cd15ae, v83 | |
v_rsq_f32_e32 v84, v79 | |
v_lshrrev_b32_e32 v82, 8, v65 | |
v_and_b32_e32 v82, 1, v82 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e64 s[4:5], 1, v82 | |
ds_read_b64 v[82:83], v56 | |
v_mul_f32_e32 v86, v84, v84 | |
v_cndmask_b32_e64 v85, 0, 1.0, s[4:5] | |
v_mul_f32_e32 v87, v86, v86 | |
v_mul_f32_e32 v87, v85, v87 | |
v_mul_f32_e32 v88, v86, v87 | |
v_mad_f32 v87, v87, v86, s42 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v82, v70, v82 | |
v_mul_f32_e32 v87, 0xbe2aaaab, v87 | |
v_mul_f32_e32 v83, v71, v83 | |
v_mad_f32 v89, v88, v88, s43 | |
v_cmp_gt_f32_e32 vcc, s27, v79 | |
v_mul_f32_e32 v89, v89, v83 | |
v_mul_f32_e32 v87, v82, v87 | |
v_mac_f32_e32 v87, 0x3daaaaaa, v89 | |
v_cndmask_b32_e64 v89, 0, 1.0, vcc | |
v_mul_f32_e32 v76, v75, v90 | |
v_mul_f32_e32 v90, v85, v89 | |
v_mac_f32_e32 v8, v87, v90 | |
v_mul_f32_e32 v87, v9, v79 | |
v_mul_f32_e32 v90, v87, v87 | |
v_mov_b32_e32 v91, 0x3a92b707 | |
v_madak_f32_e32 v91, v91, v90, 0x3ded3cb2 | |
v_mov_b32_e32 v92, 0x3c739487 | |
v_madak_f32_e32 v92, v92, v90, 0x3f01e2bc | |
v_mad_f32 v91, v91, v90, 1.0 | |
v_mac_f32_e32 v91, v87, v92 | |
v_mov_b32_e32 v92, 0xb2951928 | |
v_madak_f32_e32 v92, v92, v90, 0xb85ffb93 | |
v_mov_b32_e32 v93, 0x35c55945 | |
v_mul_f32_e32 v79, s18, v79 | |
v_madak_f32_e32 v93, v93, v90, 0x3a83ca0c | |
v_madak_f32_e32 v92, v92, v90, 0xbc9ded90 | |
v_mul_f32_e32 v79, v84, v79 | |
v_madak_f32_e32 v93, v93, v90, 0x3d8eaf3b | |
v_madak_f32_e32 v90, v92, v90, 0xbf409397 | |
v_and_b32_e32 v92, s50, v79 | |
v_mov_b32_e32 v94, 0x3fa00000 | |
v_cmp_gt_f32_e64 s[4:5], v94, v92 | |
v_mul_f32_e32 v94, v92, v92 | |
v_rcp_f32_e32 v95, v94 | |
v_add_f32_e32 v96, -1.0, v92 | |
v_mov_b32_e32 v98, 0xbd777f97 | |
v_mov_b32_e32 v99, 0x4036db6e | |
v_cndmask_b32_e64 v95, v95, v96, s[4:5] | |
v_mov_b32_e32 v96, 0x3f580000 | |
v_cmp_gt_f32_e64 s[8:9], v96, v92 | |
v_cndmask_b32_e64 v94, v95, v94, s[8:9] | |
v_mov_b32_e32 v96, 0xc1b38712 | |
v_madak_f32_e32 v98, v98, v94, 0x40d23f7c | |
v_madak_f32_e32 v96, v96, v94, 0x43ed43a7 | |
v_madak_f32_e32 v98, v94, v98, 0x42d9451f | |
v_madak_f32_e32 v96, v94, v96, 0x451f90ce | |
v_madak_f32_e32 v98, v94, v98, 0x43d6810b | |
v_madak_f32_e32 v96, v94, v96, 0x4547fdbb | |
v_madak_f32_e32 v98, v94, v98, 0x442158c9 | |
v_madak_f32_e32 v96, v94, v96, 0x44c01759 | |
v_madak_f32_e32 v98, v94, v98, 0x43d9486f | |
v_madak_f32_e32 v96, v94, v96, 0x43a2e571 | |
v_madak_f32_e32 v98, v94, v98, 0x4309a863 | |
v_mov_b32_e32 v97, 0xc11d077e | |
v_cmp_gt_f32_e32 vcc, v99, v92 | |
v_madak_f32_e32 v96, v94, v96, 0x41f2b459 | |
v_madak_f32_e32 v98, v94, v98, 0x419d35ce | |
v_mov_b32_e32 v95, 0xc3f1c275 | |
v_madak_f32_e32 v97, v97, v94, 0xc2a2932b | |
v_cndmask_b32_e32 v96, v96, v98, vcc | |
v_mov_b32_e32 v98, 0x3c445aa3 | |
v_madak_f32_e32 v95, v95, v94, 0xc480230b | |
v_madak_f32_e32 v97, v94, v97, 0xc3389ae7 | |
v_madak_f32_e32 v98, v98, v94, 0x3c5f6e13 | |
v_madak_f32_e32 v95, v94, v95, 0xc41f6441 | |
v_madak_f32_e32 v97, v94, v97, 0xc322658c | |
v_madak_f32_e32 v98, v94, v98, 0x3e013307 | |
v_madak_f32_e32 v95, v94, v95, 0xc320a2ea | |
v_madak_f32_e32 v97, v94, v97, 0xc2798057 | |
v_madak_f32_e32 v98, v94, v98, 0x3d931ae7 | |
v_madak_f32_e32 v95, v94, v95, 0xc18e104b | |
v_madak_f32_e32 v97, v94, v97, 0xc128f022 | |
v_madak_f32_e32 v98, v94, v98, 0x3f0a5785 | |
v_madak_f32_e32 v95, v94, v95, 0xbf4c9dd4 | |
v_madak_f32_e32 v97, v94, v97, 0xbf31a0b7 | |
v_madak_f32_e32 v98, v94, v98, 0x3dd9f331 | |
v_cndmask_b32_e64 v96, v96, v98, s[4:5] | |
v_mov_b32_e32 v98, 0xb684e21a | |
v_madak_f32_e32 v95, v94, v95, 0xbc21a092 | |
v_madak_f32_e32 v97, v94, v97, 0xbc21a093 | |
v_madak_f32_e32 v98, v98, v94, 0x390aee49 | |
v_cndmask_b32_e32 v95, v95, v97, vcc | |
v_mov_b32_e32 v97, 0xbb0df9c0 | |
v_madak_f32_e32 v97, v97, v94, 0x3d1151b3 | |
v_madak_f32_e32 v98, v94, v98, 0x3ba68116 | |
v_madak_f32_e32 v97, v94, v97, 0xbde31cc2 | |
v_madak_f32_e32 v98, v94, v98, 0x3d852a63 | |
v_madak_f32_e32 v97, v94, v97, 0x3ea2fe54 | |
v_madak_f32_e32 v98, v94, v98, 0x3ecbbbce | |
v_cndmask_b32_e64 v96, v96, v98, s[8:9] | |
v_madak_f32_e32 v97, v94, v97, 0xbebe9208 | |
v_madak_f32_e32 v97, v94, v97, 0x3ed46805 | |
v_mad_f32 v96, v94, v96, 1.0 | |
v_mov_b32_e32 v98, 0x6f800000 | |
v_madak_f32_e32 v97, v94, v97, 0xbb1acdc6 | |
v_cmp_gt_f32_e64 vcc, |v96|, v98 | |
v_mov_b32_e32 v99, 0x2f800000 | |
v_cndmask_b32_e64 v95, v95, v97, s[4:5] | |
v_mov_b32_e32 v97, 0xb7c756b1 | |
v_cndmask_b32_e32 v100, 1.0, v99, vcc | |
v_madak_f32_e32 v97, v97, v94, 0xbbbd1489 | |
v_mul_f32_e32 v96, v100, v96 | |
v_madak_f32_e32 v97, v94, v97, 0xbce9528f | |
v_rcp_f32_e32 v96, v96 | |
v_madak_f32_e32 v97, v94, v97, 0xbea66beb | |
v_madak_f32_e32 v94, v94, v97, 0x3e0375d4 | |
v_cndmask_b32_e64 v94, v95, v94, s[8:9] | |
v_mul_f32_e32 v94, v96, v94 | |
v_and_b32_e32 v95, s51, v79 | |
v_mov_b32_e32 v96, 0xbf100000 | |
v_mad_f32 v96, v95, -v95, v96 | |
v_cmp_gt_f32_e32 vcc, 0, v96 | |
v_cndmask_b32_e64 v97, 0.5, -0.5, vcc | |
v_mov_b32_e32 v101, 0x3fb8aa3b | |
v_mac_f32_e32 v97, v101, v96 | |
v_cvt_i32_f32_e32 v97, v97 | |
v_subrev_f32_e32 v115, v92, v95 | |
v_mul_f32_e32 v103, v94, v100 | |
v_add_f32_e32 v95, v92, v95 | |
v_mad_f32 v95, v95, v115, v103 | |
v_cmp_gt_f32_e64 s[10:11], 0, v95 | |
v_cndmask_b32_e64 v115, 0.5, -0.5, s[10:11] | |
v_cvt_f32_i32_e32 v102, v97 | |
v_mac_f32_e32 v115, v101, v95 | |
v_cvt_i32_f32_e32 v101, v115 | |
v_mov_b32_e32 v104, 0xbf317180 | |
v_mad_f32 v105, v104, v102, v96 | |
v_mov_b32_e32 v106, 0xb717f7d1 | |
v_mad_f32 v107, v106, v102, v105 | |
v_mul_f32_e32 v108, v107, v107 | |
v_mov_b32_e32 v109, 0xb5ddea0e | |
v_mov_b32_e32 v110, 0x3331bb4c | |
v_cvt_f32_i32_e32 v116, v101 | |
v_mad_f32 v111, v110, v108, v109 | |
v_mov_b32_e32 v112, 0x388ab355 | |
v_mad_f32 v111, v111, v108, v112 | |
v_mov_b32_e32 v113, 0xbb360b61 | |
v_mad_f32 v111, v111, v108, v113 | |
v_mov_b32_e32 v114, 0x3e2aaaab | |
v_mad_f32 v111, v111, v108, v114 | |
v_mac_f32_e32 v90, v87, v93 | |
v_mad_f32 v93, v104, v116, v95 | |
v_mad_f32 v108, -v108, v111, v107 | |
v_mad_f32 v104, v106, v116, v93 | |
v_mul_f32_e32 v87, v108, v107 | |
v_mul_f32_e32 v107, v104, v104 | |
v_sub_f32_e32 v111, 2.0, v108 | |
v_mac_f32_e32 v109, v110, v107 | |
v_cmp_gt_f32_e64 vcc, |v111|, v98 | |
v_mac_f32_e32 v112, v109, v107 | |
v_cndmask_b32_e32 v115, 1.0, v99, vcc | |
v_mac_f32_e32 v113, v112, v107 | |
v_mul_f32_e64 v111, v111, -v115 | |
v_mac_f32_e32 v114, v113, v107 | |
v_rcp_f32_e32 v111, v111 | |
v_mad_f32 v107, -v107, v114, v104 | |
v_sub_f32_e32 v108, 2.0, v107 | |
v_rcp_f32_e32 v91, v91 | |
v_cmp_gt_f32_e64 vcc, |v108|, v98 | |
v_cndmask_b32_e32 v109, 1.0, v99, vcc | |
v_mul_f32_e32 v87, v111, v87 | |
v_mul_f32_e64 v108, v108, -v109 | |
v_mul_f32_e32 v87, v87, v115 | |
v_rcp_f32_e32 v108, v108 | |
v_mad_f32 v87, -v102, v106, v87 | |
v_mul_f32_e32 v91, v37, v91 | |
v_subrev_f32_e32 v87, v105, v87 | |
v_mul_f32_e32 v90, v90, v91 | |
v_mul_f32_e32 v91, v85, v86 | |
v_mul_f32_e32 v102, v107, v104 | |
v_mac_f32_e32 v90, v84, v91 | |
v_lshlrev_b32_e32 v91, 23, v97 | |
v_sub_f32_e32 v87, 1.0, v87 | |
v_mul_f32_e32 v102, v108, v102 | |
v_add_i32_e32 v87, vcc, v87, v91 | |
v_mov_b32_e32 v91, 0xc2aeac4f | |
v_mul_f32_e32 v102, v102, v109 | |
v_cmp_nlt_f32_e32 vcc, v96, v91 | |
v_mov_b32_e32 v97, 0x42b17218 | |
v_mad_f32 v102, -v116, v106, v102 | |
v_cndmask_b32_e32 v87, 0, v87, vcc | |
v_cmp_lt_f32_e32 vcc, v96, v97 | |
v_mov_b32_e32 v104, 0x7f800000 | |
v_subrev_f32_e32 v93, v93, v102 | |
v_cndmask_b32_e32 v87, v104, v87, vcc | |
v_cmp_u_f32_e32 vcc, v96, v96 | |
v_cndmask_b32_e32 v87, v87, v96, vcc | |
v_sub_f32_e32 v93, 1.0, v93 | |
v_lshlrev_b32_e32 v96, 23, v101 | |
v_add_i32_e32 v93, vcc, v93, v96 | |
v_cmp_nlt_f32_e32 vcc, v95, v91 | |
v_cndmask_b32_e32 v91, 0, v93, vcc | |
v_cmp_lt_f32_e32 vcc, v95, v97 | |
v_cndmask_b32_e32 v91, v104, v91, vcc | |
v_cmp_u_f32_e32 vcc, v95, v95 | |
v_cndmask_b32_e32 v91, v91, v95, vcc | |
v_mul_f32_e32 v87, v91, v87 | |
v_mov_b32_e32 v91, 0x40c00000 | |
v_cmp_gt_f32_e64 s[10:11], v91, v92 | |
v_mov_b32_e32 v91, 0x31800000 | |
v_cmp_gt_f32_e64 vcc, |v92|, v98 | |
v_cmp_gt_f32_e64 s[12:13], v91, v92 | |
v_cndmask_b32_e32 v91, 1.0, v99, vcc | |
v_mul_f32_e32 v92, v91, v92 | |
v_rcp_f32_e32 v92, v92 | |
v_cmp_u_f32_e32 vcc, v79, v79 | |
v_mac_f32_e32 v51, v0, v67 | |
v_mul_f32_e32 v87, v92, v87 | |
v_mad_f32 v87, -v91, v87, 1.0 | |
v_madak_f32_e32 v91, v100, v94, 0x3f58560b | |
v_cndmask_b32_e64 v87, 1.0, v87, s[10:11] | |
v_cndmask_b32_e64 v87, v87, v91, s[4:5] | |
v_and_b32_e32 v91, s52, v79 | |
v_or_b32_e32 v87, v91, v87 | |
v_mad_f32 v91, v103, v79, v79 | |
v_cndmask_b32_e64 v87, v87, v91, s[8:9] | |
v_mul_f32_e32 v91, 0x3f8375d4, v79 | |
v_mac_f32_e32 v91, 0x41000000, v79 | |
v_mul_f32_e32 v91, 0x3e000000, v91 | |
v_cndmask_b32_e64 v87, v87, v91, s[12:13] | |
v_cndmask_b32_e32 v79, v87, v79, vcc | |
v_subrev_f32_e32 v79, v79, v85 | |
v_mul_f32_e64 v85, s19, -v85 | |
v_mac_f32_e32 v85, v79, v84 | |
v_mad_f32 v79, v83, v88, -v82 | |
v_mul_f32_e32 v82, v89, v86 | |
v_mul_f32_e32 v82, v88, v82 | |
v_mul_f32_e32 v83, v79, v82 | |
v_mac_f32_e32 v83, v90, v76 | |
v_mac_f32_e32 v5, v85, v76 | |
v_mad_f32 v50, v78, v83, v50 | |
v_mad_f32 v49, v80, v83, v49 | |
v_mad_f32 v48, v81, v83, v48 | |
v_mul_f32_e64 v82, v83, -v78 | |
v_mul_f32_e64 v79, v83, -v80 | |
v_mul_f32_e64 v76, v83, -v81 | |
v_mul_f32_e64 v86, v67, -v0 | |
BB7_58: ; %Flow1242 | |
; in Loop: Header=BB7_11 Depth=1 | |
s_or_b64 exec, exec, s[38:39] | |
BB7_59: ; in Loop: Header=BB7_11 Depth=1 | |
s_or_b64 exec, exec, s[34:35] | |
v_lshrrev_b32_e32 v67, 9, v69 | |
v_and_b32_e32 v67, 1, v67 | |
v_cmp_eq_u32_e32 vcc, 1, v67 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[12:13], exec, s[4:5] | |
; mask branch BB7_63 | |
s_cbranch_execz BB7_63 | |
BB7_60: ; in Loop: Header=BB7_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[87:90], v55 offset0:16 offset1:17 | |
s_and_b64 s[4:5], exec, s[0:1] | |
v_cmp_ne_u32_e32 vcc, v64, v77 | |
s_xor_b64 s[4:5], s[4:5], -1 | |
s_or_b64 s[4:5], s[4:5], vcc | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v80, v73, v88 | |
s_and_b64 s[8:9], exec, s[2:3] | |
s_or_b64 s[4:5], s[8:9], s[4:5] | |
v_subrev_f32_e32 v78, v72, v87 | |
v_mul_f32_e32 v83, v80, v80 | |
v_cndmask_b32_e64 v84, 0, 1.0, s[4:5] | |
v_subrev_f32_e32 v81, v74, v89 | |
v_mac_f32_e32 v83, v78, v78 | |
v_mac_f32_e32 v83, v81, v81 | |
v_mul_f32_e32 v84, s26, v84 | |
v_subrev_f32_e32 v67, v75, v90 | |
v_cmp_lt_f32_e32 vcc, v83, v84 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[34:35], exec, s[4:5] | |
; mask branch BB7_62 | |
s_cbranch_execz BB7_62 | |
BB7_61: ; in Loop: Header=BB7_11 Depth=1 | |
v_max_f32_e32 v85, 0x34cd15ae, v83 | |
v_rsq_f32_e32 v89, v85 | |
v_lshrrev_b32_e32 v83, 9, v65 | |
v_and_b32_e32 v83, 1, v83 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e64 s[4:5], 1, v83 | |
ds_read_b64 v[83:84], v56 offset:64 | |
v_mul_f32_e32 v93, v89, v89 | |
v_cndmask_b32_e64 v88, 0, 1.0, s[4:5] | |
v_mul_f32_e32 v94, v93, v93 | |
v_mul_f32_e32 v94, v88, v94 | |
v_mul_f32_e32 v95, v93, v94 | |
v_mad_f32 v94, v94, v93, s42 | |
v_cmp_gt_f32_e32 vcc, s27, v85 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v83, v70, v83 | |
v_mul_f32_e32 v84, v71, v84 | |
v_mad_f32 v97, v95, v95, s43 | |
v_mul_f32_e32 v94, 0xbe2aaaab, v94 | |
v_mad_f32 v96, v84, v95, -v83 | |
v_mul_f32_e32 v87, v75, v90 | |
v_cndmask_b32_e64 v90, 0, 1.0, vcc | |
v_mul_f32_e32 v84, v97, v84 | |
v_mul_f32_e32 v83, v83, v94 | |
v_mac_f32_e32 v83, 0x3daaaaaa, v84 | |
v_mul_f32_e32 v91, v88, v90 | |
v_mul_f32_e32 v92, v9, v85 | |
v_mul_f32_e32 v84, v90, v93 | |
v_mac_f32_e32 v8, v83, v91 | |
v_mul_f32_e32 v83, v92, v92 | |
v_mov_b32_e32 v90, 0x3a92b707 | |
v_madak_f32_e32 v90, v90, v83, 0x3ded3cb2 | |
v_mov_b32_e32 v91, 0x3c739487 | |
v_madak_f32_e32 v91, v91, v83, 0x3f01e2bc | |
v_mad_f32 v90, v90, v83, 1.0 | |
v_mac_f32_e32 v90, v92, v91 | |
v_mov_b32_e32 v91, 0xb2951928 | |
v_rcp_f32_e32 v90, v90 | |
v_madak_f32_e32 v91, v91, v83, 0xb85ffb93 | |
v_mov_b32_e32 v94, 0x35c55945 | |
v_madak_f32_e32 v94, v94, v83, 0x3a83ca0c | |
v_madak_f32_e32 v91, v91, v83, 0xbc9ded90 | |
v_madak_f32_e32 v94, v94, v83, 0x3d8eaf3b | |
v_madak_f32_e32 v83, v91, v83, 0xbf409397 | |
v_mac_f32_e32 v83, v92, v94 | |
v_mul_f32_e32 v90, v37, v90 | |
v_mul_f32_e32 v84, v95, v84 | |
v_mul_f32_e32 v83, v83, v90 | |
v_mul_f32_e32 v90, v88, v93 | |
v_mac_f32_e32 v83, v89, v90 | |
v_mul_f32_e32 v84, v96, v84 | |
v_mac_f32_e32 v84, v83, v87 | |
v_mul_f32_e32 v83, s18, v85 | |
v_mul_f32_e32 v83, v89, v83 | |
v_and_b32_e32 v85, s50, v83 | |
v_mov_b32_e32 v90, 0x3fa00000 | |
v_cmp_gt_f32_e64 s[4:5], v90, v85 | |
v_mul_f32_e32 v90, v85, v85 | |
v_rcp_f32_e32 v91, v90 | |
v_add_f32_e32 v92, -1.0, v85 | |
v_mov_b32_e32 v94, 0xbd777f97 | |
v_mov_b32_e32 v95, 0x4036db6e | |
v_cndmask_b32_e64 v91, v91, v92, s[4:5] | |
v_mov_b32_e32 v92, 0x3f580000 | |
v_cmp_gt_f32_e64 s[8:9], v92, v85 | |
v_cndmask_b32_e64 v90, v91, v90, s[8:9] | |
v_mov_b32_e32 v92, 0xc1b38712 | |
v_madak_f32_e32 v94, v94, v90, 0x40d23f7c | |
v_madak_f32_e32 v92, v92, v90, 0x43ed43a7 | |
v_madak_f32_e32 v94, v90, v94, 0x42d9451f | |
v_madak_f32_e32 v92, v90, v92, 0x451f90ce | |
v_madak_f32_e32 v94, v90, v94, 0x43d6810b | |
v_madak_f32_e32 v92, v90, v92, 0x4547fdbb | |
v_madak_f32_e32 v94, v90, v94, 0x442158c9 | |
v_madak_f32_e32 v92, v90, v92, 0x44c01759 | |
v_madak_f32_e32 v94, v90, v94, 0x43d9486f | |
v_madak_f32_e32 v92, v90, v92, 0x43a2e571 | |
v_madak_f32_e32 v94, v90, v94, 0x4309a863 | |
v_cmp_gt_f32_e32 vcc, v95, v85 | |
v_madak_f32_e32 v92, v90, v92, 0x41f2b459 | |
v_madak_f32_e32 v94, v90, v94, 0x419d35ce | |
v_cndmask_b32_e32 v92, v92, v94, vcc | |
v_mov_b32_e32 v94, 0x3c445aa3 | |
v_madak_f32_e32 v94, v94, v90, 0x3c5f6e13 | |
v_madak_f32_e32 v94, v90, v94, 0x3e013307 | |
v_madak_f32_e32 v94, v90, v94, 0x3d931ae7 | |
v_madak_f32_e32 v94, v90, v94, 0x3f0a5785 | |
v_mov_b32_e32 v93, 0xc11d077e | |
v_madak_f32_e32 v94, v90, v94, 0x3dd9f331 | |
v_mov_b32_e32 v91, 0xc3f1c275 | |
v_madak_f32_e32 v93, v93, v90, 0xc2a2932b | |
v_cndmask_b32_e64 v92, v92, v94, s[4:5] | |
v_mov_b32_e32 v94, 0xb684e21a | |
v_madak_f32_e32 v91, v91, v90, 0xc480230b | |
v_madak_f32_e32 v93, v90, v93, 0xc3389ae7 | |
v_madak_f32_e32 v94, v94, v90, 0x390aee49 | |
v_madak_f32_e32 v91, v90, v91, 0xc41f6441 | |
v_madak_f32_e32 v93, v90, v93, 0xc322658c | |
v_madak_f32_e32 v94, v90, v94, 0x3ba68116 | |
v_madak_f32_e32 v91, v90, v91, 0xc320a2ea | |
v_madak_f32_e32 v93, v90, v93, 0xc2798057 | |
v_madak_f32_e32 v94, v90, v94, 0x3d852a63 | |
v_madak_f32_e32 v91, v90, v91, 0xc18e104b | |
v_madak_f32_e32 v93, v90, v93, 0xc128f022 | |
v_madak_f32_e32 v94, v90, v94, 0x3ecbbbce | |
v_madak_f32_e32 v91, v90, v91, 0xbf4c9dd4 | |
v_madak_f32_e32 v93, v90, v93, 0xbf31a0b7 | |
v_cndmask_b32_e64 v92, v92, v94, s[8:9] | |
v_and_b32_e32 v94, s51, v83 | |
v_mov_b32_e32 v95, 0xbf100000 | |
v_madak_f32_e32 v91, v90, v91, 0xbc21a092 | |
v_madak_f32_e32 v93, v90, v93, 0xbc21a093 | |
v_mad_f32 v95, v94, -v94, v95 | |
v_cndmask_b32_e32 v91, v91, v93, vcc | |
v_cmp_gt_f32_e32 vcc, 0, v95 | |
v_cndmask_b32_e64 v96, 0.5, -0.5, vcc | |
v_mov_b32_e32 v97, 0x3fb8aa3b | |
v_mac_f32_e32 v96, v97, v95 | |
v_cvt_i32_f32_e32 v96, v96 | |
v_mov_b32_e32 v93, 0xbb0df9c0 | |
v_madak_f32_e32 v93, v93, v90, 0x3d1151b3 | |
v_madak_f32_e32 v93, v90, v93, 0xbde31cc2 | |
v_cvt_f32_i32_e32 v100, v96 | |
v_madak_f32_e32 v93, v90, v93, 0x3ea2fe54 | |
v_mov_b32_e32 v101, 0xbf317180 | |
v_madak_f32_e32 v93, v90, v93, 0xbebe9208 | |
v_mad_f32 v102, v101, v100, v95 | |
v_mov_b32_e32 v103, 0xb717f7d1 | |
v_madak_f32_e32 v93, v90, v93, 0x3ed46805 | |
v_mad_f32 v104, v103, v100, v102 | |
v_madak_f32_e32 v93, v90, v93, 0xbb1acdc6 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mov_b32_e32 v106, 0xb5ddea0e | |
v_mov_b32_e32 v107, 0x3331bb4c | |
v_cndmask_b32_e64 v91, v91, v93, s[4:5] | |
v_mov_b32_e32 v93, 0xb7c756b1 | |
v_mad_f32 v108, v107, v105, v106 | |
v_mov_b32_e32 v109, 0x388ab355 | |
v_madak_f32_e32 v93, v93, v90, 0xbbbd1489 | |
v_mad_f32 v108, v108, v105, v109 | |
v_mov_b32_e32 v110, 0xbb360b61 | |
v_madak_f32_e32 v93, v90, v93, 0xbce9528f | |
v_mad_f32 v108, v108, v105, v110 | |
v_mov_b32_e32 v111, 0x3e2aaaab | |
v_madak_f32_e32 v93, v90, v93, 0xbea66beb | |
v_mad_f32 v108, v108, v105, v111 | |
v_madak_f32_e32 v93, v90, v93, 0x3e0375d4 | |
v_mad_f32 v105, -v105, v108, v104 | |
v_mad_f32 v90, v90, v92, 1.0 | |
v_mov_b32_e32 v92, 0x6f800000 | |
v_cmp_gt_f32_e64 vcc, |v90|, v92 | |
v_mov_b32_e32 v98, 0x2f800000 | |
v_sub_f32_e32 v108, 2.0, v105 | |
v_cndmask_b32_e32 v99, 1.0, v98, vcc | |
v_cmp_gt_f32_e64 vcc, |v108|, v92 | |
v_mul_f32_e32 v90, v99, v90 | |
v_cndmask_b32_e32 v112, 1.0, v98, vcc | |
v_mul_f32_e64 v108, v108, -v112 | |
v_rcp_f32_e32 v90, v90 | |
v_rcp_f32_e32 v108, v108 | |
v_cndmask_b32_e64 v91, v91, v93, s[8:9] | |
v_lshlrev_b32_e32 v93, 23, v96 | |
v_mul_f32_e32 v90, v90, v91 | |
v_mul_f32_e32 v91, v105, v104 | |
v_mul_f32_e32 v91, v108, v91 | |
v_mul_f32_e32 v91, v91, v112 | |
v_mad_f32 v91, -v100, v103, v91 | |
v_subrev_f32_e32 v91, v102, v91 | |
v_sub_f32_e32 v91, 1.0, v91 | |
v_add_i32_e32 v91, vcc, v91, v93 | |
v_mov_b32_e32 v93, 0xc2aeac4f | |
v_cmp_nlt_f32_e32 vcc, v95, v93 | |
v_mov_b32_e32 v96, 0x42b17218 | |
v_cndmask_b32_e32 v91, 0, v91, vcc | |
v_cmp_lt_f32_e32 vcc, v95, v96 | |
v_mov_b32_e32 v100, 0x7f800000 | |
v_cndmask_b32_e32 v91, v100, v91, vcc | |
v_cmp_u_f32_e32 vcc, v95, v95 | |
v_cndmask_b32_e32 v91, v91, v95, vcc | |
v_subrev_f32_e32 v95, v85, v94 | |
v_mul_f32_e32 v102, v90, v99 | |
v_add_f32_e32 v94, v85, v94 | |
v_mad_f32 v94, v94, v95, v102 | |
v_cmp_gt_f32_e32 vcc, 0, v94 | |
v_cndmask_b32_e64 v95, 0.5, -0.5, vcc | |
v_mac_f32_e32 v95, v97, v94 | |
v_cvt_i32_f32_e32 v95, v95 | |
v_madak_f32_e32 v90, v99, v90, 0x3f58560b | |
v_mad_f32 v82, -v84, v81, v82 | |
v_mad_f32 v44, v81, v84, v44 | |
v_cvt_f32_i32_e32 v97, v95 | |
v_lshlrev_b32_e32 v95, 23, v95 | |
v_mad_f32 v79, -v84, v80, v79 | |
v_mad_f32 v43, v80, v84, v43 | |
v_mad_f32 v101, v101, v97, v94 | |
v_mad_f32 v104, v103, v97, v101 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mac_f32_e32 v106, v107, v105 | |
v_mac_f32_e32 v109, v106, v105 | |
v_mac_f32_e32 v110, v109, v105 | |
v_mac_f32_e32 v111, v110, v105 | |
v_mad_f32 v105, -v105, v111, v104 | |
v_mul_f32_e32 v104, v105, v104 | |
v_sub_f32_e32 v105, 2.0, v105 | |
v_cmp_gt_f32_e64 vcc, |v105|, v92 | |
v_cndmask_b32_e32 v106, 1.0, v98, vcc | |
v_mul_f32_e64 v105, v105, -v106 | |
v_rcp_f32_e32 v105, v105 | |
v_mad_f32 v76, -v84, v78, v76 | |
v_mad_f32 v42, v78, v84, v42 | |
v_mad_f32 v86, -v67, v0, v86 | |
v_mul_f32_e32 v104, v105, v104 | |
v_mul_f32_e32 v104, v104, v106 | |
v_mad_f32 v97, -v97, v103, v104 | |
v_subrev_f32_e32 v97, v101, v97 | |
v_sub_f32_e32 v97, 1.0, v97 | |
v_add_i32_e32 v95, vcc, v97, v95 | |
v_cmp_nlt_f32_e32 vcc, v94, v93 | |
v_cndmask_b32_e32 v93, 0, v95, vcc | |
v_cmp_lt_f32_e32 vcc, v94, v96 | |
v_cndmask_b32_e32 v93, v100, v93, vcc | |
v_cmp_gt_f32_e64 vcc, |v85|, v92 | |
v_cndmask_b32_e32 v92, 1.0, v98, vcc | |
v_mul_f32_e32 v95, v92, v85 | |
v_rcp_f32_e32 v95, v95 | |
v_cmp_u_f32_e32 vcc, v94, v94 | |
v_cndmask_b32_e32 v93, v93, v94, vcc | |
v_mul_f32_e32 v91, v93, v91 | |
v_mov_b32_e32 v93, 0x40c00000 | |
v_cmp_gt_f32_e32 vcc, v93, v85 | |
v_mov_b32_e32 v93, 0x31800000 | |
v_mul_f32_e32 v91, v95, v91 | |
v_cmp_gt_f32_e64 s[10:11], v93, v85 | |
v_mad_f32 v85, -v92, v91, 1.0 | |
v_cndmask_b32_e32 v85, 1.0, v85, vcc | |
v_cndmask_b32_e64 v85, v85, v90, s[4:5] | |
v_and_b32_e32 v90, s52, v83 | |
v_or_b32_e32 v85, v90, v85 | |
v_mad_f32 v90, v102, v83, v83 | |
v_cndmask_b32_e64 v85, v85, v90, s[8:9] | |
v_mul_f32_e32 v90, 0x3f8375d4, v83 | |
v_mac_f32_e32 v90, 0x41000000, v83 | |
v_mul_f32_e32 v90, 0x3e000000, v90 | |
v_cndmask_b32_e64 v85, v85, v90, s[10:11] | |
v_cmp_u_f32_e32 vcc, v83, v83 | |
v_cndmask_b32_e32 v83, v85, v83, vcc | |
v_subrev_f32_e32 v83, v83, v88 | |
v_mul_f32_e64 v85, s19, -v88 | |
v_mac_f32_e32 v85, v83, v89 | |
v_mac_f32_e32 v5, v85, v87 | |
v_mac_f32_e32 v45, v0, v67 | |
BB7_62: ; %Flow1241 | |
; in Loop: Header=BB7_11 Depth=1 | |
s_or_b64 exec, exec, s[34:35] | |
BB7_63: ; in Loop: Header=BB7_11 Depth=1 | |
s_or_b64 exec, exec, s[12:13] | |
v_lshrrev_b32_e32 v67, 10, v69 | |
v_and_b32_e32 v67, 1, v67 | |
v_cmp_eq_u32_e32 vcc, 1, v67 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[12:13], exec, s[4:5] | |
; mask branch BB7_67 | |
s_cbranch_execz BB7_67 | |
BB7_64: ; in Loop: Header=BB7_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[87:90], v55 offset0:32 offset1:33 | |
s_and_b64 s[4:5], exec, s[0:1] | |
v_cmp_ne_u32_e32 vcc, v63, v77 | |
s_xor_b64 s[4:5], s[4:5], -1 | |
s_or_b64 s[4:5], s[4:5], vcc | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v80, v73, v88 | |
s_and_b64 s[8:9], exec, s[2:3] | |
s_or_b64 s[4:5], s[8:9], s[4:5] | |
v_subrev_f32_e32 v78, v72, v87 | |
v_mul_f32_e32 v83, v80, v80 | |
v_cndmask_b32_e64 v84, 0, 1.0, s[4:5] | |
v_subrev_f32_e32 v81, v74, v89 | |
v_mac_f32_e32 v83, v78, v78 | |
v_mac_f32_e32 v83, v81, v81 | |
v_mul_f32_e32 v84, s26, v84 | |
v_subrev_f32_e32 v67, v75, v90 | |
v_cmp_lt_f32_e32 vcc, v83, v84 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[34:35], exec, s[4:5] | |
; mask branch BB7_66 | |
s_cbranch_execz BB7_66 | |
BB7_65: ; in Loop: Header=BB7_11 Depth=1 | |
v_max_f32_e32 v85, 0x34cd15ae, v83 | |
v_rsq_f32_e32 v89, v85 | |
v_lshrrev_b32_e32 v83, 10, v65 | |
v_and_b32_e32 v83, 1, v83 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e64 s[4:5], 1, v83 | |
ds_read_b64 v[83:84], v56 offset:128 | |
v_mul_f32_e32 v93, v89, v89 | |
v_cndmask_b32_e64 v88, 0, 1.0, s[4:5] | |
v_mul_f32_e32 v94, v93, v93 | |
v_mul_f32_e32 v94, v88, v94 | |
v_mul_f32_e32 v95, v93, v94 | |
v_mad_f32 v94, v94, v93, s42 | |
v_cmp_gt_f32_e32 vcc, s27, v85 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v83, v70, v83 | |
v_mul_f32_e32 v84, v71, v84 | |
v_mad_f32 v97, v95, v95, s43 | |
v_mul_f32_e32 v94, 0xbe2aaaab, v94 | |
v_mad_f32 v96, v84, v95, -v83 | |
v_mul_f32_e32 v87, v75, v90 | |
v_cndmask_b32_e64 v90, 0, 1.0, vcc | |
v_mul_f32_e32 v84, v97, v84 | |
v_mul_f32_e32 v83, v83, v94 | |
v_mac_f32_e32 v83, 0x3daaaaaa, v84 | |
v_mul_f32_e32 v91, v88, v90 | |
v_mul_f32_e32 v92, v9, v85 | |
v_mul_f32_e32 v84, v90, v93 | |
v_mac_f32_e32 v8, v83, v91 | |
v_mul_f32_e32 v83, v92, v92 | |
v_mov_b32_e32 v90, 0x3a92b707 | |
v_madak_f32_e32 v90, v90, v83, 0x3ded3cb2 | |
v_mov_b32_e32 v91, 0x3c739487 | |
v_madak_f32_e32 v91, v91, v83, 0x3f01e2bc | |
v_mad_f32 v90, v90, v83, 1.0 | |
v_mac_f32_e32 v90, v92, v91 | |
v_mov_b32_e32 v91, 0xb2951928 | |
v_rcp_f32_e32 v90, v90 | |
v_madak_f32_e32 v91, v91, v83, 0xb85ffb93 | |
v_mov_b32_e32 v94, 0x35c55945 | |
v_madak_f32_e32 v94, v94, v83, 0x3a83ca0c | |
v_madak_f32_e32 v91, v91, v83, 0xbc9ded90 | |
v_madak_f32_e32 v94, v94, v83, 0x3d8eaf3b | |
v_madak_f32_e32 v83, v91, v83, 0xbf409397 | |
v_mac_f32_e32 v83, v92, v94 | |
v_mul_f32_e32 v90, v37, v90 | |
v_mul_f32_e32 v84, v95, v84 | |
v_mul_f32_e32 v83, v83, v90 | |
v_mul_f32_e32 v90, v88, v93 | |
v_mac_f32_e32 v83, v89, v90 | |
v_mul_f32_e32 v84, v96, v84 | |
v_mac_f32_e32 v84, v83, v87 | |
v_mul_f32_e32 v83, s18, v85 | |
v_mul_f32_e32 v83, v89, v83 | |
v_and_b32_e32 v85, s50, v83 | |
v_mov_b32_e32 v90, 0x3fa00000 | |
v_cmp_gt_f32_e64 s[4:5], v90, v85 | |
v_mul_f32_e32 v90, v85, v85 | |
v_rcp_f32_e32 v91, v90 | |
v_add_f32_e32 v92, -1.0, v85 | |
v_mov_b32_e32 v94, 0xbd777f97 | |
v_mov_b32_e32 v95, 0x4036db6e | |
v_cndmask_b32_e64 v91, v91, v92, s[4:5] | |
v_mov_b32_e32 v92, 0x3f580000 | |
v_cmp_gt_f32_e64 s[8:9], v92, v85 | |
v_cndmask_b32_e64 v90, v91, v90, s[8:9] | |
v_mov_b32_e32 v92, 0xc1b38712 | |
v_madak_f32_e32 v94, v94, v90, 0x40d23f7c | |
v_madak_f32_e32 v92, v92, v90, 0x43ed43a7 | |
v_madak_f32_e32 v94, v90, v94, 0x42d9451f | |
v_madak_f32_e32 v92, v90, v92, 0x451f90ce | |
v_madak_f32_e32 v94, v90, v94, 0x43d6810b | |
v_madak_f32_e32 v92, v90, v92, 0x4547fdbb | |
v_madak_f32_e32 v94, v90, v94, 0x442158c9 | |
v_madak_f32_e32 v92, v90, v92, 0x44c01759 | |
v_madak_f32_e32 v94, v90, v94, 0x43d9486f | |
v_madak_f32_e32 v92, v90, v92, 0x43a2e571 | |
v_madak_f32_e32 v94, v90, v94, 0x4309a863 | |
v_cmp_gt_f32_e32 vcc, v95, v85 | |
v_madak_f32_e32 v92, v90, v92, 0x41f2b459 | |
v_madak_f32_e32 v94, v90, v94, 0x419d35ce | |
v_cndmask_b32_e32 v92, v92, v94, vcc | |
v_mov_b32_e32 v94, 0x3c445aa3 | |
v_madak_f32_e32 v94, v94, v90, 0x3c5f6e13 | |
v_madak_f32_e32 v94, v90, v94, 0x3e013307 | |
v_madak_f32_e32 v94, v90, v94, 0x3d931ae7 | |
v_madak_f32_e32 v94, v90, v94, 0x3f0a5785 | |
v_mov_b32_e32 v93, 0xc11d077e | |
v_madak_f32_e32 v94, v90, v94, 0x3dd9f331 | |
v_mov_b32_e32 v91, 0xc3f1c275 | |
v_madak_f32_e32 v93, v93, v90, 0xc2a2932b | |
v_cndmask_b32_e64 v92, v92, v94, s[4:5] | |
v_mov_b32_e32 v94, 0xb684e21a | |
v_madak_f32_e32 v91, v91, v90, 0xc480230b | |
v_madak_f32_e32 v93, v90, v93, 0xc3389ae7 | |
v_madak_f32_e32 v94, v94, v90, 0x390aee49 | |
v_madak_f32_e32 v91, v90, v91, 0xc41f6441 | |
v_madak_f32_e32 v93, v90, v93, 0xc322658c | |
v_madak_f32_e32 v94, v90, v94, 0x3ba68116 | |
v_madak_f32_e32 v91, v90, v91, 0xc320a2ea | |
v_madak_f32_e32 v93, v90, v93, 0xc2798057 | |
v_madak_f32_e32 v94, v90, v94, 0x3d852a63 | |
v_madak_f32_e32 v91, v90, v91, 0xc18e104b | |
v_madak_f32_e32 v93, v90, v93, 0xc128f022 | |
v_madak_f32_e32 v94, v90, v94, 0x3ecbbbce | |
v_madak_f32_e32 v91, v90, v91, 0xbf4c9dd4 | |
v_madak_f32_e32 v93, v90, v93, 0xbf31a0b7 | |
v_cndmask_b32_e64 v92, v92, v94, s[8:9] | |
v_and_b32_e32 v94, s51, v83 | |
v_mov_b32_e32 v95, 0xbf100000 | |
v_madak_f32_e32 v91, v90, v91, 0xbc21a092 | |
v_madak_f32_e32 v93, v90, v93, 0xbc21a093 | |
v_mad_f32 v95, v94, -v94, v95 | |
v_cndmask_b32_e32 v91, v91, v93, vcc | |
v_cmp_gt_f32_e32 vcc, 0, v95 | |
v_cndmask_b32_e64 v96, 0.5, -0.5, vcc | |
v_mov_b32_e32 v97, 0x3fb8aa3b | |
v_mac_f32_e32 v96, v97, v95 | |
v_cvt_i32_f32_e32 v96, v96 | |
v_mov_b32_e32 v93, 0xbb0df9c0 | |
v_madak_f32_e32 v93, v93, v90, 0x3d1151b3 | |
v_madak_f32_e32 v93, v90, v93, 0xbde31cc2 | |
v_cvt_f32_i32_e32 v100, v96 | |
v_madak_f32_e32 v93, v90, v93, 0x3ea2fe54 | |
v_mov_b32_e32 v101, 0xbf317180 | |
v_madak_f32_e32 v93, v90, v93, 0xbebe9208 | |
v_mad_f32 v102, v101, v100, v95 | |
v_mov_b32_e32 v103, 0xb717f7d1 | |
v_madak_f32_e32 v93, v90, v93, 0x3ed46805 | |
v_mad_f32 v104, v103, v100, v102 | |
v_madak_f32_e32 v93, v90, v93, 0xbb1acdc6 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mov_b32_e32 v106, 0xb5ddea0e | |
v_mov_b32_e32 v107, 0x3331bb4c | |
v_cndmask_b32_e64 v91, v91, v93, s[4:5] | |
v_mov_b32_e32 v93, 0xb7c756b1 | |
v_mad_f32 v108, v107, v105, v106 | |
v_mov_b32_e32 v109, 0x388ab355 | |
v_madak_f32_e32 v93, v93, v90, 0xbbbd1489 | |
v_mad_f32 v108, v108, v105, v109 | |
v_mov_b32_e32 v110, 0xbb360b61 | |
v_madak_f32_e32 v93, v90, v93, 0xbce9528f | |
v_mad_f32 v108, v108, v105, v110 | |
v_mov_b32_e32 v111, 0x3e2aaaab | |
v_madak_f32_e32 v93, v90, v93, 0xbea66beb | |
v_mad_f32 v108, v108, v105, v111 | |
v_madak_f32_e32 v93, v90, v93, 0x3e0375d4 | |
v_mad_f32 v105, -v105, v108, v104 | |
v_mad_f32 v90, v90, v92, 1.0 | |
v_mov_b32_e32 v92, 0x6f800000 | |
v_cmp_gt_f32_e64 vcc, |v90|, v92 | |
v_mov_b32_e32 v98, 0x2f800000 | |
v_sub_f32_e32 v108, 2.0, v105 | |
v_cndmask_b32_e32 v99, 1.0, v98, vcc | |
v_cmp_gt_f32_e64 vcc, |v108|, v92 | |
v_mul_f32_e32 v90, v99, v90 | |
v_cndmask_b32_e32 v112, 1.0, v98, vcc | |
v_mul_f32_e64 v108, v108, -v112 | |
v_rcp_f32_e32 v90, v90 | |
v_rcp_f32_e32 v108, v108 | |
v_cndmask_b32_e64 v91, v91, v93, s[8:9] | |
v_lshlrev_b32_e32 v93, 23, v96 | |
v_mul_f32_e32 v90, v90, v91 | |
v_mul_f32_e32 v91, v105, v104 | |
v_mul_f32_e32 v91, v108, v91 | |
v_mul_f32_e32 v91, v91, v112 | |
v_mad_f32 v91, -v100, v103, v91 | |
v_subrev_f32_e32 v91, v102, v91 | |
v_sub_f32_e32 v91, 1.0, v91 | |
v_add_i32_e32 v91, vcc, v91, v93 | |
v_mov_b32_e32 v93, 0xc2aeac4f | |
v_cmp_nlt_f32_e32 vcc, v95, v93 | |
v_mov_b32_e32 v96, 0x42b17218 | |
v_cndmask_b32_e32 v91, 0, v91, vcc | |
v_cmp_lt_f32_e32 vcc, v95, v96 | |
v_mov_b32_e32 v100, 0x7f800000 | |
v_cndmask_b32_e32 v91, v100, v91, vcc | |
v_cmp_u_f32_e32 vcc, v95, v95 | |
v_cndmask_b32_e32 v91, v91, v95, vcc | |
v_subrev_f32_e32 v95, v85, v94 | |
v_mul_f32_e32 v102, v90, v99 | |
v_add_f32_e32 v94, v85, v94 | |
v_mad_f32 v94, v94, v95, v102 | |
v_cmp_gt_f32_e32 vcc, 0, v94 | |
v_cndmask_b32_e64 v95, 0.5, -0.5, vcc | |
v_mac_f32_e32 v95, v97, v94 | |
v_cvt_i32_f32_e32 v95, v95 | |
v_madak_f32_e32 v90, v99, v90, 0x3f58560b | |
v_mad_f32 v82, -v84, v81, v82 | |
v_mad_f32 v34, v81, v84, v34 | |
v_cvt_f32_i32_e32 v97, v95 | |
v_lshlrev_b32_e32 v95, 23, v95 | |
v_mad_f32 v79, -v84, v80, v79 | |
v_mad_f32 v33, v80, v84, v33 | |
v_mad_f32 v101, v101, v97, v94 | |
v_mad_f32 v104, v103, v97, v101 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mac_f32_e32 v106, v107, v105 | |
v_mac_f32_e32 v109, v106, v105 | |
v_mac_f32_e32 v110, v109, v105 | |
v_mac_f32_e32 v111, v110, v105 | |
v_mad_f32 v105, -v105, v111, v104 | |
v_mul_f32_e32 v104, v105, v104 | |
v_sub_f32_e32 v105, 2.0, v105 | |
v_cmp_gt_f32_e64 vcc, |v105|, v92 | |
v_cndmask_b32_e32 v106, 1.0, v98, vcc | |
v_mul_f32_e64 v105, v105, -v106 | |
v_rcp_f32_e32 v105, v105 | |
v_mad_f32 v76, -v84, v78, v76 | |
v_mad_f32 v32, v78, v84, v32 | |
v_mad_f32 v86, -v67, v0, v86 | |
v_mul_f32_e32 v104, v105, v104 | |
v_mul_f32_e32 v104, v104, v106 | |
v_mad_f32 v97, -v97, v103, v104 | |
v_subrev_f32_e32 v97, v101, v97 | |
v_sub_f32_e32 v97, 1.0, v97 | |
v_add_i32_e32 v95, vcc, v97, v95 | |
v_cmp_nlt_f32_e32 vcc, v94, v93 | |
v_cndmask_b32_e32 v93, 0, v95, vcc | |
v_cmp_lt_f32_e32 vcc, v94, v96 | |
v_cndmask_b32_e32 v93, v100, v93, vcc | |
v_cmp_gt_f32_e64 vcc, |v85|, v92 | |
v_cndmask_b32_e32 v92, 1.0, v98, vcc | |
v_mul_f32_e32 v95, v92, v85 | |
v_rcp_f32_e32 v95, v95 | |
v_cmp_u_f32_e32 vcc, v94, v94 | |
v_cndmask_b32_e32 v93, v93, v94, vcc | |
v_mul_f32_e32 v91, v93, v91 | |
v_mov_b32_e32 v93, 0x40c00000 | |
v_cmp_gt_f32_e32 vcc, v93, v85 | |
v_mov_b32_e32 v93, 0x31800000 | |
v_mul_f32_e32 v91, v95, v91 | |
v_cmp_gt_f32_e64 s[10:11], v93, v85 | |
v_mad_f32 v85, -v92, v91, 1.0 | |
v_cndmask_b32_e32 v85, 1.0, v85, vcc | |
v_cndmask_b32_e64 v85, v85, v90, s[4:5] | |
v_and_b32_e32 v90, s52, v83 | |
v_or_b32_e32 v85, v90, v85 | |
v_mad_f32 v90, v102, v83, v83 | |
v_cndmask_b32_e64 v85, v85, v90, s[8:9] | |
v_mul_f32_e32 v90, 0x3f8375d4, v83 | |
v_mac_f32_e32 v90, 0x41000000, v83 | |
v_mul_f32_e32 v90, 0x3e000000, v90 | |
v_cndmask_b32_e64 v85, v85, v90, s[10:11] | |
v_cmp_u_f32_e32 vcc, v83, v83 | |
v_cndmask_b32_e32 v83, v85, v83, vcc | |
v_subrev_f32_e32 v83, v83, v88 | |
v_mul_f32_e64 v85, s19, -v88 | |
v_mac_f32_e32 v85, v83, v89 | |
v_mac_f32_e32 v5, v85, v87 | |
v_mac_f32_e32 v35, v0, v67 | |
BB7_66: ; %Flow1240 | |
; in Loop: Header=BB7_11 Depth=1 | |
s_or_b64 exec, exec, s[34:35] | |
BB7_67: ; in Loop: Header=BB7_11 Depth=1 | |
s_or_b64 exec, exec, s[12:13] | |
v_lshrrev_b32_e32 v67, 11, v69 | |
v_and_b32_e32 v67, 1, v67 | |
v_cmp_eq_u32_e32 vcc, 1, v67 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[12:13], exec, s[4:5] | |
; mask branch BB7_71 | |
s_cbranch_execz BB7_71 | |
BB7_68: ; in Loop: Header=BB7_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[87:90], v55 offset0:48 offset1:49 | |
s_and_b64 s[4:5], exec, s[0:1] | |
v_cmp_ne_u32_e32 vcc, v62, v77 | |
s_xor_b64 s[4:5], s[4:5], -1 | |
s_or_b64 s[4:5], s[4:5], vcc | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v80, v73, v88 | |
s_and_b64 s[8:9], exec, s[2:3] | |
s_or_b64 s[4:5], s[8:9], s[4:5] | |
v_subrev_f32_e32 v78, v72, v87 | |
v_mul_f32_e32 v83, v80, v80 | |
v_cndmask_b32_e64 v84, 0, 1.0, s[4:5] | |
v_subrev_f32_e32 v81, v74, v89 | |
v_mac_f32_e32 v83, v78, v78 | |
v_mac_f32_e32 v83, v81, v81 | |
v_mul_f32_e32 v84, s26, v84 | |
v_subrev_f32_e32 v67, v75, v90 | |
v_cmp_lt_f32_e32 vcc, v83, v84 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[34:35], exec, s[4:5] | |
; mask branch BB7_70 | |
s_cbranch_execz BB7_70 | |
BB7_69: ; in Loop: Header=BB7_11 Depth=1 | |
v_max_f32_e32 v85, 0x34cd15ae, v83 | |
v_rsq_f32_e32 v89, v85 | |
v_lshrrev_b32_e32 v83, 11, v65 | |
v_and_b32_e32 v83, 1, v83 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e64 s[4:5], 1, v83 | |
ds_read_b64 v[83:84], v56 offset:192 | |
v_mul_f32_e32 v93, v89, v89 | |
v_cndmask_b32_e64 v88, 0, 1.0, s[4:5] | |
v_mul_f32_e32 v94, v93, v93 | |
v_mul_f32_e32 v94, v88, v94 | |
v_mul_f32_e32 v95, v93, v94 | |
v_mad_f32 v94, v94, v93, s42 | |
v_cmp_gt_f32_e32 vcc, s27, v85 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v83, v70, v83 | |
v_mul_f32_e32 v84, v71, v84 | |
v_mad_f32 v97, v95, v95, s43 | |
v_mul_f32_e32 v94, 0xbe2aaaab, v94 | |
v_mad_f32 v96, v84, v95, -v83 | |
v_mul_f32_e32 v87, v75, v90 | |
v_cndmask_b32_e64 v90, 0, 1.0, vcc | |
v_mul_f32_e32 v84, v97, v84 | |
v_mul_f32_e32 v83, v83, v94 | |
v_mac_f32_e32 v83, 0x3daaaaaa, v84 | |
v_mul_f32_e32 v91, v88, v90 | |
v_mul_f32_e32 v92, v9, v85 | |
v_mul_f32_e32 v84, v90, v93 | |
v_mac_f32_e32 v8, v83, v91 | |
v_mul_f32_e32 v83, v92, v92 | |
v_mov_b32_e32 v90, 0x3a92b707 | |
v_madak_f32_e32 v90, v90, v83, 0x3ded3cb2 | |
v_mov_b32_e32 v91, 0x3c739487 | |
v_madak_f32_e32 v91, v91, v83, 0x3f01e2bc | |
v_mad_f32 v90, v90, v83, 1.0 | |
v_mac_f32_e32 v90, v92, v91 | |
v_mov_b32_e32 v91, 0xb2951928 | |
v_rcp_f32_e32 v90, v90 | |
v_madak_f32_e32 v91, v91, v83, 0xb85ffb93 | |
v_mov_b32_e32 v94, 0x35c55945 | |
v_madak_f32_e32 v94, v94, v83, 0x3a83ca0c | |
v_madak_f32_e32 v91, v91, v83, 0xbc9ded90 | |
v_madak_f32_e32 v94, v94, v83, 0x3d8eaf3b | |
v_madak_f32_e32 v83, v91, v83, 0xbf409397 | |
v_mac_f32_e32 v83, v92, v94 | |
v_mul_f32_e32 v90, v37, v90 | |
v_mul_f32_e32 v84, v95, v84 | |
v_mul_f32_e32 v83, v83, v90 | |
v_mul_f32_e32 v90, v88, v93 | |
v_mac_f32_e32 v83, v89, v90 | |
v_mul_f32_e32 v84, v96, v84 | |
v_mac_f32_e32 v84, v83, v87 | |
v_mul_f32_e32 v83, s18, v85 | |
v_mul_f32_e32 v83, v89, v83 | |
v_and_b32_e32 v85, s50, v83 | |
v_mov_b32_e32 v90, 0x3fa00000 | |
v_cmp_gt_f32_e64 s[4:5], v90, v85 | |
v_mul_f32_e32 v90, v85, v85 | |
v_rcp_f32_e32 v91, v90 | |
v_add_f32_e32 v92, -1.0, v85 | |
v_mov_b32_e32 v94, 0xbd777f97 | |
v_mov_b32_e32 v95, 0x4036db6e | |
v_cndmask_b32_e64 v91, v91, v92, s[4:5] | |
v_mov_b32_e32 v92, 0x3f580000 | |
v_cmp_gt_f32_e64 s[8:9], v92, v85 | |
v_cndmask_b32_e64 v90, v91, v90, s[8:9] | |
v_mov_b32_e32 v92, 0xc1b38712 | |
v_madak_f32_e32 v94, v94, v90, 0x40d23f7c | |
v_madak_f32_e32 v92, v92, v90, 0x43ed43a7 | |
v_madak_f32_e32 v94, v90, v94, 0x42d9451f | |
v_madak_f32_e32 v92, v90, v92, 0x451f90ce | |
v_madak_f32_e32 v94, v90, v94, 0x43d6810b | |
v_madak_f32_e32 v92, v90, v92, 0x4547fdbb | |
v_madak_f32_e32 v94, v90, v94, 0x442158c9 | |
v_madak_f32_e32 v92, v90, v92, 0x44c01759 | |
v_madak_f32_e32 v94, v90, v94, 0x43d9486f | |
v_madak_f32_e32 v92, v90, v92, 0x43a2e571 | |
v_madak_f32_e32 v94, v90, v94, 0x4309a863 | |
v_cmp_gt_f32_e32 vcc, v95, v85 | |
v_madak_f32_e32 v92, v90, v92, 0x41f2b459 | |
v_madak_f32_e32 v94, v90, v94, 0x419d35ce | |
v_cndmask_b32_e32 v92, v92, v94, vcc | |
v_mov_b32_e32 v94, 0x3c445aa3 | |
v_madak_f32_e32 v94, v94, v90, 0x3c5f6e13 | |
v_madak_f32_e32 v94, v90, v94, 0x3e013307 | |
v_madak_f32_e32 v94, v90, v94, 0x3d931ae7 | |
v_madak_f32_e32 v94, v90, v94, 0x3f0a5785 | |
v_mov_b32_e32 v93, 0xc11d077e | |
v_madak_f32_e32 v94, v90, v94, 0x3dd9f331 | |
v_mov_b32_e32 v91, 0xc3f1c275 | |
v_madak_f32_e32 v93, v93, v90, 0xc2a2932b | |
v_cndmask_b32_e64 v92, v92, v94, s[4:5] | |
v_mov_b32_e32 v94, 0xb684e21a | |
v_madak_f32_e32 v91, v91, v90, 0xc480230b | |
v_madak_f32_e32 v93, v90, v93, 0xc3389ae7 | |
v_madak_f32_e32 v94, v94, v90, 0x390aee49 | |
v_madak_f32_e32 v91, v90, v91, 0xc41f6441 | |
v_madak_f32_e32 v93, v90, v93, 0xc322658c | |
v_madak_f32_e32 v94, v90, v94, 0x3ba68116 | |
v_madak_f32_e32 v91, v90, v91, 0xc320a2ea | |
v_madak_f32_e32 v93, v90, v93, 0xc2798057 | |
v_madak_f32_e32 v94, v90, v94, 0x3d852a63 | |
v_madak_f32_e32 v91, v90, v91, 0xc18e104b | |
v_madak_f32_e32 v93, v90, v93, 0xc128f022 | |
v_madak_f32_e32 v94, v90, v94, 0x3ecbbbce | |
v_madak_f32_e32 v91, v90, v91, 0xbf4c9dd4 | |
v_madak_f32_e32 v93, v90, v93, 0xbf31a0b7 | |
v_cndmask_b32_e64 v92, v92, v94, s[8:9] | |
v_and_b32_e32 v94, s51, v83 | |
v_mov_b32_e32 v95, 0xbf100000 | |
v_madak_f32_e32 v91, v90, v91, 0xbc21a092 | |
v_madak_f32_e32 v93, v90, v93, 0xbc21a093 | |
v_mad_f32 v95, v94, -v94, v95 | |
v_cndmask_b32_e32 v91, v91, v93, vcc | |
v_cmp_gt_f32_e32 vcc, 0, v95 | |
v_cndmask_b32_e64 v96, 0.5, -0.5, vcc | |
v_mov_b32_e32 v97, 0x3fb8aa3b | |
v_mac_f32_e32 v96, v97, v95 | |
v_cvt_i32_f32_e32 v96, v96 | |
v_mov_b32_e32 v93, 0xbb0df9c0 | |
v_madak_f32_e32 v93, v93, v90, 0x3d1151b3 | |
v_madak_f32_e32 v93, v90, v93, 0xbde31cc2 | |
v_cvt_f32_i32_e32 v100, v96 | |
v_madak_f32_e32 v93, v90, v93, 0x3ea2fe54 | |
v_mov_b32_e32 v101, 0xbf317180 | |
v_madak_f32_e32 v93, v90, v93, 0xbebe9208 | |
v_mad_f32 v102, v101, v100, v95 | |
v_mov_b32_e32 v103, 0xb717f7d1 | |
v_madak_f32_e32 v93, v90, v93, 0x3ed46805 | |
v_mad_f32 v104, v103, v100, v102 | |
v_madak_f32_e32 v93, v90, v93, 0xbb1acdc6 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mov_b32_e32 v106, 0xb5ddea0e | |
v_mov_b32_e32 v107, 0x3331bb4c | |
v_cndmask_b32_e64 v91, v91, v93, s[4:5] | |
v_mov_b32_e32 v93, 0xb7c756b1 | |
v_mad_f32 v108, v107, v105, v106 | |
v_mov_b32_e32 v109, 0x388ab355 | |
v_madak_f32_e32 v93, v93, v90, 0xbbbd1489 | |
v_mad_f32 v108, v108, v105, v109 | |
v_mov_b32_e32 v110, 0xbb360b61 | |
v_madak_f32_e32 v93, v90, v93, 0xbce9528f | |
v_mad_f32 v108, v108, v105, v110 | |
v_mov_b32_e32 v111, 0x3e2aaaab | |
v_madak_f32_e32 v93, v90, v93, 0xbea66beb | |
v_mad_f32 v108, v108, v105, v111 | |
v_madak_f32_e32 v93, v90, v93, 0x3e0375d4 | |
v_mad_f32 v105, -v105, v108, v104 | |
v_mad_f32 v90, v90, v92, 1.0 | |
v_mov_b32_e32 v92, 0x6f800000 | |
v_cmp_gt_f32_e64 vcc, |v90|, v92 | |
v_mov_b32_e32 v98, 0x2f800000 | |
v_sub_f32_e32 v108, 2.0, v105 | |
v_cndmask_b32_e32 v99, 1.0, v98, vcc | |
v_cmp_gt_f32_e64 vcc, |v108|, v92 | |
v_mul_f32_e32 v90, v99, v90 | |
v_cndmask_b32_e32 v112, 1.0, v98, vcc | |
v_mul_f32_e64 v108, v108, -v112 | |
v_rcp_f32_e32 v90, v90 | |
v_rcp_f32_e32 v108, v108 | |
v_cndmask_b32_e64 v91, v91, v93, s[8:9] | |
v_lshlrev_b32_e32 v93, 23, v96 | |
v_mul_f32_e32 v90, v90, v91 | |
v_mul_f32_e32 v91, v105, v104 | |
v_mul_f32_e32 v91, v108, v91 | |
v_mul_f32_e32 v91, v91, v112 | |
v_mad_f32 v91, -v100, v103, v91 | |
v_subrev_f32_e32 v91, v102, v91 | |
v_sub_f32_e32 v91, 1.0, v91 | |
v_add_i32_e32 v91, vcc, v91, v93 | |
v_mov_b32_e32 v93, 0xc2aeac4f | |
v_cmp_nlt_f32_e32 vcc, v95, v93 | |
v_mov_b32_e32 v96, 0x42b17218 | |
v_cndmask_b32_e32 v91, 0, v91, vcc | |
v_cmp_lt_f32_e32 vcc, v95, v96 | |
v_mov_b32_e32 v100, 0x7f800000 | |
v_cndmask_b32_e32 v91, v100, v91, vcc | |
v_cmp_u_f32_e32 vcc, v95, v95 | |
v_cndmask_b32_e32 v91, v91, v95, vcc | |
v_subrev_f32_e32 v95, v85, v94 | |
v_mul_f32_e32 v102, v90, v99 | |
v_add_f32_e32 v94, v85, v94 | |
v_mad_f32 v94, v94, v95, v102 | |
v_cmp_gt_f32_e32 vcc, 0, v94 | |
v_cndmask_b32_e64 v95, 0.5, -0.5, vcc | |
v_mac_f32_e32 v95, v97, v94 | |
v_cvt_i32_f32_e32 v95, v95 | |
v_madak_f32_e32 v90, v99, v90, 0x3f58560b | |
v_mad_f32 v82, -v84, v81, v82 | |
v_mad_f32 v30, v81, v84, v30 | |
v_cvt_f32_i32_e32 v97, v95 | |
v_lshlrev_b32_e32 v95, 23, v95 | |
v_mad_f32 v79, -v84, v80, v79 | |
v_mad_f32 v29, v80, v84, v29 | |
v_mad_f32 v101, v101, v97, v94 | |
v_mad_f32 v104, v103, v97, v101 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mac_f32_e32 v106, v107, v105 | |
v_mac_f32_e32 v109, v106, v105 | |
v_mac_f32_e32 v110, v109, v105 | |
v_mac_f32_e32 v111, v110, v105 | |
v_mad_f32 v105, -v105, v111, v104 | |
v_mul_f32_e32 v104, v105, v104 | |
v_sub_f32_e32 v105, 2.0, v105 | |
v_cmp_gt_f32_e64 vcc, |v105|, v92 | |
v_cndmask_b32_e32 v106, 1.0, v98, vcc | |
v_mul_f32_e64 v105, v105, -v106 | |
v_rcp_f32_e32 v105, v105 | |
v_mad_f32 v76, -v84, v78, v76 | |
v_mad_f32 v28, v78, v84, v28 | |
v_mad_f32 v86, -v67, v0, v86 | |
v_mul_f32_e32 v104, v105, v104 | |
v_mul_f32_e32 v104, v104, v106 | |
v_mad_f32 v97, -v97, v103, v104 | |
v_subrev_f32_e32 v97, v101, v97 | |
v_sub_f32_e32 v97, 1.0, v97 | |
v_add_i32_e32 v95, vcc, v97, v95 | |
v_cmp_nlt_f32_e32 vcc, v94, v93 | |
v_cndmask_b32_e32 v93, 0, v95, vcc | |
v_cmp_lt_f32_e32 vcc, v94, v96 | |
v_cndmask_b32_e32 v93, v100, v93, vcc | |
v_cmp_gt_f32_e64 vcc, |v85|, v92 | |
v_cndmask_b32_e32 v92, 1.0, v98, vcc | |
v_mul_f32_e32 v95, v92, v85 | |
v_rcp_f32_e32 v95, v95 | |
v_cmp_u_f32_e32 vcc, v94, v94 | |
v_cndmask_b32_e32 v93, v93, v94, vcc | |
v_mul_f32_e32 v91, v93, v91 | |
v_mov_b32_e32 v93, 0x40c00000 | |
v_cmp_gt_f32_e32 vcc, v93, v85 | |
v_mov_b32_e32 v93, 0x31800000 | |
v_mul_f32_e32 v91, v95, v91 | |
v_cmp_gt_f32_e64 s[10:11], v93, v85 | |
v_mad_f32 v85, -v92, v91, 1.0 | |
v_cndmask_b32_e32 v85, 1.0, v85, vcc | |
v_cndmask_b32_e64 v85, v85, v90, s[4:5] | |
v_and_b32_e32 v90, s52, v83 | |
v_or_b32_e32 v85, v90, v85 | |
v_mad_f32 v90, v102, v83, v83 | |
v_cndmask_b32_e64 v85, v85, v90, s[8:9] | |
v_mul_f32_e32 v90, 0x3f8375d4, v83 | |
v_mac_f32_e32 v90, 0x41000000, v83 | |
v_mul_f32_e32 v90, 0x3e000000, v90 | |
v_cndmask_b32_e64 v85, v85, v90, s[10:11] | |
v_cmp_u_f32_e32 vcc, v83, v83 | |
v_cndmask_b32_e32 v83, v85, v83, vcc | |
v_subrev_f32_e32 v83, v83, v88 | |
v_mul_f32_e64 v85, s19, -v88 | |
v_mac_f32_e32 v85, v83, v89 | |
v_mac_f32_e32 v5, v85, v87 | |
v_mac_f32_e32 v31, v0, v67 | |
BB7_70: ; %Flow1239 | |
; in Loop: Header=BB7_11 Depth=1 | |
s_or_b64 exec, exec, s[34:35] | |
BB7_71: ; in Loop: Header=BB7_11 Depth=1 | |
s_or_b64 exec, exec, s[12:13] | |
v_lshrrev_b32_e32 v67, 12, v69 | |
v_and_b32_e32 v67, 1, v67 | |
v_cmp_eq_u32_e32 vcc, 1, v67 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[12:13], exec, s[4:5] | |
; mask branch BB7_75 | |
s_cbranch_execz BB7_75 | |
BB7_72: ; in Loop: Header=BB7_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[87:90], v55 offset0:64 offset1:65 | |
s_and_b64 s[4:5], exec, s[0:1] | |
v_cmp_ne_u32_e32 vcc, v61, v77 | |
s_xor_b64 s[4:5], s[4:5], -1 | |
s_or_b64 s[4:5], s[4:5], vcc | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v80, v73, v88 | |
s_and_b64 s[8:9], exec, s[2:3] | |
s_or_b64 s[4:5], s[8:9], s[4:5] | |
v_subrev_f32_e32 v78, v72, v87 | |
v_mul_f32_e32 v83, v80, v80 | |
v_cndmask_b32_e64 v84, 0, 1.0, s[4:5] | |
v_subrev_f32_e32 v81, v74, v89 | |
v_mac_f32_e32 v83, v78, v78 | |
v_mac_f32_e32 v83, v81, v81 | |
v_mul_f32_e32 v84, s26, v84 | |
v_subrev_f32_e32 v67, v75, v90 | |
v_cmp_lt_f32_e32 vcc, v83, v84 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[34:35], exec, s[4:5] | |
; mask branch BB7_74 | |
s_cbranch_execz BB7_74 | |
BB7_73: ; in Loop: Header=BB7_11 Depth=1 | |
v_max_f32_e32 v85, 0x34cd15ae, v83 | |
v_rsq_f32_e32 v89, v85 | |
v_lshrrev_b32_e32 v83, 12, v65 | |
v_and_b32_e32 v83, 1, v83 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e64 s[4:5], 1, v83 | |
ds_read_b64 v[83:84], v56 offset:256 | |
v_mul_f32_e32 v93, v89, v89 | |
v_cndmask_b32_e64 v88, 0, 1.0, s[4:5] | |
v_mul_f32_e32 v94, v93, v93 | |
v_mul_f32_e32 v94, v88, v94 | |
v_mul_f32_e32 v95, v93, v94 | |
v_mad_f32 v94, v94, v93, s42 | |
v_cmp_gt_f32_e32 vcc, s27, v85 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v83, v70, v83 | |
v_mul_f32_e32 v84, v71, v84 | |
v_mad_f32 v97, v95, v95, s43 | |
v_mul_f32_e32 v94, 0xbe2aaaab, v94 | |
v_mad_f32 v96, v84, v95, -v83 | |
v_mul_f32_e32 v87, v75, v90 | |
v_cndmask_b32_e64 v90, 0, 1.0, vcc | |
v_mul_f32_e32 v84, v97, v84 | |
v_mul_f32_e32 v83, v83, v94 | |
v_mac_f32_e32 v83, 0x3daaaaaa, v84 | |
v_mul_f32_e32 v91, v88, v90 | |
v_mul_f32_e32 v92, v9, v85 | |
v_mul_f32_e32 v84, v90, v93 | |
v_mac_f32_e32 v8, v83, v91 | |
v_mul_f32_e32 v83, v92, v92 | |
v_mov_b32_e32 v90, 0x3a92b707 | |
v_madak_f32_e32 v90, v90, v83, 0x3ded3cb2 | |
v_mov_b32_e32 v91, 0x3c739487 | |
v_madak_f32_e32 v91, v91, v83, 0x3f01e2bc | |
v_mad_f32 v90, v90, v83, 1.0 | |
v_mac_f32_e32 v90, v92, v91 | |
v_mov_b32_e32 v91, 0xb2951928 | |
v_rcp_f32_e32 v90, v90 | |
v_madak_f32_e32 v91, v91, v83, 0xb85ffb93 | |
v_mov_b32_e32 v94, 0x35c55945 | |
v_madak_f32_e32 v94, v94, v83, 0x3a83ca0c | |
v_madak_f32_e32 v91, v91, v83, 0xbc9ded90 | |
v_madak_f32_e32 v94, v94, v83, 0x3d8eaf3b | |
v_madak_f32_e32 v83, v91, v83, 0xbf409397 | |
v_mac_f32_e32 v83, v92, v94 | |
v_mul_f32_e32 v90, v37, v90 | |
v_mul_f32_e32 v84, v95, v84 | |
v_mul_f32_e32 v83, v83, v90 | |
v_mul_f32_e32 v90, v88, v93 | |
v_mac_f32_e32 v83, v89, v90 | |
v_mul_f32_e32 v84, v96, v84 | |
v_mac_f32_e32 v84, v83, v87 | |
v_mul_f32_e32 v83, s18, v85 | |
v_mul_f32_e32 v83, v89, v83 | |
v_and_b32_e32 v85, s50, v83 | |
v_mov_b32_e32 v90, 0x3fa00000 | |
v_cmp_gt_f32_e64 s[4:5], v90, v85 | |
v_mul_f32_e32 v90, v85, v85 | |
v_rcp_f32_e32 v91, v90 | |
v_add_f32_e32 v92, -1.0, v85 | |
v_mov_b32_e32 v94, 0xbd777f97 | |
v_mov_b32_e32 v95, 0x4036db6e | |
v_cndmask_b32_e64 v91, v91, v92, s[4:5] | |
v_mov_b32_e32 v92, 0x3f580000 | |
v_cmp_gt_f32_e64 s[8:9], v92, v85 | |
v_cndmask_b32_e64 v90, v91, v90, s[8:9] | |
v_mov_b32_e32 v92, 0xc1b38712 | |
v_madak_f32_e32 v94, v94, v90, 0x40d23f7c | |
v_madak_f32_e32 v92, v92, v90, 0x43ed43a7 | |
v_madak_f32_e32 v94, v90, v94, 0x42d9451f | |
v_madak_f32_e32 v92, v90, v92, 0x451f90ce | |
v_madak_f32_e32 v94, v90, v94, 0x43d6810b | |
v_madak_f32_e32 v92, v90, v92, 0x4547fdbb | |
v_madak_f32_e32 v94, v90, v94, 0x442158c9 | |
v_madak_f32_e32 v92, v90, v92, 0x44c01759 | |
v_madak_f32_e32 v94, v90, v94, 0x43d9486f | |
v_madak_f32_e32 v92, v90, v92, 0x43a2e571 | |
v_madak_f32_e32 v94, v90, v94, 0x4309a863 | |
v_cmp_gt_f32_e32 vcc, v95, v85 | |
v_madak_f32_e32 v92, v90, v92, 0x41f2b459 | |
v_madak_f32_e32 v94, v90, v94, 0x419d35ce | |
v_cndmask_b32_e32 v92, v92, v94, vcc | |
v_mov_b32_e32 v94, 0x3c445aa3 | |
v_madak_f32_e32 v94, v94, v90, 0x3c5f6e13 | |
v_madak_f32_e32 v94, v90, v94, 0x3e013307 | |
v_madak_f32_e32 v94, v90, v94, 0x3d931ae7 | |
v_madak_f32_e32 v94, v90, v94, 0x3f0a5785 | |
v_mov_b32_e32 v93, 0xc11d077e | |
v_madak_f32_e32 v94, v90, v94, 0x3dd9f331 | |
v_mov_b32_e32 v91, 0xc3f1c275 | |
v_madak_f32_e32 v93, v93, v90, 0xc2a2932b | |
v_cndmask_b32_e64 v92, v92, v94, s[4:5] | |
v_mov_b32_e32 v94, 0xb684e21a | |
v_madak_f32_e32 v91, v91, v90, 0xc480230b | |
v_madak_f32_e32 v93, v90, v93, 0xc3389ae7 | |
v_madak_f32_e32 v94, v94, v90, 0x390aee49 | |
v_madak_f32_e32 v91, v90, v91, 0xc41f6441 | |
v_madak_f32_e32 v93, v90, v93, 0xc322658c | |
v_madak_f32_e32 v94, v90, v94, 0x3ba68116 | |
v_madak_f32_e32 v91, v90, v91, 0xc320a2ea | |
v_madak_f32_e32 v93, v90, v93, 0xc2798057 | |
v_madak_f32_e32 v94, v90, v94, 0x3d852a63 | |
v_madak_f32_e32 v91, v90, v91, 0xc18e104b | |
v_madak_f32_e32 v93, v90, v93, 0xc128f022 | |
v_madak_f32_e32 v94, v90, v94, 0x3ecbbbce | |
v_madak_f32_e32 v91, v90, v91, 0xbf4c9dd4 | |
v_madak_f32_e32 v93, v90, v93, 0xbf31a0b7 | |
v_cndmask_b32_e64 v92, v92, v94, s[8:9] | |
v_and_b32_e32 v94, s51, v83 | |
v_mov_b32_e32 v95, 0xbf100000 | |
v_madak_f32_e32 v91, v90, v91, 0xbc21a092 | |
v_madak_f32_e32 v93, v90, v93, 0xbc21a093 | |
v_mad_f32 v95, v94, -v94, v95 | |
v_cndmask_b32_e32 v91, v91, v93, vcc | |
v_cmp_gt_f32_e32 vcc, 0, v95 | |
v_cndmask_b32_e64 v96, 0.5, -0.5, vcc | |
v_mov_b32_e32 v97, 0x3fb8aa3b | |
v_mac_f32_e32 v96, v97, v95 | |
v_cvt_i32_f32_e32 v96, v96 | |
v_mov_b32_e32 v93, 0xbb0df9c0 | |
v_madak_f32_e32 v93, v93, v90, 0x3d1151b3 | |
v_madak_f32_e32 v93, v90, v93, 0xbde31cc2 | |
v_cvt_f32_i32_e32 v100, v96 | |
v_madak_f32_e32 v93, v90, v93, 0x3ea2fe54 | |
v_mov_b32_e32 v101, 0xbf317180 | |
v_madak_f32_e32 v93, v90, v93, 0xbebe9208 | |
v_mad_f32 v102, v101, v100, v95 | |
v_mov_b32_e32 v103, 0xb717f7d1 | |
v_madak_f32_e32 v93, v90, v93, 0x3ed46805 | |
v_mad_f32 v104, v103, v100, v102 | |
v_madak_f32_e32 v93, v90, v93, 0xbb1acdc6 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mov_b32_e32 v106, 0xb5ddea0e | |
v_mov_b32_e32 v107, 0x3331bb4c | |
v_cndmask_b32_e64 v91, v91, v93, s[4:5] | |
v_mov_b32_e32 v93, 0xb7c756b1 | |
v_mad_f32 v108, v107, v105, v106 | |
v_mov_b32_e32 v109, 0x388ab355 | |
v_madak_f32_e32 v93, v93, v90, 0xbbbd1489 | |
v_mad_f32 v108, v108, v105, v109 | |
v_mov_b32_e32 v110, 0xbb360b61 | |
v_madak_f32_e32 v93, v90, v93, 0xbce9528f | |
v_mad_f32 v108, v108, v105, v110 | |
v_mov_b32_e32 v111, 0x3e2aaaab | |
v_madak_f32_e32 v93, v90, v93, 0xbea66beb | |
v_mad_f32 v108, v108, v105, v111 | |
v_madak_f32_e32 v93, v90, v93, 0x3e0375d4 | |
v_mad_f32 v105, -v105, v108, v104 | |
v_mad_f32 v90, v90, v92, 1.0 | |
v_mov_b32_e32 v92, 0x6f800000 | |
v_cmp_gt_f32_e64 vcc, |v90|, v92 | |
v_mov_b32_e32 v98, 0x2f800000 | |
v_sub_f32_e32 v108, 2.0, v105 | |
v_cndmask_b32_e32 v99, 1.0, v98, vcc | |
v_cmp_gt_f32_e64 vcc, |v108|, v92 | |
v_mul_f32_e32 v90, v99, v90 | |
v_cndmask_b32_e32 v112, 1.0, v98, vcc | |
v_mul_f32_e64 v108, v108, -v112 | |
v_rcp_f32_e32 v90, v90 | |
v_rcp_f32_e32 v108, v108 | |
v_cndmask_b32_e64 v91, v91, v93, s[8:9] | |
v_lshlrev_b32_e32 v93, 23, v96 | |
v_mul_f32_e32 v90, v90, v91 | |
v_mul_f32_e32 v91, v105, v104 | |
v_mul_f32_e32 v91, v108, v91 | |
v_mul_f32_e32 v91, v91, v112 | |
v_mad_f32 v91, -v100, v103, v91 | |
v_subrev_f32_e32 v91, v102, v91 | |
v_sub_f32_e32 v91, 1.0, v91 | |
v_add_i32_e32 v91, vcc, v91, v93 | |
v_mov_b32_e32 v93, 0xc2aeac4f | |
v_cmp_nlt_f32_e32 vcc, v95, v93 | |
v_mov_b32_e32 v96, 0x42b17218 | |
v_cndmask_b32_e32 v91, 0, v91, vcc | |
v_cmp_lt_f32_e32 vcc, v95, v96 | |
v_mov_b32_e32 v100, 0x7f800000 | |
v_cndmask_b32_e32 v91, v100, v91, vcc | |
v_cmp_u_f32_e32 vcc, v95, v95 | |
v_cndmask_b32_e32 v91, v91, v95, vcc | |
v_subrev_f32_e32 v95, v85, v94 | |
v_mul_f32_e32 v102, v90, v99 | |
v_add_f32_e32 v94, v85, v94 | |
v_mad_f32 v94, v94, v95, v102 | |
v_cmp_gt_f32_e32 vcc, 0, v94 | |
v_cndmask_b32_e64 v95, 0.5, -0.5, vcc | |
v_mac_f32_e32 v95, v97, v94 | |
v_cvt_i32_f32_e32 v95, v95 | |
v_madak_f32_e32 v90, v99, v90, 0x3f58560b | |
v_mad_f32 v82, -v84, v81, v82 | |
v_mad_f32 v26, v81, v84, v26 | |
v_cvt_f32_i32_e32 v97, v95 | |
v_lshlrev_b32_e32 v95, 23, v95 | |
v_mad_f32 v79, -v84, v80, v79 | |
v_mad_f32 v25, v80, v84, v25 | |
v_mad_f32 v101, v101, v97, v94 | |
v_mad_f32 v104, v103, v97, v101 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mac_f32_e32 v106, v107, v105 | |
v_mac_f32_e32 v109, v106, v105 | |
v_mac_f32_e32 v110, v109, v105 | |
v_mac_f32_e32 v111, v110, v105 | |
v_mad_f32 v105, -v105, v111, v104 | |
v_mul_f32_e32 v104, v105, v104 | |
v_sub_f32_e32 v105, 2.0, v105 | |
v_cmp_gt_f32_e64 vcc, |v105|, v92 | |
v_cndmask_b32_e32 v106, 1.0, v98, vcc | |
v_mul_f32_e64 v105, v105, -v106 | |
v_rcp_f32_e32 v105, v105 | |
v_mad_f32 v76, -v84, v78, v76 | |
v_mad_f32 v24, v78, v84, v24 | |
v_mad_f32 v86, -v67, v0, v86 | |
v_mul_f32_e32 v104, v105, v104 | |
v_mul_f32_e32 v104, v104, v106 | |
v_mad_f32 v97, -v97, v103, v104 | |
v_subrev_f32_e32 v97, v101, v97 | |
v_sub_f32_e32 v97, 1.0, v97 | |
v_add_i32_e32 v95, vcc, v97, v95 | |
v_cmp_nlt_f32_e32 vcc, v94, v93 | |
v_cndmask_b32_e32 v93, 0, v95, vcc | |
v_cmp_lt_f32_e32 vcc, v94, v96 | |
v_cndmask_b32_e32 v93, v100, v93, vcc | |
v_cmp_gt_f32_e64 vcc, |v85|, v92 | |
v_cndmask_b32_e32 v92, 1.0, v98, vcc | |
v_mul_f32_e32 v95, v92, v85 | |
v_rcp_f32_e32 v95, v95 | |
v_cmp_u_f32_e32 vcc, v94, v94 | |
v_cndmask_b32_e32 v93, v93, v94, vcc | |
v_mul_f32_e32 v91, v93, v91 | |
v_mov_b32_e32 v93, 0x40c00000 | |
v_cmp_gt_f32_e32 vcc, v93, v85 | |
v_mov_b32_e32 v93, 0x31800000 | |
v_mul_f32_e32 v91, v95, v91 | |
v_cmp_gt_f32_e64 s[10:11], v93, v85 | |
v_mad_f32 v85, -v92, v91, 1.0 | |
v_cndmask_b32_e32 v85, 1.0, v85, vcc | |
v_cndmask_b32_e64 v85, v85, v90, s[4:5] | |
v_and_b32_e32 v90, s52, v83 | |
v_or_b32_e32 v85, v90, v85 | |
v_mad_f32 v90, v102, v83, v83 | |
v_cndmask_b32_e64 v85, v85, v90, s[8:9] | |
v_mul_f32_e32 v90, 0x3f8375d4, v83 | |
v_mac_f32_e32 v90, 0x41000000, v83 | |
v_mul_f32_e32 v90, 0x3e000000, v90 | |
v_cndmask_b32_e64 v85, v85, v90, s[10:11] | |
v_cmp_u_f32_e32 vcc, v83, v83 | |
v_cndmask_b32_e32 v83, v85, v83, vcc | |
v_subrev_f32_e32 v83, v83, v88 | |
v_mul_f32_e64 v85, s19, -v88 | |
v_mac_f32_e32 v85, v83, v89 | |
v_mac_f32_e32 v5, v85, v87 | |
v_mac_f32_e32 v27, v0, v67 | |
BB7_74: ; %Flow1238 | |
; in Loop: Header=BB7_11 Depth=1 | |
s_or_b64 exec, exec, s[34:35] | |
BB7_75: ; in Loop: Header=BB7_11 Depth=1 | |
s_or_b64 exec, exec, s[12:13] | |
v_lshrrev_b32_e32 v67, 13, v69 | |
v_and_b32_e32 v67, 1, v67 | |
v_cmp_eq_u32_e32 vcc, 1, v67 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[12:13], exec, s[4:5] | |
; mask branch BB7_79 | |
s_cbranch_execz BB7_79 | |
BB7_76: ; in Loop: Header=BB7_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[87:90], v55 offset0:80 offset1:81 | |
s_and_b64 s[4:5], exec, s[0:1] | |
v_cmp_ne_u32_e32 vcc, v60, v77 | |
s_xor_b64 s[4:5], s[4:5], -1 | |
s_or_b64 s[4:5], s[4:5], vcc | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v80, v73, v88 | |
s_and_b64 s[8:9], exec, s[2:3] | |
s_or_b64 s[4:5], s[8:9], s[4:5] | |
v_subrev_f32_e32 v78, v72, v87 | |
v_mul_f32_e32 v83, v80, v80 | |
v_cndmask_b32_e64 v84, 0, 1.0, s[4:5] | |
v_subrev_f32_e32 v81, v74, v89 | |
v_mac_f32_e32 v83, v78, v78 | |
v_mac_f32_e32 v83, v81, v81 | |
v_mul_f32_e32 v84, s26, v84 | |
v_subrev_f32_e32 v67, v75, v90 | |
v_cmp_lt_f32_e32 vcc, v83, v84 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[34:35], exec, s[4:5] | |
; mask branch BB7_78 | |
s_cbranch_execz BB7_78 | |
BB7_77: ; in Loop: Header=BB7_11 Depth=1 | |
v_max_f32_e32 v85, 0x34cd15ae, v83 | |
v_rsq_f32_e32 v89, v85 | |
v_lshrrev_b32_e32 v83, 13, v65 | |
v_and_b32_e32 v83, 1, v83 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e64 s[4:5], 1, v83 | |
ds_read_b64 v[83:84], v56 offset:320 | |
v_mul_f32_e32 v93, v89, v89 | |
v_cndmask_b32_e64 v88, 0, 1.0, s[4:5] | |
v_mul_f32_e32 v94, v93, v93 | |
v_mul_f32_e32 v94, v88, v94 | |
v_mul_f32_e32 v95, v93, v94 | |
v_mad_f32 v94, v94, v93, s42 | |
v_cmp_gt_f32_e32 vcc, s27, v85 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v83, v70, v83 | |
v_mul_f32_e32 v84, v71, v84 | |
v_mad_f32 v97, v95, v95, s43 | |
v_mul_f32_e32 v94, 0xbe2aaaab, v94 | |
v_mad_f32 v96, v84, v95, -v83 | |
v_mul_f32_e32 v87, v75, v90 | |
v_cndmask_b32_e64 v90, 0, 1.0, vcc | |
v_mul_f32_e32 v84, v97, v84 | |
v_mul_f32_e32 v83, v83, v94 | |
v_mac_f32_e32 v83, 0x3daaaaaa, v84 | |
v_mul_f32_e32 v91, v88, v90 | |
v_mul_f32_e32 v92, v9, v85 | |
v_mul_f32_e32 v84, v90, v93 | |
v_mac_f32_e32 v8, v83, v91 | |
v_mul_f32_e32 v83, v92, v92 | |
v_mov_b32_e32 v90, 0x3a92b707 | |
v_madak_f32_e32 v90, v90, v83, 0x3ded3cb2 | |
v_mov_b32_e32 v91, 0x3c739487 | |
v_madak_f32_e32 v91, v91, v83, 0x3f01e2bc | |
v_mad_f32 v90, v90, v83, 1.0 | |
v_mac_f32_e32 v90, v92, v91 | |
v_mov_b32_e32 v91, 0xb2951928 | |
v_rcp_f32_e32 v90, v90 | |
v_madak_f32_e32 v91, v91, v83, 0xb85ffb93 | |
v_mov_b32_e32 v94, 0x35c55945 | |
v_madak_f32_e32 v94, v94, v83, 0x3a83ca0c | |
v_madak_f32_e32 v91, v91, v83, 0xbc9ded90 | |
v_madak_f32_e32 v94, v94, v83, 0x3d8eaf3b | |
v_madak_f32_e32 v83, v91, v83, 0xbf409397 | |
v_mac_f32_e32 v83, v92, v94 | |
v_mul_f32_e32 v90, v37, v90 | |
v_mul_f32_e32 v84, v95, v84 | |
v_mul_f32_e32 v83, v83, v90 | |
v_mul_f32_e32 v90, v88, v93 | |
v_mac_f32_e32 v83, v89, v90 | |
v_mul_f32_e32 v84, v96, v84 | |
v_mac_f32_e32 v84, v83, v87 | |
v_mul_f32_e32 v83, s18, v85 | |
v_mul_f32_e32 v83, v89, v83 | |
v_and_b32_e32 v85, s50, v83 | |
v_mov_b32_e32 v90, 0x3fa00000 | |
v_cmp_gt_f32_e64 s[4:5], v90, v85 | |
v_mul_f32_e32 v90, v85, v85 | |
v_rcp_f32_e32 v91, v90 | |
v_add_f32_e32 v92, -1.0, v85 | |
v_mov_b32_e32 v94, 0xbd777f97 | |
v_mov_b32_e32 v95, 0x4036db6e | |
v_cndmask_b32_e64 v91, v91, v92, s[4:5] | |
v_mov_b32_e32 v92, 0x3f580000 | |
v_cmp_gt_f32_e64 s[8:9], v92, v85 | |
v_cndmask_b32_e64 v90, v91, v90, s[8:9] | |
v_mov_b32_e32 v92, 0xc1b38712 | |
v_madak_f32_e32 v94, v94, v90, 0x40d23f7c | |
v_madak_f32_e32 v92, v92, v90, 0x43ed43a7 | |
v_madak_f32_e32 v94, v90, v94, 0x42d9451f | |
v_madak_f32_e32 v92, v90, v92, 0x451f90ce | |
v_madak_f32_e32 v94, v90, v94, 0x43d6810b | |
v_madak_f32_e32 v92, v90, v92, 0x4547fdbb | |
v_madak_f32_e32 v94, v90, v94, 0x442158c9 | |
v_madak_f32_e32 v92, v90, v92, 0x44c01759 | |
v_madak_f32_e32 v94, v90, v94, 0x43d9486f | |
v_madak_f32_e32 v92, v90, v92, 0x43a2e571 | |
v_madak_f32_e32 v94, v90, v94, 0x4309a863 | |
v_cmp_gt_f32_e32 vcc, v95, v85 | |
v_madak_f32_e32 v92, v90, v92, 0x41f2b459 | |
v_madak_f32_e32 v94, v90, v94, 0x419d35ce | |
v_cndmask_b32_e32 v92, v92, v94, vcc | |
v_mov_b32_e32 v94, 0x3c445aa3 | |
v_madak_f32_e32 v94, v94, v90, 0x3c5f6e13 | |
v_madak_f32_e32 v94, v90, v94, 0x3e013307 | |
v_madak_f32_e32 v94, v90, v94, 0x3d931ae7 | |
v_madak_f32_e32 v94, v90, v94, 0x3f0a5785 | |
v_mov_b32_e32 v93, 0xc11d077e | |
v_madak_f32_e32 v94, v90, v94, 0x3dd9f331 | |
v_mov_b32_e32 v91, 0xc3f1c275 | |
v_madak_f32_e32 v93, v93, v90, 0xc2a2932b | |
v_cndmask_b32_e64 v92, v92, v94, s[4:5] | |
v_mov_b32_e32 v94, 0xb684e21a | |
v_madak_f32_e32 v91, v91, v90, 0xc480230b | |
v_madak_f32_e32 v93, v90, v93, 0xc3389ae7 | |
v_madak_f32_e32 v94, v94, v90, 0x390aee49 | |
v_madak_f32_e32 v91, v90, v91, 0xc41f6441 | |
v_madak_f32_e32 v93, v90, v93, 0xc322658c | |
v_madak_f32_e32 v94, v90, v94, 0x3ba68116 | |
v_madak_f32_e32 v91, v90, v91, 0xc320a2ea | |
v_madak_f32_e32 v93, v90, v93, 0xc2798057 | |
v_madak_f32_e32 v94, v90, v94, 0x3d852a63 | |
v_madak_f32_e32 v91, v90, v91, 0xc18e104b | |
v_madak_f32_e32 v93, v90, v93, 0xc128f022 | |
v_madak_f32_e32 v94, v90, v94, 0x3ecbbbce | |
v_madak_f32_e32 v91, v90, v91, 0xbf4c9dd4 | |
v_madak_f32_e32 v93, v90, v93, 0xbf31a0b7 | |
v_cndmask_b32_e64 v92, v92, v94, s[8:9] | |
v_and_b32_e32 v94, s51, v83 | |
v_mov_b32_e32 v95, 0xbf100000 | |
v_madak_f32_e32 v91, v90, v91, 0xbc21a092 | |
v_madak_f32_e32 v93, v90, v93, 0xbc21a093 | |
v_mad_f32 v95, v94, -v94, v95 | |
v_cndmask_b32_e32 v91, v91, v93, vcc | |
v_cmp_gt_f32_e32 vcc, 0, v95 | |
v_cndmask_b32_e64 v96, 0.5, -0.5, vcc | |
v_mov_b32_e32 v97, 0x3fb8aa3b | |
v_mac_f32_e32 v96, v97, v95 | |
v_cvt_i32_f32_e32 v96, v96 | |
v_mov_b32_e32 v93, 0xbb0df9c0 | |
v_madak_f32_e32 v93, v93, v90, 0x3d1151b3 | |
v_madak_f32_e32 v93, v90, v93, 0xbde31cc2 | |
v_cvt_f32_i32_e32 v100, v96 | |
v_madak_f32_e32 v93, v90, v93, 0x3ea2fe54 | |
v_mov_b32_e32 v101, 0xbf317180 | |
v_madak_f32_e32 v93, v90, v93, 0xbebe9208 | |
v_mad_f32 v102, v101, v100, v95 | |
v_mov_b32_e32 v103, 0xb717f7d1 | |
v_madak_f32_e32 v93, v90, v93, 0x3ed46805 | |
v_mad_f32 v104, v103, v100, v102 | |
v_madak_f32_e32 v93, v90, v93, 0xbb1acdc6 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mov_b32_e32 v106, 0xb5ddea0e | |
v_mov_b32_e32 v107, 0x3331bb4c | |
v_cndmask_b32_e64 v91, v91, v93, s[4:5] | |
v_mov_b32_e32 v93, 0xb7c756b1 | |
v_mad_f32 v108, v107, v105, v106 | |
v_mov_b32_e32 v109, 0x388ab355 | |
v_madak_f32_e32 v93, v93, v90, 0xbbbd1489 | |
v_mad_f32 v108, v108, v105, v109 | |
v_mov_b32_e32 v110, 0xbb360b61 | |
v_madak_f32_e32 v93, v90, v93, 0xbce9528f | |
v_mad_f32 v108, v108, v105, v110 | |
v_mov_b32_e32 v111, 0x3e2aaaab | |
v_madak_f32_e32 v93, v90, v93, 0xbea66beb | |
v_mad_f32 v108, v108, v105, v111 | |
v_madak_f32_e32 v93, v90, v93, 0x3e0375d4 | |
v_mad_f32 v105, -v105, v108, v104 | |
v_mad_f32 v90, v90, v92, 1.0 | |
v_mov_b32_e32 v92, 0x6f800000 | |
v_cmp_gt_f32_e64 vcc, |v90|, v92 | |
v_mov_b32_e32 v98, 0x2f800000 | |
v_sub_f32_e32 v108, 2.0, v105 | |
v_cndmask_b32_e32 v99, 1.0, v98, vcc | |
v_cmp_gt_f32_e64 vcc, |v108|, v92 | |
v_mul_f32_e32 v90, v99, v90 | |
v_cndmask_b32_e32 v112, 1.0, v98, vcc | |
v_mul_f32_e64 v108, v108, -v112 | |
v_rcp_f32_e32 v90, v90 | |
v_rcp_f32_e32 v108, v108 | |
v_cndmask_b32_e64 v91, v91, v93, s[8:9] | |
v_lshlrev_b32_e32 v93, 23, v96 | |
v_mul_f32_e32 v90, v90, v91 | |
v_mul_f32_e32 v91, v105, v104 | |
v_mul_f32_e32 v91, v108, v91 | |
v_mul_f32_e32 v91, v91, v112 | |
v_mad_f32 v91, -v100, v103, v91 | |
v_subrev_f32_e32 v91, v102, v91 | |
v_sub_f32_e32 v91, 1.0, v91 | |
v_add_i32_e32 v91, vcc, v91, v93 | |
v_mov_b32_e32 v93, 0xc2aeac4f | |
v_cmp_nlt_f32_e32 vcc, v95, v93 | |
v_mov_b32_e32 v96, 0x42b17218 | |
v_cndmask_b32_e32 v91, 0, v91, vcc | |
v_cmp_lt_f32_e32 vcc, v95, v96 | |
v_mov_b32_e32 v100, 0x7f800000 | |
v_cndmask_b32_e32 v91, v100, v91, vcc | |
v_cmp_u_f32_e32 vcc, v95, v95 | |
v_cndmask_b32_e32 v91, v91, v95, vcc | |
v_subrev_f32_e32 v95, v85, v94 | |
v_mul_f32_e32 v102, v90, v99 | |
v_add_f32_e32 v94, v85, v94 | |
v_mad_f32 v94, v94, v95, v102 | |
v_cmp_gt_f32_e32 vcc, 0, v94 | |
v_cndmask_b32_e64 v95, 0.5, -0.5, vcc | |
v_mac_f32_e32 v95, v97, v94 | |
v_cvt_i32_f32_e32 v95, v95 | |
v_madak_f32_e32 v90, v99, v90, 0x3f58560b | |
v_mad_f32 v82, -v84, v81, v82 | |
v_mad_f32 v22, v81, v84, v22 | |
v_cvt_f32_i32_e32 v97, v95 | |
v_lshlrev_b32_e32 v95, 23, v95 | |
v_mad_f32 v79, -v84, v80, v79 | |
v_mad_f32 v21, v80, v84, v21 | |
v_mad_f32 v101, v101, v97, v94 | |
v_mad_f32 v104, v103, v97, v101 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mac_f32_e32 v106, v107, v105 | |
v_mac_f32_e32 v109, v106, v105 | |
v_mac_f32_e32 v110, v109, v105 | |
v_mac_f32_e32 v111, v110, v105 | |
v_mad_f32 v105, -v105, v111, v104 | |
v_mul_f32_e32 v104, v105, v104 | |
v_sub_f32_e32 v105, 2.0, v105 | |
v_cmp_gt_f32_e64 vcc, |v105|, v92 | |
v_cndmask_b32_e32 v106, 1.0, v98, vcc | |
v_mul_f32_e64 v105, v105, -v106 | |
v_rcp_f32_e32 v105, v105 | |
v_mad_f32 v76, -v84, v78, v76 | |
v_mad_f32 v20, v78, v84, v20 | |
v_mad_f32 v86, -v67, v0, v86 | |
v_mul_f32_e32 v104, v105, v104 | |
v_mul_f32_e32 v104, v104, v106 | |
v_mad_f32 v97, -v97, v103, v104 | |
v_subrev_f32_e32 v97, v101, v97 | |
v_sub_f32_e32 v97, 1.0, v97 | |
v_add_i32_e32 v95, vcc, v97, v95 | |
v_cmp_nlt_f32_e32 vcc, v94, v93 | |
v_cndmask_b32_e32 v93, 0, v95, vcc | |
v_cmp_lt_f32_e32 vcc, v94, v96 | |
v_cndmask_b32_e32 v93, v100, v93, vcc | |
v_cmp_gt_f32_e64 vcc, |v85|, v92 | |
v_cndmask_b32_e32 v92, 1.0, v98, vcc | |
v_mul_f32_e32 v95, v92, v85 | |
v_rcp_f32_e32 v95, v95 | |
v_cmp_u_f32_e32 vcc, v94, v94 | |
v_cndmask_b32_e32 v93, v93, v94, vcc | |
v_mul_f32_e32 v91, v93, v91 | |
v_mov_b32_e32 v93, 0x40c00000 | |
v_cmp_gt_f32_e32 vcc, v93, v85 | |
v_mov_b32_e32 v93, 0x31800000 | |
v_mul_f32_e32 v91, v95, v91 | |
v_cmp_gt_f32_e64 s[10:11], v93, v85 | |
v_mad_f32 v85, -v92, v91, 1.0 | |
v_cndmask_b32_e32 v85, 1.0, v85, vcc | |
v_cndmask_b32_e64 v85, v85, v90, s[4:5] | |
v_and_b32_e32 v90, s52, v83 | |
v_or_b32_e32 v85, v90, v85 | |
v_mad_f32 v90, v102, v83, v83 | |
v_cndmask_b32_e64 v85, v85, v90, s[8:9] | |
v_mul_f32_e32 v90, 0x3f8375d4, v83 | |
v_mac_f32_e32 v90, 0x41000000, v83 | |
v_mul_f32_e32 v90, 0x3e000000, v90 | |
v_cndmask_b32_e64 v85, v85, v90, s[10:11] | |
v_cmp_u_f32_e32 vcc, v83, v83 | |
v_cndmask_b32_e32 v83, v85, v83, vcc | |
v_subrev_f32_e32 v83, v83, v88 | |
v_mul_f32_e64 v85, s19, -v88 | |
v_mac_f32_e32 v85, v83, v89 | |
v_mac_f32_e32 v5, v85, v87 | |
v_mac_f32_e32 v23, v0, v67 | |
BB7_78: ; %Flow1237 | |
; in Loop: Header=BB7_11 Depth=1 | |
s_or_b64 exec, exec, s[34:35] | |
BB7_79: ; in Loop: Header=BB7_11 Depth=1 | |
s_or_b64 exec, exec, s[12:13] | |
v_lshrrev_b32_e32 v67, 14, v69 | |
v_and_b32_e32 v67, 1, v67 | |
v_cmp_eq_u32_e32 vcc, 1, v67 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[12:13], exec, s[4:5] | |
; mask branch BB7_83 | |
s_cbranch_execz BB7_83 | |
BB7_80: ; in Loop: Header=BB7_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[87:90], v55 offset0:96 offset1:97 | |
s_and_b64 s[4:5], exec, s[0:1] | |
v_cmp_ne_u32_e32 vcc, v59, v77 | |
s_xor_b64 s[4:5], s[4:5], -1 | |
s_or_b64 s[4:5], s[4:5], vcc | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v80, v73, v88 | |
s_and_b64 s[8:9], exec, s[2:3] | |
s_or_b64 s[4:5], s[8:9], s[4:5] | |
v_subrev_f32_e32 v78, v72, v87 | |
v_mul_f32_e32 v83, v80, v80 | |
v_cndmask_b32_e64 v84, 0, 1.0, s[4:5] | |
v_subrev_f32_e32 v81, v74, v89 | |
v_mac_f32_e32 v83, v78, v78 | |
v_mac_f32_e32 v83, v81, v81 | |
v_mul_f32_e32 v84, s26, v84 | |
v_subrev_f32_e32 v67, v75, v90 | |
v_cmp_lt_f32_e32 vcc, v83, v84 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[34:35], exec, s[4:5] | |
; mask branch BB7_82 | |
s_cbranch_execz BB7_82 | |
BB7_81: ; in Loop: Header=BB7_11 Depth=1 | |
v_max_f32_e32 v85, 0x34cd15ae, v83 | |
v_rsq_f32_e32 v89, v85 | |
v_lshrrev_b32_e32 v83, 14, v65 | |
v_and_b32_e32 v83, 1, v83 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e64 s[4:5], 1, v83 | |
ds_read_b64 v[83:84], v56 offset:384 | |
v_mul_f32_e32 v93, v89, v89 | |
v_cndmask_b32_e64 v88, 0, 1.0, s[4:5] | |
v_mul_f32_e32 v94, v93, v93 | |
v_mul_f32_e32 v94, v88, v94 | |
v_mul_f32_e32 v95, v93, v94 | |
v_mad_f32 v94, v94, v93, s42 | |
v_cmp_gt_f32_e32 vcc, s27, v85 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v83, v70, v83 | |
v_mul_f32_e32 v84, v71, v84 | |
v_mad_f32 v97, v95, v95, s43 | |
v_mul_f32_e32 v94, 0xbe2aaaab, v94 | |
v_mad_f32 v96, v84, v95, -v83 | |
v_mul_f32_e32 v87, v75, v90 | |
v_cndmask_b32_e64 v90, 0, 1.0, vcc | |
v_mul_f32_e32 v84, v97, v84 | |
v_mul_f32_e32 v83, v83, v94 | |
v_mac_f32_e32 v83, 0x3daaaaaa, v84 | |
v_mul_f32_e32 v91, v88, v90 | |
v_mul_f32_e32 v92, v9, v85 | |
v_mul_f32_e32 v84, v90, v93 | |
v_mac_f32_e32 v8, v83, v91 | |
v_mul_f32_e32 v83, v92, v92 | |
v_mov_b32_e32 v90, 0x3a92b707 | |
v_madak_f32_e32 v90, v90, v83, 0x3ded3cb2 | |
v_mov_b32_e32 v91, 0x3c739487 | |
v_madak_f32_e32 v91, v91, v83, 0x3f01e2bc | |
v_mad_f32 v90, v90, v83, 1.0 | |
v_mac_f32_e32 v90, v92, v91 | |
v_mov_b32_e32 v91, 0xb2951928 | |
v_rcp_f32_e32 v90, v90 | |
v_madak_f32_e32 v91, v91, v83, 0xb85ffb93 | |
v_mov_b32_e32 v94, 0x35c55945 | |
v_madak_f32_e32 v94, v94, v83, 0x3a83ca0c | |
v_madak_f32_e32 v91, v91, v83, 0xbc9ded90 | |
v_madak_f32_e32 v94, v94, v83, 0x3d8eaf3b | |
v_madak_f32_e32 v83, v91, v83, 0xbf409397 | |
v_mac_f32_e32 v83, v92, v94 | |
v_mul_f32_e32 v90, v37, v90 | |
v_mul_f32_e32 v84, v95, v84 | |
v_mul_f32_e32 v83, v83, v90 | |
v_mul_f32_e32 v90, v88, v93 | |
v_mac_f32_e32 v83, v89, v90 | |
v_mul_f32_e32 v84, v96, v84 | |
v_mac_f32_e32 v84, v83, v87 | |
v_mul_f32_e32 v83, s18, v85 | |
v_mul_f32_e32 v83, v89, v83 | |
v_and_b32_e32 v85, s50, v83 | |
v_mov_b32_e32 v90, 0x3fa00000 | |
v_cmp_gt_f32_e64 s[4:5], v90, v85 | |
v_mul_f32_e32 v90, v85, v85 | |
v_rcp_f32_e32 v91, v90 | |
v_add_f32_e32 v92, -1.0, v85 | |
v_mov_b32_e32 v94, 0xbd777f97 | |
v_mov_b32_e32 v95, 0x4036db6e | |
v_cndmask_b32_e64 v91, v91, v92, s[4:5] | |
v_mov_b32_e32 v92, 0x3f580000 | |
v_cmp_gt_f32_e64 s[8:9], v92, v85 | |
v_cndmask_b32_e64 v90, v91, v90, s[8:9] | |
v_mov_b32_e32 v92, 0xc1b38712 | |
v_madak_f32_e32 v94, v94, v90, 0x40d23f7c | |
v_madak_f32_e32 v92, v92, v90, 0x43ed43a7 | |
v_madak_f32_e32 v94, v90, v94, 0x42d9451f | |
v_madak_f32_e32 v92, v90, v92, 0x451f90ce | |
v_madak_f32_e32 v94, v90, v94, 0x43d6810b | |
v_madak_f32_e32 v92, v90, v92, 0x4547fdbb | |
v_madak_f32_e32 v94, v90, v94, 0x442158c9 | |
v_madak_f32_e32 v92, v90, v92, 0x44c01759 | |
v_madak_f32_e32 v94, v90, v94, 0x43d9486f | |
v_madak_f32_e32 v92, v90, v92, 0x43a2e571 | |
v_madak_f32_e32 v94, v90, v94, 0x4309a863 | |
v_cmp_gt_f32_e32 vcc, v95, v85 | |
v_madak_f32_e32 v92, v90, v92, 0x41f2b459 | |
v_madak_f32_e32 v94, v90, v94, 0x419d35ce | |
v_cndmask_b32_e32 v92, v92, v94, vcc | |
v_mov_b32_e32 v94, 0x3c445aa3 | |
v_madak_f32_e32 v94, v94, v90, 0x3c5f6e13 | |
v_madak_f32_e32 v94, v90, v94, 0x3e013307 | |
v_madak_f32_e32 v94, v90, v94, 0x3d931ae7 | |
v_madak_f32_e32 v94, v90, v94, 0x3f0a5785 | |
v_mov_b32_e32 v93, 0xc11d077e | |
v_madak_f32_e32 v94, v90, v94, 0x3dd9f331 | |
v_mov_b32_e32 v91, 0xc3f1c275 | |
v_madak_f32_e32 v93, v93, v90, 0xc2a2932b | |
v_cndmask_b32_e64 v92, v92, v94, s[4:5] | |
v_mov_b32_e32 v94, 0xb684e21a | |
v_madak_f32_e32 v91, v91, v90, 0xc480230b | |
v_madak_f32_e32 v93, v90, v93, 0xc3389ae7 | |
v_madak_f32_e32 v94, v94, v90, 0x390aee49 | |
v_madak_f32_e32 v91, v90, v91, 0xc41f6441 | |
v_madak_f32_e32 v93, v90, v93, 0xc322658c | |
v_madak_f32_e32 v94, v90, v94, 0x3ba68116 | |
v_madak_f32_e32 v91, v90, v91, 0xc320a2ea | |
v_madak_f32_e32 v93, v90, v93, 0xc2798057 | |
v_madak_f32_e32 v94, v90, v94, 0x3d852a63 | |
v_madak_f32_e32 v91, v90, v91, 0xc18e104b | |
v_madak_f32_e32 v93, v90, v93, 0xc128f022 | |
v_madak_f32_e32 v94, v90, v94, 0x3ecbbbce | |
v_madak_f32_e32 v91, v90, v91, 0xbf4c9dd4 | |
v_madak_f32_e32 v93, v90, v93, 0xbf31a0b7 | |
v_cndmask_b32_e64 v92, v92, v94, s[8:9] | |
v_and_b32_e32 v94, s51, v83 | |
v_mov_b32_e32 v95, 0xbf100000 | |
v_madak_f32_e32 v91, v90, v91, 0xbc21a092 | |
v_madak_f32_e32 v93, v90, v93, 0xbc21a093 | |
v_mad_f32 v95, v94, -v94, v95 | |
v_cndmask_b32_e32 v91, v91, v93, vcc | |
v_cmp_gt_f32_e32 vcc, 0, v95 | |
v_cndmask_b32_e64 v96, 0.5, -0.5, vcc | |
v_mov_b32_e32 v97, 0x3fb8aa3b | |
v_mac_f32_e32 v96, v97, v95 | |
v_cvt_i32_f32_e32 v96, v96 | |
v_mov_b32_e32 v93, 0xbb0df9c0 | |
v_madak_f32_e32 v93, v93, v90, 0x3d1151b3 | |
v_madak_f32_e32 v93, v90, v93, 0xbde31cc2 | |
v_cvt_f32_i32_e32 v100, v96 | |
v_madak_f32_e32 v93, v90, v93, 0x3ea2fe54 | |
v_mov_b32_e32 v101, 0xbf317180 | |
v_madak_f32_e32 v93, v90, v93, 0xbebe9208 | |
v_mad_f32 v102, v101, v100, v95 | |
v_mov_b32_e32 v103, 0xb717f7d1 | |
v_madak_f32_e32 v93, v90, v93, 0x3ed46805 | |
v_mad_f32 v104, v103, v100, v102 | |
v_madak_f32_e32 v93, v90, v93, 0xbb1acdc6 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mov_b32_e32 v106, 0xb5ddea0e | |
v_mov_b32_e32 v107, 0x3331bb4c | |
v_cndmask_b32_e64 v91, v91, v93, s[4:5] | |
v_mov_b32_e32 v93, 0xb7c756b1 | |
v_mad_f32 v108, v107, v105, v106 | |
v_mov_b32_e32 v109, 0x388ab355 | |
v_madak_f32_e32 v93, v93, v90, 0xbbbd1489 | |
v_mad_f32 v108, v108, v105, v109 | |
v_mov_b32_e32 v110, 0xbb360b61 | |
v_madak_f32_e32 v93, v90, v93, 0xbce9528f | |
v_mad_f32 v108, v108, v105, v110 | |
v_mov_b32_e32 v111, 0x3e2aaaab | |
v_madak_f32_e32 v93, v90, v93, 0xbea66beb | |
v_mad_f32 v108, v108, v105, v111 | |
v_madak_f32_e32 v93, v90, v93, 0x3e0375d4 | |
v_mad_f32 v105, -v105, v108, v104 | |
v_mad_f32 v90, v90, v92, 1.0 | |
v_mov_b32_e32 v92, 0x6f800000 | |
v_cmp_gt_f32_e64 vcc, |v90|, v92 | |
v_mov_b32_e32 v98, 0x2f800000 | |
v_sub_f32_e32 v108, 2.0, v105 | |
v_cndmask_b32_e32 v99, 1.0, v98, vcc | |
v_cmp_gt_f32_e64 vcc, |v108|, v92 | |
v_mul_f32_e32 v90, v99, v90 | |
v_cndmask_b32_e32 v112, 1.0, v98, vcc | |
v_mul_f32_e64 v108, v108, -v112 | |
v_rcp_f32_e32 v90, v90 | |
v_rcp_f32_e32 v108, v108 | |
v_cndmask_b32_e64 v91, v91, v93, s[8:9] | |
v_lshlrev_b32_e32 v93, 23, v96 | |
v_mul_f32_e32 v90, v90, v91 | |
v_mul_f32_e32 v91, v105, v104 | |
v_mul_f32_e32 v91, v108, v91 | |
v_mul_f32_e32 v91, v91, v112 | |
v_mad_f32 v91, -v100, v103, v91 | |
v_subrev_f32_e32 v91, v102, v91 | |
v_sub_f32_e32 v91, 1.0, v91 | |
v_add_i32_e32 v91, vcc, v91, v93 | |
v_mov_b32_e32 v93, 0xc2aeac4f | |
v_cmp_nlt_f32_e32 vcc, v95, v93 | |
v_mov_b32_e32 v96, 0x42b17218 | |
v_cndmask_b32_e32 v91, 0, v91, vcc | |
v_cmp_lt_f32_e32 vcc, v95, v96 | |
v_mov_b32_e32 v100, 0x7f800000 | |
v_cndmask_b32_e32 v91, v100, v91, vcc | |
v_cmp_u_f32_e32 vcc, v95, v95 | |
v_cndmask_b32_e32 v91, v91, v95, vcc | |
v_subrev_f32_e32 v95, v85, v94 | |
v_mul_f32_e32 v102, v90, v99 | |
v_add_f32_e32 v94, v85, v94 | |
v_mad_f32 v94, v94, v95, v102 | |
v_cmp_gt_f32_e32 vcc, 0, v94 | |
v_cndmask_b32_e64 v95, 0.5, -0.5, vcc | |
v_mac_f32_e32 v95, v97, v94 | |
v_cvt_i32_f32_e32 v95, v95 | |
v_madak_f32_e32 v90, v99, v90, 0x3f58560b | |
v_mad_f32 v82, -v84, v81, v82 | |
v_mad_f32 v18, v81, v84, v18 | |
v_cvt_f32_i32_e32 v97, v95 | |
v_lshlrev_b32_e32 v95, 23, v95 | |
v_mad_f32 v79, -v84, v80, v79 | |
v_mad_f32 v17, v80, v84, v17 | |
v_mad_f32 v101, v101, v97, v94 | |
v_mad_f32 v104, v103, v97, v101 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mac_f32_e32 v106, v107, v105 | |
v_mac_f32_e32 v109, v106, v105 | |
v_mac_f32_e32 v110, v109, v105 | |
v_mac_f32_e32 v111, v110, v105 | |
v_mad_f32 v105, -v105, v111, v104 | |
v_mul_f32_e32 v104, v105, v104 | |
v_sub_f32_e32 v105, 2.0, v105 | |
v_cmp_gt_f32_e64 vcc, |v105|, v92 | |
v_cndmask_b32_e32 v106, 1.0, v98, vcc | |
v_mul_f32_e64 v105, v105, -v106 | |
v_rcp_f32_e32 v105, v105 | |
v_mad_f32 v76, -v84, v78, v76 | |
v_mad_f32 v16, v78, v84, v16 | |
v_mad_f32 v86, -v67, v0, v86 | |
v_mul_f32_e32 v104, v105, v104 | |
v_mul_f32_e32 v104, v104, v106 | |
v_mad_f32 v97, -v97, v103, v104 | |
v_subrev_f32_e32 v97, v101, v97 | |
v_sub_f32_e32 v97, 1.0, v97 | |
v_add_i32_e32 v95, vcc, v97, v95 | |
v_cmp_nlt_f32_e32 vcc, v94, v93 | |
v_cndmask_b32_e32 v93, 0, v95, vcc | |
v_cmp_lt_f32_e32 vcc, v94, v96 | |
v_cndmask_b32_e32 v93, v100, v93, vcc | |
v_cmp_gt_f32_e64 vcc, |v85|, v92 | |
v_cndmask_b32_e32 v92, 1.0, v98, vcc | |
v_mul_f32_e32 v95, v92, v85 | |
v_rcp_f32_e32 v95, v95 | |
v_cmp_u_f32_e32 vcc, v94, v94 | |
v_cndmask_b32_e32 v93, v93, v94, vcc | |
v_mul_f32_e32 v91, v93, v91 | |
v_mov_b32_e32 v93, 0x40c00000 | |
v_cmp_gt_f32_e32 vcc, v93, v85 | |
v_mov_b32_e32 v93, 0x31800000 | |
v_mul_f32_e32 v91, v95, v91 | |
v_cmp_gt_f32_e64 s[10:11], v93, v85 | |
v_mad_f32 v85, -v92, v91, 1.0 | |
v_cndmask_b32_e32 v85, 1.0, v85, vcc | |
v_cndmask_b32_e64 v85, v85, v90, s[4:5] | |
v_and_b32_e32 v90, s52, v83 | |
v_or_b32_e32 v85, v90, v85 | |
v_mad_f32 v90, v102, v83, v83 | |
v_cndmask_b32_e64 v85, v85, v90, s[8:9] | |
v_mul_f32_e32 v90, 0x3f8375d4, v83 | |
v_mac_f32_e32 v90, 0x41000000, v83 | |
v_mul_f32_e32 v90, 0x3e000000, v90 | |
v_cndmask_b32_e64 v85, v85, v90, s[10:11] | |
v_cmp_u_f32_e32 vcc, v83, v83 | |
v_cndmask_b32_e32 v83, v85, v83, vcc | |
v_subrev_f32_e32 v83, v83, v88 | |
v_mul_f32_e64 v85, s19, -v88 | |
v_mac_f32_e32 v85, v83, v89 | |
v_mac_f32_e32 v5, v85, v87 | |
v_mac_f32_e32 v19, v0, v67 | |
BB7_82: ; %Flow1236 | |
; in Loop: Header=BB7_11 Depth=1 | |
s_or_b64 exec, exec, s[34:35] | |
BB7_83: ; in Loop: Header=BB7_11 Depth=1 | |
s_or_b64 exec, exec, s[12:13] | |
v_lshrrev_b32_e32 v67, 15, v69 | |
v_and_b32_e32 v67, 1, v67 | |
v_cmp_eq_u32_e32 vcc, 1, v67 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[12:13], exec, s[4:5] | |
; mask branch BB7_87 | |
s_cbranch_execz BB7_87 | |
BB7_84: ; in Loop: Header=BB7_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[87:90], v55 offset0:112 offset1:113 | |
s_and_b64 s[4:5], exec, s[0:1] | |
v_cmp_ne_u32_e32 vcc, v38, v77 | |
s_xor_b64 s[4:5], s[4:5], -1 | |
s_or_b64 s[4:5], s[4:5], vcc | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v73, v73, v88 | |
s_and_b64 s[8:9], exec, s[2:3] | |
s_or_b64 s[4:5], s[8:9], s[4:5] | |
v_subrev_f32_e32 v72, v72, v87 | |
v_mul_f32_e32 v77, v73, v73 | |
v_cndmask_b32_e64 v78, 0, 1.0, s[4:5] | |
v_subrev_f32_e32 v74, v74, v89 | |
v_mac_f32_e32 v77, v72, v72 | |
v_mac_f32_e32 v77, v74, v74 | |
v_mul_f32_e32 v78, s26, v78 | |
v_subrev_f32_e32 v67, v75, v90 | |
v_cmp_lt_f32_e32 vcc, v77, v78 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[34:35], exec, s[4:5] | |
; mask branch BB7_86 | |
s_cbranch_execz BB7_86 | |
BB7_85: ; in Loop: Header=BB7_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read_b64 v[80:81], v56 offset:448 | |
v_max_f32_e32 v77, 0x34cd15ae, v77 | |
v_mad_f32 v83, -v67, v0, v86 | |
v_rsq_f32_e32 v83, v77 | |
v_lshrrev_b32_e32 v78, 15, v65 | |
v_and_b32_e32 v78, 1, v78 | |
v_cmp_eq_u32_e32 vcc, 1, v78 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v71, v71, v81 | |
v_mul_f32_e32 v81, v83, v83 | |
v_cndmask_b32_e64 v78, 0, 1.0, vcc | |
v_mul_f32_e32 v84, v81, v81 | |
v_mul_f32_e32 v84, v78, v84 | |
v_mul_f32_e32 v85, v81, v84 | |
v_mad_f32 v84, v84, v81, s42 | |
v_mad_f32 v86, v85, v85, s43 | |
v_cmp_gt_f32_e32 vcc, s27, v77 | |
v_mul_f32_e32 v70, v70, v80 | |
v_mul_f32_e32 v84, 0xbe2aaaab, v84 | |
v_mul_f32_e32 v84, v70, v84 | |
v_mul_f32_e32 v86, v86, v71 | |
v_cndmask_b32_e64 v80, 0, 1.0, vcc | |
v_mac_f32_e32 v84, 0x3daaaaaa, v86 | |
v_mul_f32_e32 v86, v78, v80 | |
v_mac_f32_e32 v8, v84, v86 | |
v_mul_f32_e32 v86, v9, v77 | |
v_mul_f32_e32 v87, v86, v86 | |
v_mov_b32_e32 v88, 0x3a92b707 | |
v_madak_f32_e32 v88, v88, v87, 0x3ded3cb2 | |
v_mov_b32_e32 v89, 0x3c739487 | |
v_madak_f32_e32 v89, v89, v87, 0x3f01e2bc | |
v_mad_f32 v88, v88, v87, 1.0 | |
v_mac_f32_e32 v88, v86, v89 | |
v_mov_b32_e32 v89, 0xb2951928 | |
v_madak_f32_e32 v89, v89, v87, 0xb85ffb93 | |
v_mul_f32_e32 v75, v75, v90 | |
v_mov_b32_e32 v90, 0x35c55945 | |
v_madak_f32_e32 v90, v90, v87, 0x3a83ca0c | |
v_madak_f32_e32 v89, v89, v87, 0xbc9ded90 | |
v_madak_f32_e32 v90, v90, v87, 0x3d8eaf3b | |
v_madak_f32_e32 v87, v89, v87, 0xbf409397 | |
v_mac_f32_e32 v87, v86, v90 | |
v_rcp_f32_e32 v86, v88 | |
v_mul_f32_e32 v77, s18, v77 | |
v_mul_f32_e32 v77, v83, v77 | |
v_mul_f32_e32 v84, v78, v81 | |
v_mul_f32_e32 v86, v37, v86 | |
v_mul_f32_e32 v86, v87, v86 | |
v_mac_f32_e32 v86, v83, v84 | |
v_and_b32_e32 v84, s50, v77 | |
v_mov_b32_e32 v87, 0x3fa00000 | |
v_cmp_gt_f32_e64 s[4:5], v87, v84 | |
v_mul_f32_e32 v87, v84, v84 | |
v_rcp_f32_e32 v88, v87 | |
v_add_f32_e32 v89, -1.0, v84 | |
v_mov_b32_e32 v90, 0xbd777f97 | |
v_mov_b32_e32 v91, 0x6f800000 | |
v_cndmask_b32_e64 v88, v88, v89, s[4:5] | |
v_mov_b32_e32 v89, 0x3f580000 | |
v_cmp_gt_f32_e64 s[8:9], v89, v84 | |
v_cndmask_b32_e64 v87, v88, v87, s[8:9] | |
v_mov_b32_e32 v89, 0xc11d077e | |
v_mov_b32_e32 v88, 0x4036db6e | |
v_madak_f32_e32 v89, v89, v87, 0xc2a2932b | |
v_cmp_gt_f32_e32 vcc, v88, v84 | |
v_mov_b32_e32 v88, 0xc3f1c275 | |
v_madak_f32_e32 v88, v88, v87, 0xc480230b | |
v_madak_f32_e32 v89, v87, v89, 0xc3389ae7 | |
v_madak_f32_e32 v88, v87, v88, 0xc41f6441 | |
v_madak_f32_e32 v89, v87, v89, 0xc322658c | |
v_madak_f32_e32 v88, v87, v88, 0xc320a2ea | |
v_madak_f32_e32 v89, v87, v89, 0xc2798057 | |
v_madak_f32_e32 v88, v87, v88, 0xc18e104b | |
v_madak_f32_e32 v89, v87, v89, 0xc128f022 | |
v_madak_f32_e32 v88, v87, v88, 0xbf4c9dd4 | |
v_madak_f32_e32 v89, v87, v89, 0xbf31a0b7 | |
v_madak_f32_e32 v88, v87, v88, 0xbc21a092 | |
v_madak_f32_e32 v89, v87, v89, 0xbc21a093 | |
v_madak_f32_e32 v90, v90, v87, 0x40d23f7c | |
v_cndmask_b32_e32 v88, v88, v89, vcc | |
v_mov_b32_e32 v89, 0xc1b38712 | |
v_madak_f32_e32 v89, v89, v87, 0x43ed43a7 | |
v_madak_f32_e32 v90, v87, v90, 0x42d9451f | |
v_madak_f32_e32 v89, v87, v89, 0x451f90ce | |
v_madak_f32_e32 v90, v87, v90, 0x43d6810b | |
v_madak_f32_e32 v89, v87, v89, 0x4547fdbb | |
v_madak_f32_e32 v90, v87, v90, 0x442158c9 | |
v_madak_f32_e32 v89, v87, v89, 0x44c01759 | |
v_madak_f32_e32 v90, v87, v90, 0x43d9486f | |
v_madak_f32_e32 v89, v87, v89, 0x43a2e571 | |
v_madak_f32_e32 v90, v87, v90, 0x4309a863 | |
v_madak_f32_e32 v89, v87, v89, 0x41f2b459 | |
v_madak_f32_e32 v90, v87, v90, 0x419d35ce | |
v_cndmask_b32_e32 v89, v89, v90, vcc | |
v_mov_b32_e32 v90, 0xbb0df9c0 | |
v_madak_f32_e32 v90, v90, v87, 0x3d1151b3 | |
v_madak_f32_e32 v90, v87, v90, 0xbde31cc2 | |
v_madak_f32_e32 v90, v87, v90, 0x3ea2fe54 | |
v_madak_f32_e32 v90, v87, v90, 0xbebe9208 | |
v_madak_f32_e32 v90, v87, v90, 0x3ed46805 | |
v_madak_f32_e32 v90, v87, v90, 0xbb1acdc6 | |
v_cndmask_b32_e64 v88, v88, v90, s[4:5] | |
v_mov_b32_e32 v90, 0x3c445aa3 | |
v_madak_f32_e32 v90, v90, v87, 0x3c5f6e13 | |
v_madak_f32_e32 v90, v87, v90, 0x3e013307 | |
v_madak_f32_e32 v90, v87, v90, 0x3d931ae7 | |
v_madak_f32_e32 v90, v87, v90, 0x3f0a5785 | |
v_madak_f32_e32 v90, v87, v90, 0x3dd9f331 | |
v_cndmask_b32_e64 v89, v89, v90, s[4:5] | |
v_mov_b32_e32 v90, 0xb684e21a | |
v_madak_f32_e32 v90, v90, v87, 0x390aee49 | |
v_madak_f32_e32 v90, v87, v90, 0x3ba68116 | |
v_madak_f32_e32 v90, v87, v90, 0x3d852a63 | |
v_madak_f32_e32 v90, v87, v90, 0x3ecbbbce | |
v_cndmask_b32_e64 v89, v89, v90, s[8:9] | |
v_mad_f32 v89, v87, v89, 1.0 | |
v_and_b32_e32 v94, s51, v77 | |
v_mov_b32_e32 v95, 0xbf100000 | |
v_cmp_gt_f32_e64 vcc, |v89|, v91 | |
v_mov_b32_e32 v92, 0x2f800000 | |
v_mad_f32 v95, v94, -v94, v95 | |
v_cndmask_b32_e32 v93, 1.0, v92, vcc | |
v_cmp_gt_f32_e32 vcc, 0, v95 | |
v_cndmask_b32_e64 v96, 0.5, -0.5, vcc | |
v_mov_b32_e32 v97, 0x3fb8aa3b | |
v_mac_f32_e32 v96, v97, v95 | |
v_mov_b32_e32 v90, 0xb7c756b1 | |
v_cvt_i32_f32_e32 v96, v96 | |
v_madak_f32_e32 v90, v90, v87, 0xbbbd1489 | |
v_madak_f32_e32 v90, v87, v90, 0xbce9528f | |
v_madak_f32_e32 v90, v87, v90, 0xbea66beb | |
v_mul_f32_e32 v89, v93, v89 | |
v_madak_f32_e32 v87, v87, v90, 0x3e0375d4 | |
v_rcp_f32_e32 v89, v89 | |
v_cvt_f32_i32_e32 v90, v96 | |
v_cndmask_b32_e64 v87, v88, v87, s[8:9] | |
v_mov_b32_e32 v88, 0xbf317180 | |
v_mul_f32_e32 v87, v89, v87 | |
v_mad_f32 v89, v88, v90, v95 | |
v_mov_b32_e32 v98, 0xb717f7d1 | |
v_mad_f32 v99, v98, v90, v89 | |
v_mul_f32_e32 v100, v99, v99 | |
v_mov_b32_e32 v101, 0xb5ddea0e | |
v_mov_b32_e32 v102, 0x3331bb4c | |
v_mad_f32 v103, v102, v100, v101 | |
v_mov_b32_e32 v104, 0x388ab355 | |
v_mad_f32 v103, v103, v100, v104 | |
v_mov_b32_e32 v105, 0xbb360b61 | |
v_mad_f32 v103, v103, v100, v105 | |
v_mov_b32_e32 v106, 0x3e2aaaab | |
v_mad_f32 v103, v103, v100, v106 | |
v_mad_f32 v100, -v100, v103, v99 | |
v_mul_f32_e32 v99, v100, v99 | |
v_sub_f32_e32 v100, 2.0, v100 | |
v_cmp_gt_f32_e64 vcc, |v100|, v91 | |
v_cndmask_b32_e32 v103, 1.0, v92, vcc | |
v_mul_f32_e64 v100, v100, -v103 | |
v_rcp_f32_e32 v100, v100 | |
v_mad_f32 v70, v71, v85, -v70 | |
v_mul_f32_e32 v71, v80, v81 | |
v_mul_f32_e32 v71, v85, v71 | |
v_mul_f32_e32 v99, v100, v99 | |
v_mul_f32_e32 v99, v99, v103 | |
v_mad_f32 v90, -v90, v98, v99 | |
v_subrev_f32_e32 v89, v89, v90 | |
v_lshlrev_b32_e32 v90, 23, v96 | |
v_sub_f32_e32 v89, 1.0, v89 | |
v_add_i32_e32 v89, vcc, v89, v90 | |
v_mov_b32_e32 v90, 0xc2aeac4f | |
v_cmp_nlt_f32_e32 vcc, v95, v90 | |
v_mov_b32_e32 v96, 0x42b17218 | |
v_cndmask_b32_e32 v89, 0, v89, vcc | |
v_cmp_lt_f32_e32 vcc, v95, v96 | |
v_mov_b32_e32 v99, 0x7f800000 | |
v_cndmask_b32_e32 v89, v99, v89, vcc | |
v_cmp_u_f32_e32 vcc, v95, v95 | |
v_cndmask_b32_e32 v89, v89, v95, vcc | |
v_subrev_f32_e32 v95, v84, v94 | |
v_mul_f32_e32 v100, v87, v93 | |
v_add_f32_e32 v94, v84, v94 | |
v_mad_f32 v94, v94, v95, v100 | |
v_cmp_gt_f32_e32 vcc, 0, v94 | |
v_cndmask_b32_e64 v95, 0.5, -0.5, vcc | |
v_mac_f32_e32 v95, v97, v94 | |
v_cvt_i32_f32_e32 v95, v95 | |
v_madak_f32_e32 v87, v93, v87, 0x3f58560b | |
v_mul_f32_e32 v70, v70, v71 | |
v_mac_f32_e32 v70, v86, v75 | |
v_cvt_f32_i32_e32 v97, v95 | |
v_lshlrev_b32_e32 v95, 23, v95 | |
v_mad_f32 v82, -v70, v74, v82 | |
v_mad_f32 v15, v74, v70, v15 | |
v_mad_f32 v88, v88, v97, v94 | |
v_mad_f32 v103, v98, v97, v88 | |
v_mul_f32_e32 v107, v103, v103 | |
v_mac_f32_e32 v101, v102, v107 | |
v_mac_f32_e32 v104, v101, v107 | |
v_mac_f32_e32 v105, v104, v107 | |
v_mac_f32_e32 v106, v105, v107 | |
v_mad_f32 v101, -v107, v106, v103 | |
v_mul_f32_e32 v102, v101, v103 | |
v_sub_f32_e32 v101, 2.0, v101 | |
v_cmp_gt_f32_e64 vcc, |v101|, v91 | |
v_cndmask_b32_e32 v103, 1.0, v92, vcc | |
v_mul_f32_e64 v101, v101, -v103 | |
v_rcp_f32_e32 v101, v101 | |
v_mad_f32 v79, -v70, v73, v79 | |
v_mad_f32 v14, v73, v70, v14 | |
v_mad_f32 v76, -v70, v72, v76 | |
v_mul_f32_e32 v101, v101, v102 | |
v_mul_f32_e32 v101, v101, v103 | |
v_mad_f32 v97, -v97, v98, v101 | |
v_subrev_f32_e32 v88, v88, v97 | |
v_sub_f32_e32 v88, 1.0, v88 | |
v_add_i32_e32 v88, vcc, v88, v95 | |
v_cmp_nlt_f32_e32 vcc, v94, v90 | |
v_cndmask_b32_e32 v88, 0, v88, vcc | |
v_cmp_lt_f32_e32 vcc, v94, v96 | |
v_cndmask_b32_e32 v88, v99, v88, vcc | |
v_cmp_u_f32_e32 vcc, v94, v94 | |
v_cndmask_b32_e32 v88, v88, v94, vcc | |
v_cmp_gt_f32_e64 vcc, |v84|, v91 | |
v_mul_f32_e32 v88, v88, v89 | |
v_cndmask_b32_e32 v89, 1.0, v92, vcc | |
v_mul_f32_e32 v90, v89, v84 | |
v_rcp_f32_e32 v90, v90 | |
v_mov_b32_e32 v91, 0x40c00000 | |
v_cmp_gt_f32_e32 vcc, v91, v84 | |
v_mov_b32_e32 v91, 0x31800000 | |
v_cmp_gt_f32_e64 s[10:11], v91, v84 | |
v_mul_f32_e32 v84, v90, v88 | |
v_mad_f32 v84, -v89, v84, 1.0 | |
v_cndmask_b32_e32 v84, 1.0, v84, vcc | |
v_cndmask_b32_e64 v84, v84, v87, s[4:5] | |
v_and_b32_e32 v87, s52, v77 | |
v_or_b32_e32 v84, v87, v84 | |
v_mad_f32 v87, v100, v77, v77 | |
v_cndmask_b32_e64 v84, v84, v87, s[8:9] | |
v_mul_f32_e32 v87, 0x3f8375d4, v77 | |
v_mac_f32_e32 v87, 0x41000000, v77 | |
v_mul_f32_e32 v87, 0x3e000000, v87 | |
v_cndmask_b32_e64 v84, v84, v87, s[10:11] | |
v_cmp_u_f32_e32 vcc, v77, v77 | |
v_cndmask_b32_e32 v77, v84, v77, vcc | |
v_subrev_f32_e32 v77, v77, v78 | |
v_mul_f32_e64 v78, s19, -v78 | |
v_mac_f32_e32 v78, v77, v83 | |
v_mac_f32_e32 v5, v78, v75 | |
v_mad_f32 v13, v72, v70, v13 | |
v_mac_f32_e32 v68, v0, v67 | |
BB7_86: ; %Flow1235 | |
; in Loop: Header=BB7_11 Depth=1 | |
s_or_b64 exec, exec, s[34:35] | |
BB7_87: ; in Loop: Header=BB7_11 Depth=1 | |
s_or_b64 exec, exec, s[12:13] | |
s_mov_b32 m0, -1 | |
v_cmp_gt_i32_e32 vcc, 3, v2 | |
ds_write_b32 v6, v76 | |
ds_write_b32 v7, v79 | |
ds_write_b32 v12, v82 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[4:5], exec, s[4:5] | |
s_waitcnt lgkmcnt(0) | |
; mask branch BB7_93 | |
s_cbranch_execz BB7_93 | |
BB7_88: ; in Loop: Header=BB7_11 Depth=1 | |
v_lshlrev_b32_e32 v70, 6, v2 | |
v_add_i32_e32 v67, vcc, v11, v70 | |
v_lshlrev_b32_e32 v67, 2, v67 | |
v_add_i32_e32 v71, vcc, s15, v67 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v67, v71 | |
v_add_i32_e32 v72, vcc, 8, v11 | |
v_or_b32_e32 v73, 1, v11 | |
v_cmp_lt_i32_e32 vcc, v73, v72 | |
s_and_saveexec_b64 s[8:9], vcc | |
s_xor_b64 s[8:9], exec, s[8:9] | |
s_waitcnt lgkmcnt(0) | |
; mask branch BB7_90 | |
s_cbranch_execz BB7_90 | |
BB7_89: ; in Loop: Header=BB7_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b32 v[72:73], v71 offset0:1 offset1:2 | |
v_or_b32_e32 v76, 3, v11 | |
v_add_i32_e32 v70, vcc, v76, v70 | |
v_lshlrev_b32_e32 v70, 2, v70 | |
ds_read2_b32 v[74:75], v71 offset0:3 offset1:4 | |
v_add_i32_e32 v70, vcc, s15, v70 | |
ds_read_b32 v77, v71 offset:28 | |
ds_read2_b32 v[70:71], v70 offset0:2 offset1:3 | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v67, v67, v72 | |
v_add_f32_e32 v67, v73, v67 | |
v_add_f32_e32 v67, v74, v67 | |
v_add_f32_e32 v67, v75, v67 | |
v_add_f32_e32 v67, v70, v67 | |
v_add_f32_e32 v67, v71, v67 | |
v_add_f32_e32 v67, v77, v67 | |
BB7_90: ; %._crit_edge.i72 | |
; in Loop: Header=BB7_11 Depth=1 | |
s_or_b64 exec, exec, s[8:9] | |
v_mul_lo_i32 v66, v66, 3 | |
v_mov_b32_e32 v74, s29 | |
s_mov_b64 s[8:9], s[28:29] | |
s_mov_b64 s[10:11], s[46:47] | |
v_add_i32_e32 v70, vcc, v66, v2 | |
v_ashrrev_i32_e32 v71, 31, v70 | |
v_lshl_b64 v[72:73], v[70:71], 2 | |
v_add_i32_e32 v70, vcc, s28, v72 | |
v_addc_u32_e32 v71, vcc, v73, v74, vcc | |
buffer_load_dword v73, v[72:73], s[8:11], 0 addr64 | |
s_mov_b64 s[8:9], 0 | |
s_waitcnt vmcnt(0) | |
BB7_91: ; Parent Loop BB7_11 Depth=1 | |
; => This Inner Loop Header: Depth=2 | |
v_add_f32_e32 v72, v67, v73 | |
v_mov_b32_e32 v75, v73 | |
v_mov_b32_e32 v74, v72 | |
buffer_atomic_cmpswap v[74:75], v[70:71], s[44:47], 0 addr64 glc | |
v_mov_b32_e32 v66, -1 | |
v_mov_b32_e32 v66, 0xf000 | |
s_waitcnt vmcnt(0) expcnt(0) | |
v_cmp_eq_u32_e32 vcc, v74, v73 | |
s_or_b64 s[8:9], vcc, s[8:9] | |
v_mov_b32_e32 v73, v74 | |
s_andn2_b64 exec, exec, s[8:9] | |
s_cbranch_execnz BB7_91 | |
; BB#92: ; %Flow1233 | |
; in Loop: Header=BB7_11 Depth=1 | |
s_or_b64 exec, exec, s[8:9] | |
BB7_93: ; %Flow1234 | |
; in Loop: Header=BB7_11 Depth=1 | |
s_or_b64 exec, exec, s[4:5] | |
BB7_94: ; %Flow1243 | |
; in Loop: Header=BB7_11 Depth=1 | |
s_or_b64 exec, exec, s[30:31] | |
v_and_b32_e32 v66, 0xff0000, v69 | |
v_cmp_ne_u32_e32 vcc, 0, v66 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[30:31], exec, s[4:5] | |
; mask branch BB7_134 | |
s_cbranch_execz BB7_134 | |
BB7_95: ; in Loop: Header=BB7_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read_b32 v77, v54 offset:8 | |
s_mov_b64 s[8:9], s[32:33] | |
s_mov_b64 s[10:11], s[46:47] | |
v_mov_b32_e32 v76, 0 | |
v_mov_b32_e32 v82, v76 | |
s_waitcnt lgkmcnt(0) | |
v_lshlrev_b32_e32 v66, 3, v77 | |
v_add_i32_e32 v66, vcc, v66, v1 | |
v_ashrrev_i32_e32 v67, 31, v66 | |
v_lshl_b64 v[70:71], v[66:67], 4 | |
v_lshl_b64 v[78:79], v[66:67], 3 | |
buffer_load_dwordx4 v[72:75], v[70:71], s[8:11], 0 addr64 | |
s_mov_b64 s[8:9], s[36:37] | |
buffer_load_dwordx2 v[70:71], v[78:79], s[8:11], 0 addr64 | |
v_lshrrev_b32_e32 v67, 16, v69 | |
v_and_b32_e32 v67, 1, v67 | |
v_cmp_eq_u32_e32 vcc, 1, v67 | |
v_mov_b32_e32 v79, v76 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[34:35], exec, s[4:5] | |
; implicit-def: %VGPR83_VGPR84_VGPR85_VGPR86 | |
s_waitcnt vmcnt(0) | |
; mask branch BB7_99 | |
s_cbranch_execz BB7_99 | |
BB7_96: ; in Loop: Header=BB7_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[87:90], v55 offset1:1 | |
s_and_b64 s[4:5], exec, s[0:1] | |
v_cmp_ne_u32_e32 vcc, v41, v77 | |
s_xor_b64 s[4:5], s[4:5], -1 | |
s_or_b64 s[4:5], s[4:5], vcc | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v80, v73, v88 | |
s_and_b64 s[8:9], exec, s[2:3] | |
s_or_b64 s[4:5], s[8:9], s[4:5] | |
v_subrev_f32_e32 v81, v72, v87 | |
v_mul_f32_e32 v83, v80, v80 | |
v_cndmask_b32_e64 v76, 0, 1.0, s[4:5] | |
v_subrev_f32_e32 v78, v74, v89 | |
v_mac_f32_e32 v83, v81, v81 | |
v_mac_f32_e32 v83, v78, v78 | |
v_mul_f32_e32 v76, s26, v76 | |
v_cmp_lt_f32_e32 vcc, v83, v76 | |
v_mov_b32_e32 v76, 0 | |
v_subrev_f32_e32 v67, v75, v90 | |
v_mov_b32_e32 v79, v76 | |
v_mov_b32_e32 v82, v76 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[38:39], exec, s[4:5] | |
; implicit-def: %VGPR83_VGPR84_VGPR85_VGPR86 | |
; mask branch BB7_98 | |
s_cbranch_execz BB7_98 | |
BB7_97: ; in Loop: Header=BB7_11 Depth=1 | |
v_max_f32_e32 v79, 0x34cd15ae, v83 | |
v_rsq_f32_e32 v84, v79 | |
v_lshrrev_b32_e32 v82, 16, v65 | |
v_and_b32_e32 v82, 1, v82 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e64 s[4:5], 1, v82 | |
ds_read_b64 v[82:83], v56 | |
v_mul_f32_e32 v86, v84, v84 | |
v_cndmask_b32_e64 v85, 0, 1.0, s[4:5] | |
v_mul_f32_e32 v87, v86, v86 | |
v_mul_f32_e32 v87, v85, v87 | |
v_mul_f32_e32 v88, v86, v87 | |
v_mad_f32 v87, v87, v86, s42 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v82, v70, v82 | |
v_mul_f32_e32 v87, 0xbe2aaaab, v87 | |
v_mul_f32_e32 v83, v71, v83 | |
v_mad_f32 v89, v88, v88, s43 | |
v_cmp_gt_f32_e32 vcc, s27, v79 | |
v_mul_f32_e32 v89, v89, v83 | |
v_mul_f32_e32 v87, v82, v87 | |
v_mac_f32_e32 v87, 0x3daaaaaa, v89 | |
v_cndmask_b32_e64 v89, 0, 1.0, vcc | |
v_mul_f32_e32 v76, v75, v90 | |
v_mul_f32_e32 v90, v85, v89 | |
v_mac_f32_e32 v8, v87, v90 | |
v_mul_f32_e32 v87, v9, v79 | |
v_mul_f32_e32 v90, v87, v87 | |
v_mov_b32_e32 v91, 0x3a92b707 | |
v_madak_f32_e32 v91, v91, v90, 0x3ded3cb2 | |
v_mov_b32_e32 v92, 0x3c739487 | |
v_madak_f32_e32 v92, v92, v90, 0x3f01e2bc | |
v_mad_f32 v91, v91, v90, 1.0 | |
v_mac_f32_e32 v91, v87, v92 | |
v_mov_b32_e32 v92, 0xb2951928 | |
v_madak_f32_e32 v92, v92, v90, 0xb85ffb93 | |
v_mov_b32_e32 v93, 0x35c55945 | |
v_mul_f32_e32 v79, s18, v79 | |
v_madak_f32_e32 v93, v93, v90, 0x3a83ca0c | |
v_madak_f32_e32 v92, v92, v90, 0xbc9ded90 | |
v_mul_f32_e32 v79, v84, v79 | |
v_madak_f32_e32 v93, v93, v90, 0x3d8eaf3b | |
v_madak_f32_e32 v90, v92, v90, 0xbf409397 | |
v_and_b32_e32 v92, s50, v79 | |
v_mov_b32_e32 v94, 0x3fa00000 | |
v_cmp_gt_f32_e64 s[4:5], v94, v92 | |
v_mul_f32_e32 v94, v92, v92 | |
v_rcp_f32_e32 v95, v94 | |
v_add_f32_e32 v96, -1.0, v92 | |
v_mov_b32_e32 v98, 0xbd777f97 | |
v_mov_b32_e32 v99, 0x4036db6e | |
v_cndmask_b32_e64 v95, v95, v96, s[4:5] | |
v_mov_b32_e32 v96, 0x3f580000 | |
v_cmp_gt_f32_e64 s[8:9], v96, v92 | |
v_cndmask_b32_e64 v94, v95, v94, s[8:9] | |
v_mov_b32_e32 v96, 0xc1b38712 | |
v_madak_f32_e32 v98, v98, v94, 0x40d23f7c | |
v_madak_f32_e32 v96, v96, v94, 0x43ed43a7 | |
v_madak_f32_e32 v98, v94, v98, 0x42d9451f | |
v_madak_f32_e32 v96, v94, v96, 0x451f90ce | |
v_madak_f32_e32 v98, v94, v98, 0x43d6810b | |
v_madak_f32_e32 v96, v94, v96, 0x4547fdbb | |
v_madak_f32_e32 v98, v94, v98, 0x442158c9 | |
v_madak_f32_e32 v96, v94, v96, 0x44c01759 | |
v_madak_f32_e32 v98, v94, v98, 0x43d9486f | |
v_madak_f32_e32 v96, v94, v96, 0x43a2e571 | |
v_madak_f32_e32 v98, v94, v98, 0x4309a863 | |
v_mov_b32_e32 v97, 0xc11d077e | |
v_cmp_gt_f32_e32 vcc, v99, v92 | |
v_madak_f32_e32 v96, v94, v96, 0x41f2b459 | |
v_madak_f32_e32 v98, v94, v98, 0x419d35ce | |
v_mov_b32_e32 v95, 0xc3f1c275 | |
v_madak_f32_e32 v97, v97, v94, 0xc2a2932b | |
v_cndmask_b32_e32 v96, v96, v98, vcc | |
v_mov_b32_e32 v98, 0x3c445aa3 | |
v_madak_f32_e32 v95, v95, v94, 0xc480230b | |
v_madak_f32_e32 v97, v94, v97, 0xc3389ae7 | |
v_madak_f32_e32 v98, v98, v94, 0x3c5f6e13 | |
v_madak_f32_e32 v95, v94, v95, 0xc41f6441 | |
v_madak_f32_e32 v97, v94, v97, 0xc322658c | |
v_madak_f32_e32 v98, v94, v98, 0x3e013307 | |
v_madak_f32_e32 v95, v94, v95, 0xc320a2ea | |
v_madak_f32_e32 v97, v94, v97, 0xc2798057 | |
v_madak_f32_e32 v98, v94, v98, 0x3d931ae7 | |
v_madak_f32_e32 v95, v94, v95, 0xc18e104b | |
v_madak_f32_e32 v97, v94, v97, 0xc128f022 | |
v_madak_f32_e32 v98, v94, v98, 0x3f0a5785 | |
v_madak_f32_e32 v95, v94, v95, 0xbf4c9dd4 | |
v_madak_f32_e32 v97, v94, v97, 0xbf31a0b7 | |
v_madak_f32_e32 v98, v94, v98, 0x3dd9f331 | |
v_cndmask_b32_e64 v96, v96, v98, s[4:5] | |
v_mov_b32_e32 v98, 0xb684e21a | |
v_madak_f32_e32 v95, v94, v95, 0xbc21a092 | |
v_madak_f32_e32 v97, v94, v97, 0xbc21a093 | |
v_madak_f32_e32 v98, v98, v94, 0x390aee49 | |
v_cndmask_b32_e32 v95, v95, v97, vcc | |
v_mov_b32_e32 v97, 0xbb0df9c0 | |
v_madak_f32_e32 v97, v97, v94, 0x3d1151b3 | |
v_madak_f32_e32 v98, v94, v98, 0x3ba68116 | |
v_madak_f32_e32 v97, v94, v97, 0xbde31cc2 | |
v_madak_f32_e32 v98, v94, v98, 0x3d852a63 | |
v_madak_f32_e32 v97, v94, v97, 0x3ea2fe54 | |
v_madak_f32_e32 v98, v94, v98, 0x3ecbbbce | |
v_cndmask_b32_e64 v96, v96, v98, s[8:9] | |
v_madak_f32_e32 v97, v94, v97, 0xbebe9208 | |
v_madak_f32_e32 v97, v94, v97, 0x3ed46805 | |
v_mad_f32 v96, v94, v96, 1.0 | |
v_mov_b32_e32 v98, 0x6f800000 | |
v_madak_f32_e32 v97, v94, v97, 0xbb1acdc6 | |
v_cmp_gt_f32_e64 vcc, |v96|, v98 | |
v_mov_b32_e32 v99, 0x2f800000 | |
v_cndmask_b32_e64 v95, v95, v97, s[4:5] | |
v_mov_b32_e32 v97, 0xb7c756b1 | |
v_cndmask_b32_e32 v100, 1.0, v99, vcc | |
v_madak_f32_e32 v97, v97, v94, 0xbbbd1489 | |
v_mul_f32_e32 v96, v100, v96 | |
v_madak_f32_e32 v97, v94, v97, 0xbce9528f | |
v_rcp_f32_e32 v96, v96 | |
v_madak_f32_e32 v97, v94, v97, 0xbea66beb | |
v_madak_f32_e32 v94, v94, v97, 0x3e0375d4 | |
v_cndmask_b32_e64 v94, v95, v94, s[8:9] | |
v_mul_f32_e32 v94, v96, v94 | |
v_and_b32_e32 v95, s51, v79 | |
v_mov_b32_e32 v96, 0xbf100000 | |
v_mad_f32 v96, v95, -v95, v96 | |
v_cmp_gt_f32_e32 vcc, 0, v96 | |
v_cndmask_b32_e64 v97, 0.5, -0.5, vcc | |
v_mov_b32_e32 v101, 0x3fb8aa3b | |
v_mac_f32_e32 v97, v101, v96 | |
v_cvt_i32_f32_e32 v97, v97 | |
v_subrev_f32_e32 v115, v92, v95 | |
v_mul_f32_e32 v103, v94, v100 | |
v_add_f32_e32 v95, v92, v95 | |
v_mad_f32 v95, v95, v115, v103 | |
v_cmp_gt_f32_e64 s[10:11], 0, v95 | |
v_cndmask_b32_e64 v115, 0.5, -0.5, s[10:11] | |
v_cvt_f32_i32_e32 v102, v97 | |
v_mac_f32_e32 v115, v101, v95 | |
v_cvt_i32_f32_e32 v101, v115 | |
v_mov_b32_e32 v104, 0xbf317180 | |
v_mad_f32 v105, v104, v102, v96 | |
v_mov_b32_e32 v106, 0xb717f7d1 | |
v_mad_f32 v107, v106, v102, v105 | |
v_mul_f32_e32 v108, v107, v107 | |
v_mov_b32_e32 v109, 0xb5ddea0e | |
v_mov_b32_e32 v110, 0x3331bb4c | |
v_cvt_f32_i32_e32 v116, v101 | |
v_mad_f32 v111, v110, v108, v109 | |
v_mov_b32_e32 v112, 0x388ab355 | |
v_mad_f32 v111, v111, v108, v112 | |
v_mov_b32_e32 v113, 0xbb360b61 | |
v_mad_f32 v111, v111, v108, v113 | |
v_mov_b32_e32 v114, 0x3e2aaaab | |
v_mad_f32 v111, v111, v108, v114 | |
v_mac_f32_e32 v90, v87, v93 | |
v_mad_f32 v93, v104, v116, v95 | |
v_mad_f32 v108, -v108, v111, v107 | |
v_mad_f32 v104, v106, v116, v93 | |
v_mul_f32_e32 v87, v108, v107 | |
v_mul_f32_e32 v107, v104, v104 | |
v_sub_f32_e32 v111, 2.0, v108 | |
v_mac_f32_e32 v109, v110, v107 | |
v_cmp_gt_f32_e64 vcc, |v111|, v98 | |
v_mac_f32_e32 v112, v109, v107 | |
v_cndmask_b32_e32 v115, 1.0, v99, vcc | |
v_mac_f32_e32 v113, v112, v107 | |
v_mul_f32_e64 v111, v111, -v115 | |
v_mac_f32_e32 v114, v113, v107 | |
v_rcp_f32_e32 v111, v111 | |
v_mad_f32 v107, -v107, v114, v104 | |
v_sub_f32_e32 v108, 2.0, v107 | |
v_rcp_f32_e32 v91, v91 | |
v_cmp_gt_f32_e64 vcc, |v108|, v98 | |
v_cndmask_b32_e32 v109, 1.0, v99, vcc | |
v_mul_f32_e32 v87, v111, v87 | |
v_mul_f32_e64 v108, v108, -v109 | |
v_mul_f32_e32 v87, v87, v115 | |
v_rcp_f32_e32 v108, v108 | |
v_mad_f32 v87, -v102, v106, v87 | |
v_mul_f32_e32 v91, v37, v91 | |
v_subrev_f32_e32 v87, v105, v87 | |
v_mul_f32_e32 v90, v90, v91 | |
v_mul_f32_e32 v91, v85, v86 | |
v_mul_f32_e32 v102, v107, v104 | |
v_mac_f32_e32 v90, v84, v91 | |
v_lshlrev_b32_e32 v91, 23, v97 | |
v_sub_f32_e32 v87, 1.0, v87 | |
v_mul_f32_e32 v102, v108, v102 | |
v_add_i32_e32 v87, vcc, v87, v91 | |
v_mov_b32_e32 v91, 0xc2aeac4f | |
v_mul_f32_e32 v102, v102, v109 | |
v_cmp_nlt_f32_e32 vcc, v96, v91 | |
v_mov_b32_e32 v97, 0x42b17218 | |
v_mad_f32 v102, -v116, v106, v102 | |
v_cndmask_b32_e32 v87, 0, v87, vcc | |
v_cmp_lt_f32_e32 vcc, v96, v97 | |
v_mov_b32_e32 v104, 0x7f800000 | |
v_subrev_f32_e32 v93, v93, v102 | |
v_cndmask_b32_e32 v87, v104, v87, vcc | |
v_cmp_u_f32_e32 vcc, v96, v96 | |
v_cndmask_b32_e32 v87, v87, v96, vcc | |
v_sub_f32_e32 v93, 1.0, v93 | |
v_lshlrev_b32_e32 v96, 23, v101 | |
v_add_i32_e32 v93, vcc, v93, v96 | |
v_cmp_nlt_f32_e32 vcc, v95, v91 | |
v_cndmask_b32_e32 v91, 0, v93, vcc | |
v_cmp_lt_f32_e32 vcc, v95, v97 | |
v_cndmask_b32_e32 v91, v104, v91, vcc | |
v_cmp_u_f32_e32 vcc, v95, v95 | |
v_cndmask_b32_e32 v91, v91, v95, vcc | |
v_mul_f32_e32 v87, v91, v87 | |
v_mov_b32_e32 v91, 0x40c00000 | |
v_cmp_gt_f32_e64 s[10:11], v91, v92 | |
v_mov_b32_e32 v91, 0x31800000 | |
v_cmp_gt_f32_e64 vcc, |v92|, v98 | |
v_cmp_gt_f32_e64 s[12:13], v91, v92 | |
v_cndmask_b32_e32 v91, 1.0, v99, vcc | |
v_mul_f32_e32 v92, v91, v92 | |
v_rcp_f32_e32 v92, v92 | |
v_cmp_u_f32_e32 vcc, v79, v79 | |
v_mac_f32_e32 v51, v0, v67 | |
v_mul_f32_e32 v87, v92, v87 | |
v_mad_f32 v87, -v91, v87, 1.0 | |
v_madak_f32_e32 v91, v100, v94, 0x3f58560b | |
v_cndmask_b32_e64 v87, 1.0, v87, s[10:11] | |
v_cndmask_b32_e64 v87, v87, v91, s[4:5] | |
v_and_b32_e32 v91, s52, v79 | |
v_or_b32_e32 v87, v91, v87 | |
v_mad_f32 v91, v103, v79, v79 | |
v_cndmask_b32_e64 v87, v87, v91, s[8:9] | |
v_mul_f32_e32 v91, 0x3f8375d4, v79 | |
v_mac_f32_e32 v91, 0x41000000, v79 | |
v_mul_f32_e32 v91, 0x3e000000, v91 | |
v_cndmask_b32_e64 v87, v87, v91, s[12:13] | |
v_cndmask_b32_e32 v79, v87, v79, vcc | |
v_subrev_f32_e32 v79, v79, v85 | |
v_mul_f32_e64 v85, s19, -v85 | |
v_mac_f32_e32 v85, v79, v84 | |
v_mad_f32 v79, v83, v88, -v82 | |
v_mul_f32_e32 v82, v89, v86 | |
v_mul_f32_e32 v82, v88, v82 | |
v_mul_f32_e32 v83, v79, v82 | |
v_mac_f32_e32 v83, v90, v76 | |
v_mac_f32_e32 v5, v85, v76 | |
v_mad_f32 v50, v78, v83, v50 | |
v_mad_f32 v49, v80, v83, v49 | |
v_mad_f32 v48, v81, v83, v48 | |
v_mul_f32_e64 v82, v83, -v78 | |
v_mul_f32_e64 v79, v83, -v80 | |
v_mul_f32_e64 v76, v83, -v81 | |
v_mul_f32_e64 v86, v67, -v0 | |
BB7_98: ; %Flow1231 | |
; in Loop: Header=BB7_11 Depth=1 | |
s_or_b64 exec, exec, s[38:39] | |
BB7_99: ; in Loop: Header=BB7_11 Depth=1 | |
s_or_b64 exec, exec, s[34:35] | |
v_lshrrev_b32_e32 v67, 17, v69 | |
v_and_b32_e32 v67, 1, v67 | |
v_cmp_eq_u32_e32 vcc, 1, v67 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[12:13], exec, s[4:5] | |
; mask branch BB7_103 | |
s_cbranch_execz BB7_103 | |
BB7_100: ; in Loop: Header=BB7_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[87:90], v55 offset0:16 offset1:17 | |
s_and_b64 s[4:5], exec, s[0:1] | |
v_cmp_ne_u32_e32 vcc, v64, v77 | |
s_xor_b64 s[4:5], s[4:5], -1 | |
s_or_b64 s[4:5], s[4:5], vcc | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v80, v73, v88 | |
s_and_b64 s[8:9], exec, s[2:3] | |
s_or_b64 s[4:5], s[8:9], s[4:5] | |
v_subrev_f32_e32 v78, v72, v87 | |
v_mul_f32_e32 v83, v80, v80 | |
v_cndmask_b32_e64 v84, 0, 1.0, s[4:5] | |
v_subrev_f32_e32 v81, v74, v89 | |
v_mac_f32_e32 v83, v78, v78 | |
v_mac_f32_e32 v83, v81, v81 | |
v_mul_f32_e32 v84, s26, v84 | |
v_subrev_f32_e32 v67, v75, v90 | |
v_cmp_lt_f32_e32 vcc, v83, v84 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[34:35], exec, s[4:5] | |
; mask branch BB7_102 | |
s_cbranch_execz BB7_102 | |
BB7_101: ; in Loop: Header=BB7_11 Depth=1 | |
v_max_f32_e32 v85, 0x34cd15ae, v83 | |
v_rsq_f32_e32 v89, v85 | |
v_lshrrev_b32_e32 v83, 17, v65 | |
v_and_b32_e32 v83, 1, v83 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e64 s[4:5], 1, v83 | |
ds_read_b64 v[83:84], v56 offset:64 | |
v_mul_f32_e32 v93, v89, v89 | |
v_cndmask_b32_e64 v88, 0, 1.0, s[4:5] | |
v_mul_f32_e32 v94, v93, v93 | |
v_mul_f32_e32 v94, v88, v94 | |
v_mul_f32_e32 v95, v93, v94 | |
v_mad_f32 v94, v94, v93, s42 | |
v_cmp_gt_f32_e32 vcc, s27, v85 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v83, v70, v83 | |
v_mul_f32_e32 v84, v71, v84 | |
v_mad_f32 v97, v95, v95, s43 | |
v_mul_f32_e32 v94, 0xbe2aaaab, v94 | |
v_mad_f32 v96, v84, v95, -v83 | |
v_mul_f32_e32 v87, v75, v90 | |
v_cndmask_b32_e64 v90, 0, 1.0, vcc | |
v_mul_f32_e32 v84, v97, v84 | |
v_mul_f32_e32 v83, v83, v94 | |
v_mac_f32_e32 v83, 0x3daaaaaa, v84 | |
v_mul_f32_e32 v91, v88, v90 | |
v_mul_f32_e32 v92, v9, v85 | |
v_mul_f32_e32 v84, v90, v93 | |
v_mac_f32_e32 v8, v83, v91 | |
v_mul_f32_e32 v83, v92, v92 | |
v_mov_b32_e32 v90, 0x3a92b707 | |
v_madak_f32_e32 v90, v90, v83, 0x3ded3cb2 | |
v_mov_b32_e32 v91, 0x3c739487 | |
v_madak_f32_e32 v91, v91, v83, 0x3f01e2bc | |
v_mad_f32 v90, v90, v83, 1.0 | |
v_mac_f32_e32 v90, v92, v91 | |
v_mov_b32_e32 v91, 0xb2951928 | |
v_rcp_f32_e32 v90, v90 | |
v_madak_f32_e32 v91, v91, v83, 0xb85ffb93 | |
v_mov_b32_e32 v94, 0x35c55945 | |
v_madak_f32_e32 v94, v94, v83, 0x3a83ca0c | |
v_madak_f32_e32 v91, v91, v83, 0xbc9ded90 | |
v_madak_f32_e32 v94, v94, v83, 0x3d8eaf3b | |
v_madak_f32_e32 v83, v91, v83, 0xbf409397 | |
v_mac_f32_e32 v83, v92, v94 | |
v_mul_f32_e32 v90, v37, v90 | |
v_mul_f32_e32 v84, v95, v84 | |
v_mul_f32_e32 v83, v83, v90 | |
v_mul_f32_e32 v90, v88, v93 | |
v_mac_f32_e32 v83, v89, v90 | |
v_mul_f32_e32 v84, v96, v84 | |
v_mac_f32_e32 v84, v83, v87 | |
v_mul_f32_e32 v83, s18, v85 | |
v_mul_f32_e32 v83, v89, v83 | |
v_and_b32_e32 v85, s50, v83 | |
v_mov_b32_e32 v90, 0x3fa00000 | |
v_cmp_gt_f32_e64 s[4:5], v90, v85 | |
v_mul_f32_e32 v90, v85, v85 | |
v_rcp_f32_e32 v91, v90 | |
v_add_f32_e32 v92, -1.0, v85 | |
v_mov_b32_e32 v94, 0xbd777f97 | |
v_mov_b32_e32 v95, 0x4036db6e | |
v_cndmask_b32_e64 v91, v91, v92, s[4:5] | |
v_mov_b32_e32 v92, 0x3f580000 | |
v_cmp_gt_f32_e64 s[8:9], v92, v85 | |
v_cndmask_b32_e64 v90, v91, v90, s[8:9] | |
v_mov_b32_e32 v92, 0xc1b38712 | |
v_madak_f32_e32 v94, v94, v90, 0x40d23f7c | |
v_madak_f32_e32 v92, v92, v90, 0x43ed43a7 | |
v_madak_f32_e32 v94, v90, v94, 0x42d9451f | |
v_madak_f32_e32 v92, v90, v92, 0x451f90ce | |
v_madak_f32_e32 v94, v90, v94, 0x43d6810b | |
v_madak_f32_e32 v92, v90, v92, 0x4547fdbb | |
v_madak_f32_e32 v94, v90, v94, 0x442158c9 | |
v_madak_f32_e32 v92, v90, v92, 0x44c01759 | |
v_madak_f32_e32 v94, v90, v94, 0x43d9486f | |
v_madak_f32_e32 v92, v90, v92, 0x43a2e571 | |
v_madak_f32_e32 v94, v90, v94, 0x4309a863 | |
v_cmp_gt_f32_e32 vcc, v95, v85 | |
v_madak_f32_e32 v92, v90, v92, 0x41f2b459 | |
v_madak_f32_e32 v94, v90, v94, 0x419d35ce | |
v_cndmask_b32_e32 v92, v92, v94, vcc | |
v_mov_b32_e32 v94, 0x3c445aa3 | |
v_madak_f32_e32 v94, v94, v90, 0x3c5f6e13 | |
v_madak_f32_e32 v94, v90, v94, 0x3e013307 | |
v_madak_f32_e32 v94, v90, v94, 0x3d931ae7 | |
v_madak_f32_e32 v94, v90, v94, 0x3f0a5785 | |
v_mov_b32_e32 v93, 0xc11d077e | |
v_madak_f32_e32 v94, v90, v94, 0x3dd9f331 | |
v_mov_b32_e32 v91, 0xc3f1c275 | |
v_madak_f32_e32 v93, v93, v90, 0xc2a2932b | |
v_cndmask_b32_e64 v92, v92, v94, s[4:5] | |
v_mov_b32_e32 v94, 0xb684e21a | |
v_madak_f32_e32 v91, v91, v90, 0xc480230b | |
v_madak_f32_e32 v93, v90, v93, 0xc3389ae7 | |
v_madak_f32_e32 v94, v94, v90, 0x390aee49 | |
v_madak_f32_e32 v91, v90, v91, 0xc41f6441 | |
v_madak_f32_e32 v93, v90, v93, 0xc322658c | |
v_madak_f32_e32 v94, v90, v94, 0x3ba68116 | |
v_madak_f32_e32 v91, v90, v91, 0xc320a2ea | |
v_madak_f32_e32 v93, v90, v93, 0xc2798057 | |
v_madak_f32_e32 v94, v90, v94, 0x3d852a63 | |
v_madak_f32_e32 v91, v90, v91, 0xc18e104b | |
v_madak_f32_e32 v93, v90, v93, 0xc128f022 | |
v_madak_f32_e32 v94, v90, v94, 0x3ecbbbce | |
v_madak_f32_e32 v91, v90, v91, 0xbf4c9dd4 | |
v_madak_f32_e32 v93, v90, v93, 0xbf31a0b7 | |
v_cndmask_b32_e64 v92, v92, v94, s[8:9] | |
v_and_b32_e32 v94, s51, v83 | |
v_mov_b32_e32 v95, 0xbf100000 | |
v_madak_f32_e32 v91, v90, v91, 0xbc21a092 | |
v_madak_f32_e32 v93, v90, v93, 0xbc21a093 | |
v_mad_f32 v95, v94, -v94, v95 | |
v_cndmask_b32_e32 v91, v91, v93, vcc | |
v_cmp_gt_f32_e32 vcc, 0, v95 | |
v_cndmask_b32_e64 v96, 0.5, -0.5, vcc | |
v_mov_b32_e32 v97, 0x3fb8aa3b | |
v_mac_f32_e32 v96, v97, v95 | |
v_cvt_i32_f32_e32 v96, v96 | |
v_mov_b32_e32 v93, 0xbb0df9c0 | |
v_madak_f32_e32 v93, v93, v90, 0x3d1151b3 | |
v_madak_f32_e32 v93, v90, v93, 0xbde31cc2 | |
v_cvt_f32_i32_e32 v100, v96 | |
v_madak_f32_e32 v93, v90, v93, 0x3ea2fe54 | |
v_mov_b32_e32 v101, 0xbf317180 | |
v_madak_f32_e32 v93, v90, v93, 0xbebe9208 | |
v_mad_f32 v102, v101, v100, v95 | |
v_mov_b32_e32 v103, 0xb717f7d1 | |
v_madak_f32_e32 v93, v90, v93, 0x3ed46805 | |
v_mad_f32 v104, v103, v100, v102 | |
v_madak_f32_e32 v93, v90, v93, 0xbb1acdc6 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mov_b32_e32 v106, 0xb5ddea0e | |
v_mov_b32_e32 v107, 0x3331bb4c | |
v_cndmask_b32_e64 v91, v91, v93, s[4:5] | |
v_mov_b32_e32 v93, 0xb7c756b1 | |
v_mad_f32 v108, v107, v105, v106 | |
v_mov_b32_e32 v109, 0x388ab355 | |
v_madak_f32_e32 v93, v93, v90, 0xbbbd1489 | |
v_mad_f32 v108, v108, v105, v109 | |
v_mov_b32_e32 v110, 0xbb360b61 | |
v_madak_f32_e32 v93, v90, v93, 0xbce9528f | |
v_mad_f32 v108, v108, v105, v110 | |
v_mov_b32_e32 v111, 0x3e2aaaab | |
v_madak_f32_e32 v93, v90, v93, 0xbea66beb | |
v_mad_f32 v108, v108, v105, v111 | |
v_madak_f32_e32 v93, v90, v93, 0x3e0375d4 | |
v_mad_f32 v105, -v105, v108, v104 | |
v_mad_f32 v90, v90, v92, 1.0 | |
v_mov_b32_e32 v92, 0x6f800000 | |
v_cmp_gt_f32_e64 vcc, |v90|, v92 | |
v_mov_b32_e32 v98, 0x2f800000 | |
v_sub_f32_e32 v108, 2.0, v105 | |
v_cndmask_b32_e32 v99, 1.0, v98, vcc | |
v_cmp_gt_f32_e64 vcc, |v108|, v92 | |
v_mul_f32_e32 v90, v99, v90 | |
v_cndmask_b32_e32 v112, 1.0, v98, vcc | |
v_mul_f32_e64 v108, v108, -v112 | |
v_rcp_f32_e32 v90, v90 | |
v_rcp_f32_e32 v108, v108 | |
v_cndmask_b32_e64 v91, v91, v93, s[8:9] | |
v_lshlrev_b32_e32 v93, 23, v96 | |
v_mul_f32_e32 v90, v90, v91 | |
v_mul_f32_e32 v91, v105, v104 | |
v_mul_f32_e32 v91, v108, v91 | |
v_mul_f32_e32 v91, v91, v112 | |
v_mad_f32 v91, -v100, v103, v91 | |
v_subrev_f32_e32 v91, v102, v91 | |
v_sub_f32_e32 v91, 1.0, v91 | |
v_add_i32_e32 v91, vcc, v91, v93 | |
v_mov_b32_e32 v93, 0xc2aeac4f | |
v_cmp_nlt_f32_e32 vcc, v95, v93 | |
v_mov_b32_e32 v96, 0x42b17218 | |
v_cndmask_b32_e32 v91, 0, v91, vcc | |
v_cmp_lt_f32_e32 vcc, v95, v96 | |
v_mov_b32_e32 v100, 0x7f800000 | |
v_cndmask_b32_e32 v91, v100, v91, vcc | |
v_cmp_u_f32_e32 vcc, v95, v95 | |
v_cndmask_b32_e32 v91, v91, v95, vcc | |
v_subrev_f32_e32 v95, v85, v94 | |
v_mul_f32_e32 v102, v90, v99 | |
v_add_f32_e32 v94, v85, v94 | |
v_mad_f32 v94, v94, v95, v102 | |
v_cmp_gt_f32_e32 vcc, 0, v94 | |
v_cndmask_b32_e64 v95, 0.5, -0.5, vcc | |
v_mac_f32_e32 v95, v97, v94 | |
v_cvt_i32_f32_e32 v95, v95 | |
v_madak_f32_e32 v90, v99, v90, 0x3f58560b | |
v_mad_f32 v82, -v84, v81, v82 | |
v_mad_f32 v44, v81, v84, v44 | |
v_cvt_f32_i32_e32 v97, v95 | |
v_lshlrev_b32_e32 v95, 23, v95 | |
v_mad_f32 v79, -v84, v80, v79 | |
v_mad_f32 v43, v80, v84, v43 | |
v_mad_f32 v101, v101, v97, v94 | |
v_mad_f32 v104, v103, v97, v101 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mac_f32_e32 v106, v107, v105 | |
v_mac_f32_e32 v109, v106, v105 | |
v_mac_f32_e32 v110, v109, v105 | |
v_mac_f32_e32 v111, v110, v105 | |
v_mad_f32 v105, -v105, v111, v104 | |
v_mul_f32_e32 v104, v105, v104 | |
v_sub_f32_e32 v105, 2.0, v105 | |
v_cmp_gt_f32_e64 vcc, |v105|, v92 | |
v_cndmask_b32_e32 v106, 1.0, v98, vcc | |
v_mul_f32_e64 v105, v105, -v106 | |
v_rcp_f32_e32 v105, v105 | |
v_mad_f32 v76, -v84, v78, v76 | |
v_mad_f32 v42, v78, v84, v42 | |
v_mad_f32 v86, -v67, v0, v86 | |
v_mul_f32_e32 v104, v105, v104 | |
v_mul_f32_e32 v104, v104, v106 | |
v_mad_f32 v97, -v97, v103, v104 | |
v_subrev_f32_e32 v97, v101, v97 | |
v_sub_f32_e32 v97, 1.0, v97 | |
v_add_i32_e32 v95, vcc, v97, v95 | |
v_cmp_nlt_f32_e32 vcc, v94, v93 | |
v_cndmask_b32_e32 v93, 0, v95, vcc | |
v_cmp_lt_f32_e32 vcc, v94, v96 | |
v_cndmask_b32_e32 v93, v100, v93, vcc | |
v_cmp_gt_f32_e64 vcc, |v85|, v92 | |
v_cndmask_b32_e32 v92, 1.0, v98, vcc | |
v_mul_f32_e32 v95, v92, v85 | |
v_rcp_f32_e32 v95, v95 | |
v_cmp_u_f32_e32 vcc, v94, v94 | |
v_cndmask_b32_e32 v93, v93, v94, vcc | |
v_mul_f32_e32 v91, v93, v91 | |
v_mov_b32_e32 v93, 0x40c00000 | |
v_cmp_gt_f32_e32 vcc, v93, v85 | |
v_mov_b32_e32 v93, 0x31800000 | |
v_mul_f32_e32 v91, v95, v91 | |
v_cmp_gt_f32_e64 s[10:11], v93, v85 | |
v_mad_f32 v85, -v92, v91, 1.0 | |
v_cndmask_b32_e32 v85, 1.0, v85, vcc | |
v_cndmask_b32_e64 v85, v85, v90, s[4:5] | |
v_and_b32_e32 v90, s52, v83 | |
v_or_b32_e32 v85, v90, v85 | |
v_mad_f32 v90, v102, v83, v83 | |
v_cndmask_b32_e64 v85, v85, v90, s[8:9] | |
v_mul_f32_e32 v90, 0x3f8375d4, v83 | |
v_mac_f32_e32 v90, 0x41000000, v83 | |
v_mul_f32_e32 v90, 0x3e000000, v90 | |
v_cndmask_b32_e64 v85, v85, v90, s[10:11] | |
v_cmp_u_f32_e32 vcc, v83, v83 | |
v_cndmask_b32_e32 v83, v85, v83, vcc | |
v_subrev_f32_e32 v83, v83, v88 | |
v_mul_f32_e64 v85, s19, -v88 | |
v_mac_f32_e32 v85, v83, v89 | |
v_mac_f32_e32 v5, v85, v87 | |
v_mac_f32_e32 v45, v0, v67 | |
BB7_102: ; %Flow1230 | |
; in Loop: Header=BB7_11 Depth=1 | |
s_or_b64 exec, exec, s[34:35] | |
BB7_103: ; in Loop: Header=BB7_11 Depth=1 | |
s_or_b64 exec, exec, s[12:13] | |
v_lshrrev_b32_e32 v67, 18, v69 | |
v_and_b32_e32 v67, 1, v67 | |
v_cmp_eq_u32_e32 vcc, 1, v67 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[12:13], exec, s[4:5] | |
; mask branch BB7_107 | |
s_cbranch_execz BB7_107 | |
BB7_104: ; in Loop: Header=BB7_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[87:90], v55 offset0:32 offset1:33 | |
s_and_b64 s[4:5], exec, s[0:1] | |
v_cmp_ne_u32_e32 vcc, v63, v77 | |
s_xor_b64 s[4:5], s[4:5], -1 | |
s_or_b64 s[4:5], s[4:5], vcc | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v80, v73, v88 | |
s_and_b64 s[8:9], exec, s[2:3] | |
s_or_b64 s[4:5], s[8:9], s[4:5] | |
v_subrev_f32_e32 v78, v72, v87 | |
v_mul_f32_e32 v83, v80, v80 | |
v_cndmask_b32_e64 v84, 0, 1.0, s[4:5] | |
v_subrev_f32_e32 v81, v74, v89 | |
v_mac_f32_e32 v83, v78, v78 | |
v_mac_f32_e32 v83, v81, v81 | |
v_mul_f32_e32 v84, s26, v84 | |
v_subrev_f32_e32 v67, v75, v90 | |
v_cmp_lt_f32_e32 vcc, v83, v84 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[34:35], exec, s[4:5] | |
; mask branch BB7_106 | |
s_cbranch_execz BB7_106 | |
BB7_105: ; in Loop: Header=BB7_11 Depth=1 | |
v_max_f32_e32 v85, 0x34cd15ae, v83 | |
v_rsq_f32_e32 v89, v85 | |
v_lshrrev_b32_e32 v83, 18, v65 | |
v_and_b32_e32 v83, 1, v83 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e64 s[4:5], 1, v83 | |
ds_read_b64 v[83:84], v56 offset:128 | |
v_mul_f32_e32 v93, v89, v89 | |
v_cndmask_b32_e64 v88, 0, 1.0, s[4:5] | |
v_mul_f32_e32 v94, v93, v93 | |
v_mul_f32_e32 v94, v88, v94 | |
v_mul_f32_e32 v95, v93, v94 | |
v_mad_f32 v94, v94, v93, s42 | |
v_cmp_gt_f32_e32 vcc, s27, v85 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v83, v70, v83 | |
v_mul_f32_e32 v84, v71, v84 | |
v_mad_f32 v97, v95, v95, s43 | |
v_mul_f32_e32 v94, 0xbe2aaaab, v94 | |
v_mad_f32 v96, v84, v95, -v83 | |
v_mul_f32_e32 v87, v75, v90 | |
v_cndmask_b32_e64 v90, 0, 1.0, vcc | |
v_mul_f32_e32 v84, v97, v84 | |
v_mul_f32_e32 v83, v83, v94 | |
v_mac_f32_e32 v83, 0x3daaaaaa, v84 | |
v_mul_f32_e32 v91, v88, v90 | |
v_mul_f32_e32 v92, v9, v85 | |
v_mul_f32_e32 v84, v90, v93 | |
v_mac_f32_e32 v8, v83, v91 | |
v_mul_f32_e32 v83, v92, v92 | |
v_mov_b32_e32 v90, 0x3a92b707 | |
v_madak_f32_e32 v90, v90, v83, 0x3ded3cb2 | |
v_mov_b32_e32 v91, 0x3c739487 | |
v_madak_f32_e32 v91, v91, v83, 0x3f01e2bc | |
v_mad_f32 v90, v90, v83, 1.0 | |
v_mac_f32_e32 v90, v92, v91 | |
v_mov_b32_e32 v91, 0xb2951928 | |
v_rcp_f32_e32 v90, v90 | |
v_madak_f32_e32 v91, v91, v83, 0xb85ffb93 | |
v_mov_b32_e32 v94, 0x35c55945 | |
v_madak_f32_e32 v94, v94, v83, 0x3a83ca0c | |
v_madak_f32_e32 v91, v91, v83, 0xbc9ded90 | |
v_madak_f32_e32 v94, v94, v83, 0x3d8eaf3b | |
v_madak_f32_e32 v83, v91, v83, 0xbf409397 | |
v_mac_f32_e32 v83, v92, v94 | |
v_mul_f32_e32 v90, v37, v90 | |
v_mul_f32_e32 v84, v95, v84 | |
v_mul_f32_e32 v83, v83, v90 | |
v_mul_f32_e32 v90, v88, v93 | |
v_mac_f32_e32 v83, v89, v90 | |
v_mul_f32_e32 v84, v96, v84 | |
v_mac_f32_e32 v84, v83, v87 | |
v_mul_f32_e32 v83, s18, v85 | |
v_mul_f32_e32 v83, v89, v83 | |
v_and_b32_e32 v85, s50, v83 | |
v_mov_b32_e32 v90, 0x3fa00000 | |
v_cmp_gt_f32_e64 s[4:5], v90, v85 | |
v_mul_f32_e32 v90, v85, v85 | |
v_rcp_f32_e32 v91, v90 | |
v_add_f32_e32 v92, -1.0, v85 | |
v_mov_b32_e32 v94, 0xbd777f97 | |
v_mov_b32_e32 v95, 0x4036db6e | |
v_cndmask_b32_e64 v91, v91, v92, s[4:5] | |
v_mov_b32_e32 v92, 0x3f580000 | |
v_cmp_gt_f32_e64 s[8:9], v92, v85 | |
v_cndmask_b32_e64 v90, v91, v90, s[8:9] | |
v_mov_b32_e32 v92, 0xc1b38712 | |
v_madak_f32_e32 v94, v94, v90, 0x40d23f7c | |
v_madak_f32_e32 v92, v92, v90, 0x43ed43a7 | |
v_madak_f32_e32 v94, v90, v94, 0x42d9451f | |
v_madak_f32_e32 v92, v90, v92, 0x451f90ce | |
v_madak_f32_e32 v94, v90, v94, 0x43d6810b | |
v_madak_f32_e32 v92, v90, v92, 0x4547fdbb | |
v_madak_f32_e32 v94, v90, v94, 0x442158c9 | |
v_madak_f32_e32 v92, v90, v92, 0x44c01759 | |
v_madak_f32_e32 v94, v90, v94, 0x43d9486f | |
v_madak_f32_e32 v92, v90, v92, 0x43a2e571 | |
v_madak_f32_e32 v94, v90, v94, 0x4309a863 | |
v_cmp_gt_f32_e32 vcc, v95, v85 | |
v_madak_f32_e32 v92, v90, v92, 0x41f2b459 | |
v_madak_f32_e32 v94, v90, v94, 0x419d35ce | |
v_cndmask_b32_e32 v92, v92, v94, vcc | |
v_mov_b32_e32 v94, 0x3c445aa3 | |
v_madak_f32_e32 v94, v94, v90, 0x3c5f6e13 | |
v_madak_f32_e32 v94, v90, v94, 0x3e013307 | |
v_madak_f32_e32 v94, v90, v94, 0x3d931ae7 | |
v_madak_f32_e32 v94, v90, v94, 0x3f0a5785 | |
v_mov_b32_e32 v93, 0xc11d077e | |
v_madak_f32_e32 v94, v90, v94, 0x3dd9f331 | |
v_mov_b32_e32 v91, 0xc3f1c275 | |
v_madak_f32_e32 v93, v93, v90, 0xc2a2932b | |
v_cndmask_b32_e64 v92, v92, v94, s[4:5] | |
v_mov_b32_e32 v94, 0xb684e21a | |
v_madak_f32_e32 v91, v91, v90, 0xc480230b | |
v_madak_f32_e32 v93, v90, v93, 0xc3389ae7 | |
v_madak_f32_e32 v94, v94, v90, 0x390aee49 | |
v_madak_f32_e32 v91, v90, v91, 0xc41f6441 | |
v_madak_f32_e32 v93, v90, v93, 0xc322658c | |
v_madak_f32_e32 v94, v90, v94, 0x3ba68116 | |
v_madak_f32_e32 v91, v90, v91, 0xc320a2ea | |
v_madak_f32_e32 v93, v90, v93, 0xc2798057 | |
v_madak_f32_e32 v94, v90, v94, 0x3d852a63 | |
v_madak_f32_e32 v91, v90, v91, 0xc18e104b | |
v_madak_f32_e32 v93, v90, v93, 0xc128f022 | |
v_madak_f32_e32 v94, v90, v94, 0x3ecbbbce | |
v_madak_f32_e32 v91, v90, v91, 0xbf4c9dd4 | |
v_madak_f32_e32 v93, v90, v93, 0xbf31a0b7 | |
v_cndmask_b32_e64 v92, v92, v94, s[8:9] | |
v_and_b32_e32 v94, s51, v83 | |
v_mov_b32_e32 v95, 0xbf100000 | |
v_madak_f32_e32 v91, v90, v91, 0xbc21a092 | |
v_madak_f32_e32 v93, v90, v93, 0xbc21a093 | |
v_mad_f32 v95, v94, -v94, v95 | |
v_cndmask_b32_e32 v91, v91, v93, vcc | |
v_cmp_gt_f32_e32 vcc, 0, v95 | |
v_cndmask_b32_e64 v96, 0.5, -0.5, vcc | |
v_mov_b32_e32 v97, 0x3fb8aa3b | |
v_mac_f32_e32 v96, v97, v95 | |
v_cvt_i32_f32_e32 v96, v96 | |
v_mov_b32_e32 v93, 0xbb0df9c0 | |
v_madak_f32_e32 v93, v93, v90, 0x3d1151b3 | |
v_madak_f32_e32 v93, v90, v93, 0xbde31cc2 | |
v_cvt_f32_i32_e32 v100, v96 | |
v_madak_f32_e32 v93, v90, v93, 0x3ea2fe54 | |
v_mov_b32_e32 v101, 0xbf317180 | |
v_madak_f32_e32 v93, v90, v93, 0xbebe9208 | |
v_mad_f32 v102, v101, v100, v95 | |
v_mov_b32_e32 v103, 0xb717f7d1 | |
v_madak_f32_e32 v93, v90, v93, 0x3ed46805 | |
v_mad_f32 v104, v103, v100, v102 | |
v_madak_f32_e32 v93, v90, v93, 0xbb1acdc6 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mov_b32_e32 v106, 0xb5ddea0e | |
v_mov_b32_e32 v107, 0x3331bb4c | |
v_cndmask_b32_e64 v91, v91, v93, s[4:5] | |
v_mov_b32_e32 v93, 0xb7c756b1 | |
v_mad_f32 v108, v107, v105, v106 | |
v_mov_b32_e32 v109, 0x388ab355 | |
v_madak_f32_e32 v93, v93, v90, 0xbbbd1489 | |
v_mad_f32 v108, v108, v105, v109 | |
v_mov_b32_e32 v110, 0xbb360b61 | |
v_madak_f32_e32 v93, v90, v93, 0xbce9528f | |
v_mad_f32 v108, v108, v105, v110 | |
v_mov_b32_e32 v111, 0x3e2aaaab | |
v_madak_f32_e32 v93, v90, v93, 0xbea66beb | |
v_mad_f32 v108, v108, v105, v111 | |
v_madak_f32_e32 v93, v90, v93, 0x3e0375d4 | |
v_mad_f32 v105, -v105, v108, v104 | |
v_mad_f32 v90, v90, v92, 1.0 | |
v_mov_b32_e32 v92, 0x6f800000 | |
v_cmp_gt_f32_e64 vcc, |v90|, v92 | |
v_mov_b32_e32 v98, 0x2f800000 | |
v_sub_f32_e32 v108, 2.0, v105 | |
v_cndmask_b32_e32 v99, 1.0, v98, vcc | |
v_cmp_gt_f32_e64 vcc, |v108|, v92 | |
v_mul_f32_e32 v90, v99, v90 | |
v_cndmask_b32_e32 v112, 1.0, v98, vcc | |
v_mul_f32_e64 v108, v108, -v112 | |
v_rcp_f32_e32 v90, v90 | |
v_rcp_f32_e32 v108, v108 | |
v_cndmask_b32_e64 v91, v91, v93, s[8:9] | |
v_lshlrev_b32_e32 v93, 23, v96 | |
v_mul_f32_e32 v90, v90, v91 | |
v_mul_f32_e32 v91, v105, v104 | |
v_mul_f32_e32 v91, v108, v91 | |
v_mul_f32_e32 v91, v91, v112 | |
v_mad_f32 v91, -v100, v103, v91 | |
v_subrev_f32_e32 v91, v102, v91 | |
v_sub_f32_e32 v91, 1.0, v91 | |
v_add_i32_e32 v91, vcc, v91, v93 | |
v_mov_b32_e32 v93, 0xc2aeac4f | |
v_cmp_nlt_f32_e32 vcc, v95, v93 | |
v_mov_b32_e32 v96, 0x42b17218 | |
v_cndmask_b32_e32 v91, 0, v91, vcc | |
v_cmp_lt_f32_e32 vcc, v95, v96 | |
v_mov_b32_e32 v100, 0x7f800000 | |
v_cndmask_b32_e32 v91, v100, v91, vcc | |
v_cmp_u_f32_e32 vcc, v95, v95 | |
v_cndmask_b32_e32 v91, v91, v95, vcc | |
v_subrev_f32_e32 v95, v85, v94 | |
v_mul_f32_e32 v102, v90, v99 | |
v_add_f32_e32 v94, v85, v94 | |
v_mad_f32 v94, v94, v95, v102 | |
v_cmp_gt_f32_e32 vcc, 0, v94 | |
v_cndmask_b32_e64 v95, 0.5, -0.5, vcc | |
v_mac_f32_e32 v95, v97, v94 | |
v_cvt_i32_f32_e32 v95, v95 | |
v_madak_f32_e32 v90, v99, v90, 0x3f58560b | |
v_mad_f32 v82, -v84, v81, v82 | |
v_mad_f32 v34, v81, v84, v34 | |
v_cvt_f32_i32_e32 v97, v95 | |
v_lshlrev_b32_e32 v95, 23, v95 | |
v_mad_f32 v79, -v84, v80, v79 | |
v_mad_f32 v33, v80, v84, v33 | |
v_mad_f32 v101, v101, v97, v94 | |
v_mad_f32 v104, v103, v97, v101 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mac_f32_e32 v106, v107, v105 | |
v_mac_f32_e32 v109, v106, v105 | |
v_mac_f32_e32 v110, v109, v105 | |
v_mac_f32_e32 v111, v110, v105 | |
v_mad_f32 v105, -v105, v111, v104 | |
v_mul_f32_e32 v104, v105, v104 | |
v_sub_f32_e32 v105, 2.0, v105 | |
v_cmp_gt_f32_e64 vcc, |v105|, v92 | |
v_cndmask_b32_e32 v106, 1.0, v98, vcc | |
v_mul_f32_e64 v105, v105, -v106 | |
v_rcp_f32_e32 v105, v105 | |
v_mad_f32 v76, -v84, v78, v76 | |
v_mad_f32 v32, v78, v84, v32 | |
v_mad_f32 v86, -v67, v0, v86 | |
v_mul_f32_e32 v104, v105, v104 | |
v_mul_f32_e32 v104, v104, v106 | |
v_mad_f32 v97, -v97, v103, v104 | |
v_subrev_f32_e32 v97, v101, v97 | |
v_sub_f32_e32 v97, 1.0, v97 | |
v_add_i32_e32 v95, vcc, v97, v95 | |
v_cmp_nlt_f32_e32 vcc, v94, v93 | |
v_cndmask_b32_e32 v93, 0, v95, vcc | |
v_cmp_lt_f32_e32 vcc, v94, v96 | |
v_cndmask_b32_e32 v93, v100, v93, vcc | |
v_cmp_gt_f32_e64 vcc, |v85|, v92 | |
v_cndmask_b32_e32 v92, 1.0, v98, vcc | |
v_mul_f32_e32 v95, v92, v85 | |
v_rcp_f32_e32 v95, v95 | |
v_cmp_u_f32_e32 vcc, v94, v94 | |
v_cndmask_b32_e32 v93, v93, v94, vcc | |
v_mul_f32_e32 v91, v93, v91 | |
v_mov_b32_e32 v93, 0x40c00000 | |
v_cmp_gt_f32_e32 vcc, v93, v85 | |
v_mov_b32_e32 v93, 0x31800000 | |
v_mul_f32_e32 v91, v95, v91 | |
v_cmp_gt_f32_e64 s[10:11], v93, v85 | |
v_mad_f32 v85, -v92, v91, 1.0 | |
v_cndmask_b32_e32 v85, 1.0, v85, vcc | |
v_cndmask_b32_e64 v85, v85, v90, s[4:5] | |
v_and_b32_e32 v90, s52, v83 | |
v_or_b32_e32 v85, v90, v85 | |
v_mad_f32 v90, v102, v83, v83 | |
v_cndmask_b32_e64 v85, v85, v90, s[8:9] | |
v_mul_f32_e32 v90, 0x3f8375d4, v83 | |
v_mac_f32_e32 v90, 0x41000000, v83 | |
v_mul_f32_e32 v90, 0x3e000000, v90 | |
v_cndmask_b32_e64 v85, v85, v90, s[10:11] | |
v_cmp_u_f32_e32 vcc, v83, v83 | |
v_cndmask_b32_e32 v83, v85, v83, vcc | |
v_subrev_f32_e32 v83, v83, v88 | |
v_mul_f32_e64 v85, s19, -v88 | |
v_mac_f32_e32 v85, v83, v89 | |
v_mac_f32_e32 v5, v85, v87 | |
v_mac_f32_e32 v35, v0, v67 | |
BB7_106: ; %Flow1229 | |
; in Loop: Header=BB7_11 Depth=1 | |
s_or_b64 exec, exec, s[34:35] | |
BB7_107: ; in Loop: Header=BB7_11 Depth=1 | |
s_or_b64 exec, exec, s[12:13] | |
v_lshrrev_b32_e32 v67, 19, v69 | |
v_and_b32_e32 v67, 1, v67 | |
v_cmp_eq_u32_e32 vcc, 1, v67 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[12:13], exec, s[4:5] | |
; mask branch BB7_111 | |
s_cbranch_execz BB7_111 | |
BB7_108: ; in Loop: Header=BB7_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[87:90], v55 offset0:48 offset1:49 | |
s_and_b64 s[4:5], exec, s[0:1] | |
v_cmp_ne_u32_e32 vcc, v62, v77 | |
s_xor_b64 s[4:5], s[4:5], -1 | |
s_or_b64 s[4:5], s[4:5], vcc | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v80, v73, v88 | |
s_and_b64 s[8:9], exec, s[2:3] | |
s_or_b64 s[4:5], s[8:9], s[4:5] | |
v_subrev_f32_e32 v78, v72, v87 | |
v_mul_f32_e32 v83, v80, v80 | |
v_cndmask_b32_e64 v84, 0, 1.0, s[4:5] | |
v_subrev_f32_e32 v81, v74, v89 | |
v_mac_f32_e32 v83, v78, v78 | |
v_mac_f32_e32 v83, v81, v81 | |
v_mul_f32_e32 v84, s26, v84 | |
v_subrev_f32_e32 v67, v75, v90 | |
v_cmp_lt_f32_e32 vcc, v83, v84 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[34:35], exec, s[4:5] | |
; mask branch BB7_110 | |
s_cbranch_execz BB7_110 | |
BB7_109: ; in Loop: Header=BB7_11 Depth=1 | |
v_max_f32_e32 v85, 0x34cd15ae, v83 | |
v_rsq_f32_e32 v89, v85 | |
v_lshrrev_b32_e32 v83, 19, v65 | |
v_and_b32_e32 v83, 1, v83 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e64 s[4:5], 1, v83 | |
ds_read_b64 v[83:84], v56 offset:192 | |
v_mul_f32_e32 v93, v89, v89 | |
v_cndmask_b32_e64 v88, 0, 1.0, s[4:5] | |
v_mul_f32_e32 v94, v93, v93 | |
v_mul_f32_e32 v94, v88, v94 | |
v_mul_f32_e32 v95, v93, v94 | |
v_mad_f32 v94, v94, v93, s42 | |
v_cmp_gt_f32_e32 vcc, s27, v85 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v83, v70, v83 | |
v_mul_f32_e32 v84, v71, v84 | |
v_mad_f32 v97, v95, v95, s43 | |
v_mul_f32_e32 v94, 0xbe2aaaab, v94 | |
v_mad_f32 v96, v84, v95, -v83 | |
v_mul_f32_e32 v87, v75, v90 | |
v_cndmask_b32_e64 v90, 0, 1.0, vcc | |
v_mul_f32_e32 v84, v97, v84 | |
v_mul_f32_e32 v83, v83, v94 | |
v_mac_f32_e32 v83, 0x3daaaaaa, v84 | |
v_mul_f32_e32 v91, v88, v90 | |
v_mul_f32_e32 v92, v9, v85 | |
v_mul_f32_e32 v84, v90, v93 | |
v_mac_f32_e32 v8, v83, v91 | |
v_mul_f32_e32 v83, v92, v92 | |
v_mov_b32_e32 v90, 0x3a92b707 | |
v_madak_f32_e32 v90, v90, v83, 0x3ded3cb2 | |
v_mov_b32_e32 v91, 0x3c739487 | |
v_madak_f32_e32 v91, v91, v83, 0x3f01e2bc | |
v_mad_f32 v90, v90, v83, 1.0 | |
v_mac_f32_e32 v90, v92, v91 | |
v_mov_b32_e32 v91, 0xb2951928 | |
v_rcp_f32_e32 v90, v90 | |
v_madak_f32_e32 v91, v91, v83, 0xb85ffb93 | |
v_mov_b32_e32 v94, 0x35c55945 | |
v_madak_f32_e32 v94, v94, v83, 0x3a83ca0c | |
v_madak_f32_e32 v91, v91, v83, 0xbc9ded90 | |
v_madak_f32_e32 v94, v94, v83, 0x3d8eaf3b | |
v_madak_f32_e32 v83, v91, v83, 0xbf409397 | |
v_mac_f32_e32 v83, v92, v94 | |
v_mul_f32_e32 v90, v37, v90 | |
v_mul_f32_e32 v84, v95, v84 | |
v_mul_f32_e32 v83, v83, v90 | |
v_mul_f32_e32 v90, v88, v93 | |
v_mac_f32_e32 v83, v89, v90 | |
v_mul_f32_e32 v84, v96, v84 | |
v_mac_f32_e32 v84, v83, v87 | |
v_mul_f32_e32 v83, s18, v85 | |
v_mul_f32_e32 v83, v89, v83 | |
v_and_b32_e32 v85, s50, v83 | |
v_mov_b32_e32 v90, 0x3fa00000 | |
v_cmp_gt_f32_e64 s[4:5], v90, v85 | |
v_mul_f32_e32 v90, v85, v85 | |
v_rcp_f32_e32 v91, v90 | |
v_add_f32_e32 v92, -1.0, v85 | |
v_mov_b32_e32 v94, 0xbd777f97 | |
v_mov_b32_e32 v95, 0x4036db6e | |
v_cndmask_b32_e64 v91, v91, v92, s[4:5] | |
v_mov_b32_e32 v92, 0x3f580000 | |
v_cmp_gt_f32_e64 s[8:9], v92, v85 | |
v_cndmask_b32_e64 v90, v91, v90, s[8:9] | |
v_mov_b32_e32 v92, 0xc1b38712 | |
v_madak_f32_e32 v94, v94, v90, 0x40d23f7c | |
v_madak_f32_e32 v92, v92, v90, 0x43ed43a7 | |
v_madak_f32_e32 v94, v90, v94, 0x42d9451f | |
v_madak_f32_e32 v92, v90, v92, 0x451f90ce | |
v_madak_f32_e32 v94, v90, v94, 0x43d6810b | |
v_madak_f32_e32 v92, v90, v92, 0x4547fdbb | |
v_madak_f32_e32 v94, v90, v94, 0x442158c9 | |
v_madak_f32_e32 v92, v90, v92, 0x44c01759 | |
v_madak_f32_e32 v94, v90, v94, 0x43d9486f | |
v_madak_f32_e32 v92, v90, v92, 0x43a2e571 | |
v_madak_f32_e32 v94, v90, v94, 0x4309a863 | |
v_cmp_gt_f32_e32 vcc, v95, v85 | |
v_madak_f32_e32 v92, v90, v92, 0x41f2b459 | |
v_madak_f32_e32 v94, v90, v94, 0x419d35ce | |
v_cndmask_b32_e32 v92, v92, v94, vcc | |
v_mov_b32_e32 v94, 0x3c445aa3 | |
v_madak_f32_e32 v94, v94, v90, 0x3c5f6e13 | |
v_madak_f32_e32 v94, v90, v94, 0x3e013307 | |
v_madak_f32_e32 v94, v90, v94, 0x3d931ae7 | |
v_madak_f32_e32 v94, v90, v94, 0x3f0a5785 | |
v_mov_b32_e32 v93, 0xc11d077e | |
v_madak_f32_e32 v94, v90, v94, 0x3dd9f331 | |
v_mov_b32_e32 v91, 0xc3f1c275 | |
v_madak_f32_e32 v93, v93, v90, 0xc2a2932b | |
v_cndmask_b32_e64 v92, v92, v94, s[4:5] | |
v_mov_b32_e32 v94, 0xb684e21a | |
v_madak_f32_e32 v91, v91, v90, 0xc480230b | |
v_madak_f32_e32 v93, v90, v93, 0xc3389ae7 | |
v_madak_f32_e32 v94, v94, v90, 0x390aee49 | |
v_madak_f32_e32 v91, v90, v91, 0xc41f6441 | |
v_madak_f32_e32 v93, v90, v93, 0xc322658c | |
v_madak_f32_e32 v94, v90, v94, 0x3ba68116 | |
v_madak_f32_e32 v91, v90, v91, 0xc320a2ea | |
v_madak_f32_e32 v93, v90, v93, 0xc2798057 | |
v_madak_f32_e32 v94, v90, v94, 0x3d852a63 | |
v_madak_f32_e32 v91, v90, v91, 0xc18e104b | |
v_madak_f32_e32 v93, v90, v93, 0xc128f022 | |
v_madak_f32_e32 v94, v90, v94, 0x3ecbbbce | |
v_madak_f32_e32 v91, v90, v91, 0xbf4c9dd4 | |
v_madak_f32_e32 v93, v90, v93, 0xbf31a0b7 | |
v_cndmask_b32_e64 v92, v92, v94, s[8:9] | |
v_and_b32_e32 v94, s51, v83 | |
v_mov_b32_e32 v95, 0xbf100000 | |
v_madak_f32_e32 v91, v90, v91, 0xbc21a092 | |
v_madak_f32_e32 v93, v90, v93, 0xbc21a093 | |
v_mad_f32 v95, v94, -v94, v95 | |
v_cndmask_b32_e32 v91, v91, v93, vcc | |
v_cmp_gt_f32_e32 vcc, 0, v95 | |
v_cndmask_b32_e64 v96, 0.5, -0.5, vcc | |
v_mov_b32_e32 v97, 0x3fb8aa3b | |
v_mac_f32_e32 v96, v97, v95 | |
v_cvt_i32_f32_e32 v96, v96 | |
v_mov_b32_e32 v93, 0xbb0df9c0 | |
v_madak_f32_e32 v93, v93, v90, 0x3d1151b3 | |
v_madak_f32_e32 v93, v90, v93, 0xbde31cc2 | |
v_cvt_f32_i32_e32 v100, v96 | |
v_madak_f32_e32 v93, v90, v93, 0x3ea2fe54 | |
v_mov_b32_e32 v101, 0xbf317180 | |
v_madak_f32_e32 v93, v90, v93, 0xbebe9208 | |
v_mad_f32 v102, v101, v100, v95 | |
v_mov_b32_e32 v103, 0xb717f7d1 | |
v_madak_f32_e32 v93, v90, v93, 0x3ed46805 | |
v_mad_f32 v104, v103, v100, v102 | |
v_madak_f32_e32 v93, v90, v93, 0xbb1acdc6 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mov_b32_e32 v106, 0xb5ddea0e | |
v_mov_b32_e32 v107, 0x3331bb4c | |
v_cndmask_b32_e64 v91, v91, v93, s[4:5] | |
v_mov_b32_e32 v93, 0xb7c756b1 | |
v_mad_f32 v108, v107, v105, v106 | |
v_mov_b32_e32 v109, 0x388ab355 | |
v_madak_f32_e32 v93, v93, v90, 0xbbbd1489 | |
v_mad_f32 v108, v108, v105, v109 | |
v_mov_b32_e32 v110, 0xbb360b61 | |
v_madak_f32_e32 v93, v90, v93, 0xbce9528f | |
v_mad_f32 v108, v108, v105, v110 | |
v_mov_b32_e32 v111, 0x3e2aaaab | |
v_madak_f32_e32 v93, v90, v93, 0xbea66beb | |
v_mad_f32 v108, v108, v105, v111 | |
v_madak_f32_e32 v93, v90, v93, 0x3e0375d4 | |
v_mad_f32 v105, -v105, v108, v104 | |
v_mad_f32 v90, v90, v92, 1.0 | |
v_mov_b32_e32 v92, 0x6f800000 | |
v_cmp_gt_f32_e64 vcc, |v90|, v92 | |
v_mov_b32_e32 v98, 0x2f800000 | |
v_sub_f32_e32 v108, 2.0, v105 | |
v_cndmask_b32_e32 v99, 1.0, v98, vcc | |
v_cmp_gt_f32_e64 vcc, |v108|, v92 | |
v_mul_f32_e32 v90, v99, v90 | |
v_cndmask_b32_e32 v112, 1.0, v98, vcc | |
v_mul_f32_e64 v108, v108, -v112 | |
v_rcp_f32_e32 v90, v90 | |
v_rcp_f32_e32 v108, v108 | |
v_cndmask_b32_e64 v91, v91, v93, s[8:9] | |
v_lshlrev_b32_e32 v93, 23, v96 | |
v_mul_f32_e32 v90, v90, v91 | |
v_mul_f32_e32 v91, v105, v104 | |
v_mul_f32_e32 v91, v108, v91 | |
v_mul_f32_e32 v91, v91, v112 | |
v_mad_f32 v91, -v100, v103, v91 | |
v_subrev_f32_e32 v91, v102, v91 | |
v_sub_f32_e32 v91, 1.0, v91 | |
v_add_i32_e32 v91, vcc, v91, v93 | |
v_mov_b32_e32 v93, 0xc2aeac4f | |
v_cmp_nlt_f32_e32 vcc, v95, v93 | |
v_mov_b32_e32 v96, 0x42b17218 | |
v_cndmask_b32_e32 v91, 0, v91, vcc | |
v_cmp_lt_f32_e32 vcc, v95, v96 | |
v_mov_b32_e32 v100, 0x7f800000 | |
v_cndmask_b32_e32 v91, v100, v91, vcc | |
v_cmp_u_f32_e32 vcc, v95, v95 | |
v_cndmask_b32_e32 v91, v91, v95, vcc | |
v_subrev_f32_e32 v95, v85, v94 | |
v_mul_f32_e32 v102, v90, v99 | |
v_add_f32_e32 v94, v85, v94 | |
v_mad_f32 v94, v94, v95, v102 | |
v_cmp_gt_f32_e32 vcc, 0, v94 | |
v_cndmask_b32_e64 v95, 0.5, -0.5, vcc | |
v_mac_f32_e32 v95, v97, v94 | |
v_cvt_i32_f32_e32 v95, v95 | |
v_madak_f32_e32 v90, v99, v90, 0x3f58560b | |
v_mad_f32 v82, -v84, v81, v82 | |
v_mad_f32 v30, v81, v84, v30 | |
v_cvt_f32_i32_e32 v97, v95 | |
v_lshlrev_b32_e32 v95, 23, v95 | |
v_mad_f32 v79, -v84, v80, v79 | |
v_mad_f32 v29, v80, v84, v29 | |
v_mad_f32 v101, v101, v97, v94 | |
v_mad_f32 v104, v103, v97, v101 | |
v_mul_f32_e32 v105, v104, v104 | |
v_mac_f32_e32 v106, v107, v105 | |
v_mac_f32_e32 v109, v106, v105 | |
v_mac_f32_e32 v110, v109, v105 | |
v_mac_f32_e32 v111, v110, v105 | |
v_mad_f32 v105, -v105, v111, v104 | |
v_mul_f32_e32 v104, v105, v104 | |
v_sub_f32_e32 v105, 2.0, v105 | |
v_cmp_gt_f32_e64 vcc, |v105|, v92 | |
v_cndmask_b32_e32 v106, 1.0, v98, vcc | |
v_mul_f32_e64 v105, v105, -v106 | |
v_rcp_f32_e32 v105, v105 | |
v_mad_f32 v76, -v84, v78, v76 | |
v_mad_f32 v28, v78, v84, v28 | |
v_mad_f32 v86, -v67, v0, v86 | |
v_mul_f32_e32 v104, v105, v104 | |
v_mul_f32_e32 v104, v104, v106 | |
v_mad_f32 v97, -v97, v103, v104 | |
v_subrev_f32_e32 v97, v101, v97 | |
v_sub_f32_e32 v97, 1.0, v97 | |
v_add_i32_e32 v95, vcc, v97, v95 | |
v_cmp_nlt_f32_e32 vcc, v94, v93 | |
v_cndmask_b32_e32 v93, 0, v95, vcc | |
v_cmp_lt_f32_e32 vcc, v94, v96 | |
v_cndmask_b32_e32 v93, v100, v93, vcc | |
v_cmp_gt_f32_e64 vcc, |v85|, v92 | |
v_cndmask_b32_e32 v92, 1.0, v98, vcc | |
v_mul_f32_e32 v95, v92, v85 | |
v_rcp_f32_e32 v95, v95 | |
v_cmp_u_f32_e32 vcc, v94, v94 | |
v_cndmask_b32_e32 v93, v93, v94, vcc | |
v_mul_f32_e32 v91, v93, v91 | |
v_mov_b32_e32 v93, 0x40c00000 | |
v_cmp_gt_f32_e32 vcc, v93, v85 | |
v_mov_b32_e32 v93, 0x31800000 | |
v_mul_f32_e32 v91, v95, v91 | |
v_cmp_gt_f32_e64 s[10:11], v93, v85 | |
v_mad_f32 v85, -v92, v91, 1.0 | |
v_cndmask_b32_e32 v85, 1.0, v85, vcc | |
v_cndmask_b32_e64 v85, v85, v90, s[4:5] | |
v_and_b32_e32 v90, s52, v83 | |
v_or_b32_e32 v85, v90, v85 | |
v_mad_f32 v90, v102, v83, v83 | |
v_cndmask_b32_e64 v85, v85, v90, s[8:9] | |
v_mul_f32_e32 v90, 0x3f8375d4, v83 | |
v_mac_f32_e32 v90, 0x41000000, v83 | |
v_mul_f32_e32 v90, 0x3e000000, v90 | |
v_cndmask_b32_e64 v85, v85, v90, s[10:11] | |
v_cmp_u_f32_e32 vcc, v83, v83 | |
v_cndmask_b32_e32 v83, v85, v83, vcc | |
v_subrev_f32_e32 v83, v83, v88 | |
v_mul_f32_e64 v85, s19, -v88 | |
v_mac_f32_e32 v85, v83, v89 | |
v_mac_f32_e32 v5, v85, v87 | |
v_mac_f32_e32 v31, v0, v67 | |
BB7_110: ; %Flow1228 | |
; in Loop: Header=BB7_11 Depth=1 | |
s_or_b64 exec, exec, s[34:35] | |
BB7_111: ; in Loop: Header=BB7_11 Depth=1 | |
s_or_b64 exec, exec, s[12:13] | |
v_lshrrev_b32_e32 v67, 20, v69 | |
v_and_b32_e32 v67, 1, v67 | |
v_cmp_eq_u32_e32 vcc, 1, v67 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[12:13], exec, s[4:5] | |
; mask branch BB7_115 | |
s_cbranch_execz BB7_115 | |
BB7_112: ; in Loop: Header=BB7_11 Depth=1 | |
s_mov_b32 m0, -1 | |
ds_read2_b64 v[87:90], v55 offset0:64 offset1:65 | |
s_and_b64 s[4:5], exec, s[0:1] | |
v_cmp_ne_u32_e32 vcc, v61, v77 | |
s_xor_b64 s[4:5], s[4:5], -1 | |
s_or_b64 s[4:5], s[4:5], vcc | |
s_waitcnt lgkmcnt(0) | |
v_subrev_f32_e32 v80, v73, v88 | |
s_and_b64 s[8:9], exec, s[2:3] | |
s_or_b64 s[4:5], s[8:9], s[4:5] | |
v_subrev_f32_e32 v78, v72, v87 | |
v_mul_f32_e32 v83, v80, v80 | |
v_cndmask_b32_e64 v84, 0, 1.0, s[4:5] | |
v_subrev_f32_e32 v81, v74, v89 | |
v_mac_f32_e32 v83, v78, v78 | |
v_mac_f32_e32 v83, v81, v81 | |
v_mul_f32_e32 v84, s26, v84 | |
v_subrev_f32_e32 v67, v75, v90 | |
v_cmp_lt_f32_e32 vcc, v83, v84 | |
s_and_saveexec_b64 s[4:5], vcc | |
s_xor_b64 s[34:35], exec, s[4:5] | |
; mask branch BB7_114 | |
s_cbranch_execz BB7_114 | |
BB7_113: ; in Loop: Header=BB7_11 Depth=1 | |
v_max_f32_e32 v85, 0x34cd15ae, v83 | |
v_rsq_f32_e32 v89, v85 | |
v_lshrrev_b32_e32 v83, 20, v65 | |
v_and_b32_e32 v83, 1, v83 | |
s_mov_b32 m0, -1 | |
v_cmp_eq_u32_e64 s[4:5], 1, v83 | |
ds_read_b64 v[83:84], v56 offset:256 | |
v_mul_f32_e32 v93, v89, v89 | |
v_cndmask_b32_e64 v88, 0, 1.0, s[4:5] | |
v_mul_f32_e32 v94, v93, v93 | |
v_mul_f32_e32 v94, v88, v94 | |
v_mul_f32_e32 v95, v93, v94 | |
v_mad_f32 v94, v94, v93, s42 | |
v_cmp_gt_f32_e32 vcc, s27, v85 | |
s_waitcnt lgkmcnt(0) | |
v_mul_f32_e32 v83, v70, v83 | |
v_mul_f32_e32 v84, v71, v84 | |
v_mad_f32 v97, v95, v95, s43 | |
v_mul_f32_e32 v94, 0xbe2aaaab, v94 | |
v_mad_f32 v96, v84, v95, -v83 | |
v_mul_f32_e32 v87, v75, v90 | |
v_cndmask_b32_e64 v90, 0, 1.0, vcc | |
v_mul_f32_e32 v84, v97, v84 | |
v_mul_f32_e32 v83, v83, v94 | |
v_mac_f32_e32 v83, 0x3daaaaaa, v84 | |
v_mul_f32_e32 v91, v88, v90 | |
v_mul_f32_e32 v92, v9, v85 | |
v_mul_f32_e32 v84, v90, v93 | |
v_mac_f32_e32 v8, v83, v91 | |
v_mul_f32_e32 v83, v92, v92 | |
v_mov_b32_e32 v90, 0x3a92b707 | |
v_madak_f32_e32 v90, v90, v83, 0x3ded3cb2 | |
v_mov_b32_e32 v91, 0x3c739487 | |
v_madak_f32_e32 v91, v91, v83, 0x3f01e2bc | |
v_mad_f32 v90, v90, v83, 1.0 | |
v_mac_f32_e32 v90, v92, v91 | |
v_mov_b32_e32 v91, 0xb2951928 | |
v_rcp_f32_e32 v90, v90 | |
v_madak_f32_e32 v91, v91, v83, 0xb85ffb93 | |
v_mov_b32_e32 v94, 0x35c55945 | |
v_madak_f32_e32 v94, v94, v83, 0x3a83ca0c | |
v_madak_f32_e32 v91, v91, v83, 0xbc9ded90 | |
v_madak_f32_e32 v94, v94, v83, 0x3d8eaf3b | |
v_madak_f32_e32 v83, v91, v83, 0xbf409397 | |
v_mac_f32_e32 v83, v92, v94 | |
v_mul_f32_e32 v90, v37, v90 | |
v_mul_f32_e32 v84, v95, v84 | |
v_mul_f32_e32 v83, v83, v90 | |
v_mul_f32_e32 v90, v88, v93 | |
v_mac_f32_e32 v83, v89, v90 | |
v_mul_f32_e32 v84, v96, v84 | |
v_mac_f32_e32 v84, v83, v87 | |
v_mul_f32_e32 v83, s18, v85 | |
v_mul_f32_e32 v83, v89, v83 | |
v_and_b32_e32 v85, s50, v83 | |
v_mov_b32_e32 v90, 0x3fa00000 | |
v_cmp_gt_f32_e64 s[4:5], v90, v85 | |
v_mul_f32_e32 v90, v85, v85 | |
v_rcp_f32_e32 v91, v90 | |
v_add_f32_e32 v92, -1.0, v85 | |
v_mov_b32_e32 v94, 0xbd777f97 | |
v_mov_b32_e32 v95, 0x4036db6e | |
v_cndmask_b32_e64 v91, v91, v92, s[4:5] | |
v_mov_b32_e32 v92, 0x3f580000 | |
v_cmp_gt_f32_e64 s[8:9], v92, v85 | |
v_cndmask_b32_e64 v90, v91, v90, s[8:9] | |
v_mov_b32_e32 v92, 0xc1b38712 | |
v_madak_f32_e32 v94, v94, v90, 0x40d23f7c | |
v_madak_f32_e32 v92, v92, v90, 0x43ed43a7 | |
v_madak_f32_e32 v94, v90, v94, 0x42d9451f | |
v_madak_f32_ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment