From 6b0d752b7fda3e4721f89878300c83d8bd0a3e4f Mon Sep 17 00:00:00 2001
From: Mirko Brkusanin
Date: Thu, 28 Jan 2021 11:12:12 +0100
Subject: [PATCH] [AMDGPU][GlobalISel] Remove redundant cmp when copying
 constant to vcc

When the source of a copy into a vcc register is a constant, select an
S_MOV_B32/S_MOV_B64 of 0 or -1 directly instead of masking the source and
comparing it against zero with V_CMP_NE_U32.

Differential Revision: https://reviews.llvm.org/D95540
---
 .../AMDGPU/AMDGPUInstructionSelector.cpp      | 31 +++++---
 .../GlobalISel/divergent-control-flow.ll      |  2 +-
 .../AMDGPU/GlobalISel/inst-select-copy.mir    | 79 +++++++++++++++++++
 .../AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll | 16 ++--
 test/CodeGen/AMDGPU/GlobalISel/localizer.ll   |  3 +-
 5 files changed, 109 insertions(+), 22 deletions(-)

diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index bd577a6fb8c..de92ce0e4dd 100644
--- a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -136,20 +136,29 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
       const TargetRegisterClass *SrcRC =
           TRI.getConstrainedRegClassForOperand(Src, *MRI);
 
-      Register MaskedReg = MRI->createVirtualRegister(SrcRC);
+      Optional<ValueAndVReg> ConstVal =
+          getConstantVRegValWithLookThrough(SrcReg, *MRI, true, true);
+      if (ConstVal) {
+        unsigned MovOpc =
+            STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
+        BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
+            .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
+      } else {
+        Register MaskedReg = MRI->createVirtualRegister(SrcRC);
 
-      // We can't trust the high bits at this point, so clear them.
+        // We can't trust the high bits at this point, so clear them.
 
-      // TODO: Skip masking high bits if def is known boolean.
+        // TODO: Skip masking high bits if def is known boolean.
 
-      unsigned AndOpc = TRI.isSGPRClass(SrcRC) ?
-        AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
-      BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
-        .addImm(1)
-        .addReg(SrcReg);
-      BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
-        .addImm(0)
-        .addReg(MaskedReg);
+        unsigned AndOpc =
+            TRI.isSGPRClass(SrcRC) ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
+        BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
+            .addImm(1)
+            .addReg(SrcReg);
+        BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
+            .addImm(0)
+            .addReg(MaskedReg);
+      }
 
       if (!MRI->getRegClassOrNull(SrcReg))
         MRI->setRegClass(SrcReg, SrcRC);
diff --git a/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll b/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
index 51d6ed2292c..4dd010994fc 100644
--- a/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
+++ b/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
@@ -219,7 +219,7 @@ define amdgpu_kernel void @break_loop(i32 %arg) {
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    v_add_u32_e32 v1, 1, v1
 ; CHECK-NEXT:    v_cmp_le_i32_e32 vcc, 0, v1
-; CHECK-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, 1
+; CHECK-NEXT:    s_mov_b64 s[2:3], -1
 ; CHECK-NEXT:    s_cbranch_vccnz BB5_1
 ; CHECK-NEXT:  ; %bb.3: ; %bb4
 ; CHECK-NEXT:    ; in Loop: Header=BB5_2 Depth=1
diff --git a/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy.mir b/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy.mir
index 8983dfec558..d025784307d 100644
--- a/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy.mir
+++ b/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy.mir
@@ -321,3 +321,82 @@ body: |
     S_ENDPGM 0, implicit %0
 
 ...
+
+---
+
+name: copy_s1_to_vcc
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+
+    ; WAVE64-LABEL: name: copy_s1_to_vcc
+    ; WAVE64: liveins: $sgpr0_sgpr1
+    ; WAVE64: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; WAVE64: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
+    ; WAVE64: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, [[COPY1]], implicit-def $scc
+    ; WAVE64: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec
+    ; WAVE64: S_ENDPGM 0, implicit [[V_CMP_NE_U32_e64_]]
+    ; WAVE32-LABEL: name: copy_s1_to_vcc
+    ; WAVE32: liveins: $sgpr0_sgpr1
+    ; WAVE32: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; WAVE32: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
+    ; WAVE32: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, [[COPY1]], implicit-def $scc
+    ; WAVE32: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec
+    ; WAVE32: S_ENDPGM 0, implicit [[V_CMP_NE_U32_e64_]]
+    %0:sgpr(s64) = COPY $sgpr0_sgpr1
+    %1:sgpr(s1) = G_TRUNC %0(s64)
+    %2:vcc(s1) = COPY %1(s1)
+    S_ENDPGM 0, implicit %2
+
+...
+
+---
+
+name: copy_s1_false_to_vcc
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr0
+
+    ; WAVE64-LABEL: name: copy_s1_false_to_vcc
+    ; WAVE64: liveins: $sgpr0
+    ; WAVE64: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; WAVE64: S_ENDPGM 0, implicit [[S_MOV_B64_]]
+    ; WAVE32-LABEL: name: copy_s1_false_to_vcc
+    ; WAVE32: liveins: $sgpr0
+    ; WAVE32: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; WAVE32: S_ENDPGM 0, implicit [[S_MOV_B32_]]
+    %0:sgpr(s1) = G_CONSTANT i1 false
+    %1:vcc(s1) = COPY %0(s1)
+    S_ENDPGM 0, implicit %1
+
+...
+
+---
+
+name: copy_s1_true_to_vcc
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr0
+
+    ; WAVE64-LABEL: name: copy_s1_true_to_vcc
+    ; WAVE64: liveins: $sgpr0
+    ; WAVE64: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
+    ; WAVE64: S_ENDPGM 0, implicit [[S_MOV_B64_]]
+    ; WAVE32-LABEL: name: copy_s1_true_to_vcc
+    ; WAVE32: liveins: $sgpr0
+    ; WAVE32: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+    ; WAVE32: S_ENDPGM 0, implicit [[S_MOV_B32_]]
+    %0:sgpr(s1) = G_CONSTANT i1 true
+    %1:vcc(s1) = COPY %0(s1)
+    S_ENDPGM 0, implicit %1
+
+...
diff --git a/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
index f25ec6c7370..6c9f8575e3d 100644
--- a/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
+++ b/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
@@ -672,7 +672,7 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_false_cond_to_vcc(float addrspa
 ; GFX7-NEXT:    s_load_dword s2, s[0:1], 0x13
 ; GFX7-NEXT:    s_load_dword s3, s[0:1], 0x1c
 ; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x25
-; GFX7-NEXT:    v_cmp_ne_u32_e64 vcc, 0, 0
+; GFX7-NEXT:    s_mov_b64 vcc, 0
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s2
@@ -688,7 +688,7 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_false_cond_to_vcc(float addrspa
 ; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x4c
 ; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x70
 ; GFX8-NEXT:    s_load_dword s4, s[0:1], 0x94
-; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, 0
+; GFX8-NEXT:    s_mov_b64 vcc, 0
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
@@ -707,7 +707,7 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_false_cond_to_vcc(float addrspa
 ; GFX10_W32-NEXT:    s_load_dword s6, s[0:1], 0x94
 ; GFX10_W32-NEXT:    s_load_dword s4, s[0:1], 0x4c
 ; GFX10_W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, 0
+; GFX10_W32-NEXT:    s_mov_b32 vcc_lo, 0
 ; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s6
@@ -723,7 +723,7 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_false_cond_to_vcc(float addrspa
 ; GFX10_W64-NEXT:    s_load_dword s6, s[0:1], 0x94
 ; GFX10_W64-NEXT:    s_load_dword s4, s[0:1], 0x4c
 ; GFX10_W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, 0
+; GFX10_W64-NEXT:    s_mov_b64 vcc, 0
 ; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v1, s6
@@ -743,7 +743,7 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspac
 ; GFX7-NEXT:    s_load_dword s2, s[0:1], 0x13
 ; GFX7-NEXT:    s_load_dword s3, s[0:1], 0x1c
 ; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x25
-; GFX7-NEXT:    v_cmp_ne_u32_e64 vcc, 0, 1
+; GFX7-NEXT:    s_mov_b64 vcc, -1
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s2
@@ -759,7 +759,7 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspac
 ; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x4c
 ; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x70
 ; GFX8-NEXT:    s_load_dword s4, s[0:1], 0x94
-; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, 1
+; GFX8-NEXT:    s_mov_b64 vcc, -1
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
@@ -778,7 +778,7 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspac
 ; GFX10_W32-NEXT:    s_load_dword s6, s[0:1], 0x94
 ; GFX10_W32-NEXT:    s_load_dword s4, s[0:1], 0x4c
 ; GFX10_W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, 1
+; GFX10_W32-NEXT:    s_mov_b32 vcc_lo, -1
 ; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s6
@@ -794,7 +794,7 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspac
 ; GFX10_W64-NEXT:    s_load_dword s6, s[0:1], 0x94
 ; GFX10_W64-NEXT:    s_load_dword s4, s[0:1], 0x4c
 ; GFX10_W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, 1
+; GFX10_W64-NEXT:    s_mov_b64 vcc, -1
 ; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v1, s6
diff --git a/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
index a2d8d05c80d..0472cd5d404 100644
--- a/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
+++ b/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
@@ -170,8 +170,7 @@ define void @localize_internal_globals(i1 %cond) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, 1
-; GFX9-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    s_xor_b64 s[4:5], vcc, -1
 ; GFX9-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
 ; GFX9-NEXT:    s_cbranch_execz BB2_2
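
Note (not part of the patch): the kind of input affected is a constant i1 that
must be copied into vcc, as in the test_div_fmas_f32_imm_*_cond_to_vcc tests
above. The reduced IR below is a hypothetical example (the function name and
kernel signature are mine, not taken from the tests); with this change
GlobalISel selects "s_mov_b64 vcc, -1" (or "s_mov_b32 vcc_lo, -1" on wave32)
for it instead of "v_cmp_ne_u32_e64 vcc, 0, 1".

; Hypothetical reduced input (not from the patch): a constant i1 condition
; feeding llvm.amdgcn.div.fmas.f32 is copied into vcc during selection.
define amdgpu_kernel void @div_fmas_constant_cond(float addrspace(1)* %out, float %a, float %b, float %c) {
  %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 true)
  store float %result, float addrspace(1)* %out
  ret void
}

declare float @llvm.amdgcn.div.fmas.f32(float, float, float, i1)

Compiling something like this with llc -global-isel -march=amdgcn and an
appropriate -mcpu should show the s_mov in the selected code; the RUN lines in
llvm.amdgcn.div.fmas.ll remain the authoritative invocations.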