AMDGPU: Fix immediate folding logic when shrinking instructions

If the literal is being folded into src0, it doesn't matter if it's an SGPR because it's being replaced with the literal. Also fixes initially selecting 32-bit versions of some instructions which also confused commuting. llvm-svn: 281117
2024-11-23 03:02:36 +01:00 · 2016-09-09 23:32:53 +00:00 · 2016-09-09 23:32:53 +00:00 · 33e593fd71
commit 33e593fd71
parent 5e10e55c23
10 changed files with 43 additions and 39 deletions
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@ -1852,13 +1852,13 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
  case AMDGPU::S_SUB_U32: return AMDGPU::V_SUB_I32_e32;
  case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
  case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32;
-  case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e32;
-  case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e32;
-  case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e32;
-  case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e32;
-  case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e32;
-  case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e32;
-  case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e32;
+  case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
+  case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
+  case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
+  case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
+  case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
+  case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
+  case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
  case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
  case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64;
  case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
--- a/lib/Target/AMDGPU/SIInstructions.td
+++ b/lib/Target/AMDGPU/SIInstructions.td
@ -1871,7 +1871,7 @@ def : Pat <

 def : Pat <
  (fneg (fabs f32:$src)),
-  (S_OR_B32 $src, 0x80000000) // Set sign bit
+  (S_OR_B32 $src, (S_MOV_B32 0x80000000)) // Set sign bit
 >;

 // FIXME: Should use S_OR_B32
--- a/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@ -134,7 +134,6 @@ static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,

  assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI));

-  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
  MachineOperand &Src0 = MI.getOperand(Src0Idx);

@ -144,12 +143,6 @@ static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
      TII->isLiteralConstant(Src0, TII->getOpSize(MI, Src0Idx)))
    return;

-  // Literal constants and SGPRs can only be used in Src0, so if Src0 is an
-  // SGPR, we cannot commute the instruction, so we can't fold any literal
-  // constants.
-  if (Src0.isReg() && !isVGPR(&Src0, TRI, MRI))
-    return;
-
  // Try to fold Src0
  if (Src0.isReg() && MRI.hasOneUse(Src0.getReg())) {
    unsigned Reg = Src0.getReg();
@ -158,7 +151,8 @@ static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
      MachineOperand &MovSrc = Def->getOperand(1);
      bool ConstantFolded = false;

-      if (MovSrc.isImm() && isUInt<32>(MovSrc.getImm())) {
+      if (MovSrc.isImm() && (isInt<32>(MovSrc.getImm()) ||
+                             isUInt<32>(MovSrc.getImm()))) {
        Src0.ChangeToImmediate(MovSrc.getImm());
        ConstantFolded = true;
      }
--- a/test/CodeGen/AMDGPU/ctlz.ll
+++ b/test/CodeGen/AMDGPU/ctlz.ll
@ -143,7 +143,7 @@ define void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind
 ; SI-DAG: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 32, [[FFBH_LO]]
 ; SI-DAG: v_ffbh_u32_e32 [[FFBH_HI:v[0-9]+]], v[[HI]]
 ; SI-DAG: v_cndmask_b32_e64 v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[ADD]], [[CMPHI]]
-; SI-DAG: v_or_b32_e32 [[OR:v[0-9]+]], v[[LO]], v[[HI]]
+; SI-DAG: v_or_b32_e32 [[OR:v[0-9]+]], v[[HI]], v[[LO]]
 ; SI-DAG: v_cmp_eq_i32_e32 vcc, 0, [[OR]]
 ; SI-DAG: v_cndmask_b32_e64 v[[CLTZ_LO:[0-9]+]], v[[CTLZ:[0-9]+]], 64, vcc
 ; SI: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CLTZ_LO]]:[[CTLZ_HI]]{{\]}}
--- a/test/CodeGen/AMDGPU/fneg-fabs.ll
+++ b/test/CodeGen/AMDGPU/fneg-fabs.ll
@ -82,8 +82,10 @@ define void @v_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in)
 ; R600: |{{(PV|T[0-9])\.[XYZW]}}|
 ; R600: -PV

-; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
-; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
+; FIXME: In this case two uses of the constant should be folded
+; SI: s_mov_b32 [[SIGNBITK:s[0-9]+]], 0x80000000
+; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}}
+; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}}
 define void @fneg_fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
  %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
  %fsub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %fabs
@ -92,10 +94,11 @@ define void @fneg_fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
 }

 ; FUNC-LABEL: {{^}}fneg_fabs_v4f32:
-; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
-; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
-; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
-; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
+; SI: s_mov_b32 [[SIGNBITK:s[0-9]+]], 0x80000000
+; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}}
+; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}}
+; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}}
+; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}}
 define void @fneg_fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
  %fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in)
  %fsub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %fabs
--- a/test/CodeGen/AMDGPU/half.ll
+++ b/test/CodeGen/AMDGPU/half.ll
@ -16,7 +16,7 @@ define void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 {
 ; GCN-DAG: buffer_load_ushort [[V0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
 ; GCN-DAG: buffer_load_ushort [[V1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46
 ; GCN: v_lshlrev_b32_e32 [[HI:v[0-9]+]], 16, [[V1]]
-; GCN: v_or_b32_e32 [[PACKED:v[0-9]+]], [[V0]], [[HI]]
+; GCN: v_or_b32_e32 [[PACKED:v[0-9]+]], [[HI]], [[V0]]
 ; GCN: buffer_store_dword [[PACKED]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 ; GCN: s_endpgm
 define void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 {
@ -440,7 +440,7 @@ define void @global_truncstore_f32_to_f16(half addrspace(1)* %out, float addrspa
 ; GCN-DAG: v_cvt_f16_f32_e32 [[CVT0:v[0-9]+]], v[[LO]]
 ; GCN-DAG: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], v[[HI]]
 ; GCN-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[CVT1]]
-; GCN-DAG: v_or_b32_e32 [[PACKED:v[0-9]+]], [[CVT0]], [[SHL]]
+; GCN-DAG: v_or_b32_e32 [[PACKED:v[0-9]+]], [[SHL]], [[CVT0]]
 ; GCN-DAG: buffer_store_dword [[PACKED]]
 ; GCN: s_endpgm
 define void @global_truncstore_v2f32_to_v2f16(<2 x half> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 {
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll
@ -10,9 +10,7 @@ declare double @llvm.amdgcn.rsq.clamp.f64(double) #1
 ; VI: s_load_dword [[SRC:s[0-9]+]]
 ; VI-DAG: v_rsq_f32_e32 [[RSQ:v[0-9]+]], [[SRC]]
 ; VI-DAG: v_min_f32_e32 [[MIN:v[0-9]+]], 0x7f7fffff, [[RSQ]]
-; TODO: this constant should be folded:
-; VI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xff7fffff
-; VI: v_max_f32_e32 [[RESULT:v[0-9]+]], [[MIN]], [[K]]
+; VI: v_max_f32_e32 [[RESULT:v[0-9]+]], 0xff7fffff, [[MIN]]
 ; VI: buffer_store_dword [[RESULT]]
 define void @rsq_clamp_f32(float addrspace(1)* %out, float %src) #0 {
  %rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float %src)
--- a/test/CodeGen/AMDGPU/or.ll
+++ b/test/CodeGen/AMDGPU/or.ll
@ -113,11 +113,9 @@ define void @scalar_vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a,
 }

 ; FUNC-LABEL: {{^}}vector_or_i64_loadimm:
-; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0xdf77987f
-; SI-DAG: s_movk_i32 [[HI_S_IMM:s[0-9]+]], 0x146f
 ; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
-; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
-; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0xdf77987f, v[[LO_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0x146f, v[[HI_VREG]]
 ; SI: s_endpgm
 define void @vector_or_i64_loadimm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
  %loada = load i64, i64 addrspace(1)* %a, align 8
--- a/test/CodeGen/AMDGPU/s_movk_i32.ll
+++ b/test/CodeGen/AMDGPU/s_movk_i32.ll
@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s

 ; SI-LABEL: {{^}}s_movk_i32_k0:
@ -11,6 +11,7 @@ define void @s_movk_i32_k0(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add
  %loada = load i64, i64 addrspace(1)* %a, align 4
  %or = or i64 %loada, 4295032831 ; ((1 << 16) - 1) | (1 << 32)
  store i64 %or, i64 addrspace(1)* %out
+  call void asm sideeffect "; use $0", "s"(i64 4295032831)
  ret void
 }

@ -24,6 +25,7 @@ define void @s_movk_i32_k1(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add
  %loada = load i64, i64 addrspace(1)* %a, align 4
  %or = or i64 %loada, 4295000063 ; ((1 << 15) - 1) | (1 << 32)
  store i64 %or, i64 addrspace(1)* %out
+  call void asm sideeffect "; use $0", "s"(i64 4295000063)
  ret void
 }

@ -37,6 +39,7 @@ define void @s_movk_i32_k2(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add
  %loada = load i64, i64 addrspace(1)* %a, align 4
  %or = or i64 %loada, 274877939711 ; ((1 << 15) - 1) | (64 << 32)
  store i64 %or, i64 addrspace(1)* %out
+  call void asm sideeffect "; use $0", "s"(i64 274877939711)
  ret void
 }

@ -50,6 +53,7 @@ define void @s_movk_i32_k3(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add
  %loada = load i64, i64 addrspace(1)* %a, align 4
  %or = or i64 %loada, 4295000064 ; (1 << 15) | (1 << 32)
  store i64 %or, i64 addrspace(1)* %out
+  call void asm sideeffect "; use $0", "s"(i64 4295000064)
  ret void
 }

@ -63,6 +67,7 @@ define void @s_movk_i32_k4(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add
  %loada = load i64, i64 addrspace(1)* %a, align 4
  %or = or i64 %loada, 4295098368 ; (1 << 17) | (1 << 32)
  store i64 %or, i64 addrspace(1)* %out
+  call void asm sideeffect "; use $0", "s"(i64 4295098368)
  ret void
 }

@ -77,6 +82,7 @@ define void @s_movk_i32_k5(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add
  %loada = load i64, i64 addrspace(1)* %a, align 4
  %or = or i64 %loada, 18374967954648334319 ; -17 & 0xff00ffffffffffff
  store i64 %or, i64 addrspace(1)* %out
+  call void asm sideeffect "; use $0", "s"(i64 18374967954648334319)
  ret void
 }

@ -90,6 +96,7 @@ define void @s_movk_i32_k6(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add
  %loada = load i64, i64 addrspace(1)* %a, align 4
  %or = or i64 %loada, 270582939713 ; 65 | (63 << 32)
  store i64 %or, i64 addrspace(1)* %out
+  call void asm sideeffect "; use $0", "s"(i64 270582939713)
  ret void
 }

@ -104,10 +111,10 @@ define void @s_movk_i32_k7(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add
  %loada = load i64, i64 addrspace(1)* %a, align 4
  %or = or i64 %loada, 70368744185856; ((1 << 13)) | ((1 << 14) << 32)
  store i64 %or, i64 addrspace(1)* %out
+  call void asm sideeffect "; use $0", "s"(i64 70368744185856)
  ret void
 }

-
 ; SI-LABEL: {{^}}s_movk_i32_k8:
 ; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x8000{{$}}
 ; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}}
@ -119,6 +126,7 @@ define void @s_movk_i32_k8(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add
  %loada = load i64, i64 addrspace(1)* %a, align 4
  %or = or i64 %loada, 1229782942255906816 ; 0x11111111ffff8000
  store i64 %or, i64 addrspace(1)* %out
+  call void asm sideeffect "; use $0", "s"(i64 1229782942255906816)
  ret void
 }

@ -133,6 +141,7 @@ define void @s_movk_i32_k9(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add
  %loada = load i64, i64 addrspace(1)* %a, align 4
  %or = or i64 %loada, 1229782942255906817 ; 0x11111111ffff8001
  store i64 %or, i64 addrspace(1)* %out
+  call void asm sideeffect "; use $0", "s"(i64 1229782942255906817)
  ret void
 }

@ -147,6 +156,7 @@ define void @s_movk_i32_k10(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 ad
  %loada = load i64, i64 addrspace(1)* %a, align 4
  %or = or i64 %loada, 1229782942255909000 ; 0x11111111ffff8888
  store i64 %or, i64 addrspace(1)* %out
+  call void asm sideeffect "; use $0", "s"(i64 1229782942255909000)
  ret void
 }

@ -161,6 +171,7 @@ define void @s_movk_i32_k11(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 ad
  %loada = load i64, i64 addrspace(1)* %a, align 4
  %or = or i64 %loada, 1229782942255910911 ; 0x11111111ffff8fff
  store i64 %or, i64 addrspace(1)* %out
+  call void asm sideeffect "; use $0", "s"(i64 1229782942255910911)
  ret void
 }

@ -175,5 +186,6 @@ define void @s_movk_i32_k12(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 ad
  %loada = load i64, i64 addrspace(1)* %a, align 4
  %or = or i64 %loada, 1229782942255902721 ; 0x11111111ffff7001
  store i64 %or, i64 addrspace(1)* %out
+  call void asm sideeffect "; use $0", "s"(i64 1229782942255902721)
  ret void
 }
--- a/test/CodeGen/AMDGPU/si-literal-folding.ll
+++ b/test/CodeGen/AMDGPU/si-literal-folding.ll
@ -1,9 +1,8 @@
-; XFAIL: *
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s
-
-; CHECK-LABEL: {{^}}main:
-; CHECK-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0xbf4353f8
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

+; GCN-LABEL: {{^}}main:
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x3f4353f8, v{{[0-9]+}}
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0xbf4353f8, v{{[0-9]+}}
 define amdgpu_vs void @main(float) {
 main_body:
  %1 = fmul float %0, 0x3FE86A7F00000000