From dd6ae694d679fdb80a408d4cef1af0e25a0dbdef Mon Sep 17 00:00:00 2001
From: Jakob Stoklund Olesen <stoklund@2pi.dk>
Date: Mon, 23 Jan 2012 21:01:15 +0000
Subject: [PATCH] Fix PR11829. PostRA LICM was too aggressive.

This fixes a typo in r148589.

llvm-svn: 148724
---
 lib/CodeGen/MachineLICM.cpp                |   8 +-
 test/CodeGen/ARM/2012-01-23-PostRA-LICM.ll | 105 +++++++++++++++++++++
 2 files changed, 109 insertions(+), 4 deletions(-)
 create mode 100644 test/CodeGen/ARM/2012-01-23-PostRA-LICM.ll
diff --git a/lib/CodeGen/MachineLICM.cpp b/lib/CodeGen/MachineLICM.cpp
index b9028ec6818..49a109e252c 100644
--- a/lib/CodeGen/MachineLICM.cpp
+++ b/lib/CodeGen/MachineLICM.cpp
@@ -462,13 +462,13 @@ void MachineLICM::ProcessMI(MachineInstr *MI,
     // register, then this is not safe.  Two defs is indicated by setting a
     // PhysRegClobbers bit.
     for (const unsigned *AS = TRI->getOverlaps(Reg); *AS; ++AS) {
-      if (PhysRegDefs.test(Reg))
-        PhysRegClobbers.set(Reg);
-      if (PhysRegClobbers.test(Reg))
+      if (PhysRegDefs.test(*AS))
+        PhysRegClobbers.set(*AS);
+      if (PhysRegClobbers.test(*AS))
         // MI defined register is seen defined by another instruction in
         // the loop, it cannot be a LICM candidate.
         RuledOut = true;
-      PhysRegDefs.set(Reg);
+      PhysRegDefs.set(*AS);
     }
   }
 
diff --git a/test/CodeGen/ARM/2012-01-23-PostRA-LICM.ll b/test/CodeGen/ARM/2012-01-23-PostRA-LICM.ll
new file mode 100644
index 00000000000..926daafbb7f
--- /dev/null
+++ b/test/CodeGen/ARM/2012-01-23-PostRA-LICM.ll
@@ -0,0 +1,105 @@
+; RUN: llc < %s -mcpu=cortex-a8 -verify-machineinstrs
+; PR11829
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:64-n32-S64"
+target triple = "armv7-none-linux-gnueabi"
+
+define arm_aapcs_vfpcc void @foo(i8* nocapture %arg) nounwind uwtable align 2 {
+bb:
+  br i1 undef, label %bb1, label %bb2
+
+bb1:                                              ; preds = %bb
+  unreachable
+
+bb2:                                              ; preds = %bb
+  br label %bb3
+
+bb3:                                              ; preds = %bb4, %bb2
+  %tmp = icmp slt i32 undef, undef
+  br i1 %tmp, label %bb4, label %bb67
+
+bb4:                                              ; preds = %bb3
+  %tmp5 = load <4 x i32>* undef, align 16, !tbaa !0
+  %tmp6 = and <4 x i32> %tmp5, <i32 8388607, i32 8388607, i32 8388607, i32 8388607>
+  %tmp7 = or <4 x i32> %tmp6, <i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216>
+  %tmp8 = bitcast <4 x i32> %tmp7 to <4 x float>
+  %tmp9 = fsub <4 x float> %tmp8, bitcast (i128 or (i128 shl (i128 zext (i64 trunc (i128 lshr (i128 bitcast (<4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00> to i128), i128 64) to i64) to i128), i128 64), i128 zext (i64 trunc (i128 bitcast (<4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00> to i128) to i64) to i128)) to <4 x float>)
+  %tmp10 = fmul <4 x float> undef, %tmp9
+  %tmp11 = fadd <4 x float> undef, %tmp10
+  %tmp12 = bitcast <4 x float> zeroinitializer to i128
+  %tmp13 = lshr i128 %tmp12, 64
+  %tmp14 = trunc i128 %tmp13 to i64
+  %tmp15 = insertvalue [2 x i64] undef, i64 %tmp14, 1
+  %tmp16 = call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> %tmp11) nounwind
+  %tmp17 = call <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float> %tmp16, <4 x float> %tmp11) nounwind
+  %tmp18 = fmul <4 x float> %tmp17, %tmp16
+  %tmp19 = call <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float> %tmp18, <4 x float> %tmp11) nounwind
+  %tmp20 = fmul <4 x float> %tmp19, %tmp18
+  %tmp21 = fmul <4 x float> %tmp20, zeroinitializer
+  %tmp22 = call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %tmp21, <4 x float> undef) nounwind
+  call arm_aapcs_vfpcc  void @bar(i8* null, i8* undef, <4 x i32>* undef, [2 x i64] zeroinitializer) nounwind
+  %tmp23 = bitcast <4 x float> %tmp22 to i128
+  %tmp24 = trunc i128 %tmp23 to i64
+  %tmp25 = insertvalue [2 x i64] undef, i64 %tmp24, 0
+  %tmp26 = insertvalue [2 x i64] %tmp25, i64 0, 1
+  %tmp27 = load float* undef, align 4, !tbaa !2
+  %tmp28 = insertelement <4 x float> undef, float %tmp27, i32 3
+  %tmp29 = load <4 x i32>* undef, align 16, !tbaa !0
+  %tmp30 = and <4 x i32> %tmp29, <i32 8388607, i32 8388607, i32 8388607, i32 8388607>
+  %tmp31 = or <4 x i32> %tmp30, <i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216>
+  %tmp32 = bitcast <4 x i32> %tmp31 to <4 x float>
+  %tmp33 = fsub <4 x float> %tmp32, bitcast (i128 or (i128 shl (i128 zext (i64 trunc (i128 lshr (i128 bitcast (<4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00> to i128), i128 64) to i64) to i128), i128 64), i128 zext (i64 trunc (i128 bitcast (<4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00> to i128) to i64) to i128)) to <4 x float>)
+  %tmp34 = call <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float> undef, <4 x float> %tmp28) nounwind
+  %tmp35 = fmul <4 x float> %tmp34, undef
+  %tmp36 = fmul <4 x float> %tmp35, undef
+  %tmp37 = call arm_aapcs_vfpcc  i8* undef(i8* undef) nounwind
+  %tmp38 = load float* undef, align 4, !tbaa !2
+  %tmp39 = insertelement <2 x float> undef, float %tmp38, i32 0
+  %tmp40 = call arm_aapcs_vfpcc  i8* undef(i8* undef) nounwind
+  %tmp41 = load float* undef, align 4, !tbaa !2
+  %tmp42 = insertelement <4 x float> undef, float %tmp41, i32 3
+  %tmp43 = shufflevector <2 x float> %tmp39, <2 x float> undef, <4 x i32> zeroinitializer
+  %tmp44 = fmul <4 x float> %tmp33, %tmp43
+  %tmp45 = fadd <4 x float> %tmp42, %tmp44
+  %tmp46 = fsub <4 x float> %tmp45, undef
+  %tmp47 = fmul <4 x float> %tmp46, %tmp36
+  %tmp48 = fadd <4 x float> undef, %tmp47
+  %tmp49 = call arm_aapcs_vfpcc  i8* undef(i8* undef) nounwind
+  %tmp50 = load float* undef, align 4, !tbaa !2
+  %tmp51 = insertelement <4 x float> undef, float %tmp50, i32 3
+  %tmp52 = call arm_aapcs_vfpcc float* null(i8* undef) nounwind
+  %tmp54 = load float* %tmp52, align 4, !tbaa !2
+  %tmp55 = insertelement <4 x float> undef, float %tmp54, i32 3
+  %tmp56 = fsub <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %tmp22
+  %tmp57 = call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %tmp56, <4 x float> %tmp55) nounwind
+  %tmp58 = fmul <4 x float> undef, %tmp57
+  %tmp59 = fsub <4 x float> %tmp51, %tmp48
+  %tmp60 = fsub <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %tmp58
+  %tmp61 = fmul <4 x float> %tmp59, %tmp60
+  %tmp62 = fadd <4 x float> %tmp48, %tmp61
+  call arm_aapcs_vfpcc  void @baz(i8* undef, i8* undef, [2 x i64] %tmp26, <4 x i32>* undef)
+  %tmp63 = bitcast <4 x float> %tmp62 to i128
+  %tmp64 = lshr i128 %tmp63, 64
+  %tmp65 = trunc i128 %tmp64 to i64
+  %tmp66 = insertvalue [2 x i64] zeroinitializer, i64 %tmp65, 1
+  call arm_aapcs_vfpcc  void @quux(i8* undef, i8* undef, [2 x i64] undef, i8* undef, [2 x i64] %tmp66, i8* undef, i8* undef, [2 x i64] %tmp26, [2 x i64] %tmp15, <4 x i32>* undef)
+  br label %bb3
+
+bb67:                                             ; preds = %bb3
+  ret void
+}
+
+declare arm_aapcs_vfpcc void @bar(i8*, i8*, <4 x i32>*, [2 x i64])
+
+declare arm_aapcs_vfpcc void @baz(i8*, i8* nocapture, [2 x i64], <4 x i32>* nocapture) nounwind uwtable inlinehint align 2
+
+declare arm_aapcs_vfpcc void @quux(i8*, i8*, [2 x i64], i8* nocapture, [2 x i64], i8* nocapture, i8* nocapture, [2 x i64], [2 x i64], <4 x i32>* nocapture) nounwind uwtable inlinehint align 2
+
+declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwind readnone
+
+declare <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float>, <4 x float>) nounwind readnone
+
+declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) nounwind readnone
+
+!0 = metadata !{metadata !"omnipotent char", metadata !1}
+!1 = metadata !{metadata !"Simple C/C++ TBAA", null}
+!2 = metadata !{metadata !"float", metadata !0}