diff --git a/include/llvm/Target/TargetRegisterInfo.h b/include/llvm/Target/TargetRegisterInfo.h
index 93187dd4d96..0bb58ba78e1 100644
--- a/include/llvm/Target/TargetRegisterInfo.h
+++ b/include/llvm/Target/TargetRegisterInfo.h
@@ -502,6 +502,15 @@ public:
   getMatchingSuperRegClass(const TargetRegisterClass *A,
                            const TargetRegisterClass *B, unsigned Idx) const;
 
+  // For a copy-like instruction that defines a register of class DefRC with
+  // subreg index DefSubReg, reading from another source with class SrcRC and
+  // subregister SrcSubReg, return true if the copy is worth rewriting to use
+  // this source, or false if an earlier source should be looked for instead.
+  virtual bool shouldRewriteCopySrc(const TargetRegisterClass *DefRC,
+                                    unsigned DefSubReg,
+                                    const TargetRegisterClass *SrcRC,
+                                    unsigned SrcSubReg) const;
+
   /// getSubClassWithSubReg - Returns the largest legal sub-class of RC that
   /// supports the sub-register index Idx.
   /// If no such sub-class exists, return NULL.
diff --git a/lib/CodeGen/PeepholeOptimizer.cpp b/lib/CodeGen/PeepholeOptimizer.cpp
index 0122e6e9e38..4ad7041d329 100644
--- a/lib/CodeGen/PeepholeOptimizer.cpp
+++ b/lib/CodeGen/PeepholeOptimizer.cpp
@@ -577,36 +577,6 @@ bool PeepholeOptimizer::optimizeCondBranch(MachineInstr *MI) {
   return TII->optimizeCondBranch(MI);
 }
 
-/// \brief Check if the registers defined by the pair (RegisterClass, SubReg)
-/// share the same register file.
-static bool shareSameRegisterFile(const TargetRegisterInfo &TRI,
-                                  const TargetRegisterClass *DefRC,
-                                  unsigned DefSubReg,
-                                  const TargetRegisterClass *SrcRC,
-                                  unsigned SrcSubReg) {
-  // Same register class.
-  if (DefRC == SrcRC)
-    return true;
-
-  // Both operands are sub registers. Check if they share a register class.
-  unsigned SrcIdx, DefIdx;
-  if (SrcSubReg && DefSubReg)
-    return TRI.getCommonSuperRegClass(SrcRC, SrcSubReg, DefRC, DefSubReg,
-                                      SrcIdx, DefIdx) != nullptr;
-  // At most one of the register is a sub register, make it Src to avoid
-  // duplicating the test.
-  if (!SrcSubReg) {
-    std::swap(DefSubReg, SrcSubReg);
-    std::swap(DefRC, SrcRC);
-  }
-
-  // One of the register is a sub register, check if we can get a superclass.
-  if (SrcSubReg)
-    return TRI.getMatchingSuperRegClass(SrcRC, DefRC, SrcSubReg) != nullptr;
-  // Plain copy.
-  return TRI.getCommonSubClass(DefRC, SrcRC) != nullptr;
-}
-
 /// \brief Try to find the next source that share the same register file
 /// for the value defined by \p Reg and \p SubReg.
 /// When true is returned, the \p RewriteMap can be used by the client to
@@ -687,10 +657,8 @@ bool PeepholeOptimizer::findNextSource(unsigned Reg, unsigned SubReg,
       return false;
     const TargetRegisterClass *SrcRC = MRI->getRegClass(CurSrcPair.Reg);
-
-    // If this source does not incur a cross register bank copy, use it.
-    ShouldRewrite = shareSameRegisterFile(*TRI, DefRC, SubReg, SrcRC,
-                                          CurSrcPair.SubReg);
+    ShouldRewrite = TRI->shouldRewriteCopySrc(DefRC, SubReg, SrcRC,
+                                              CurSrcPair.SubReg);
   } while (!ShouldRewrite);
 
   // Continue looking for new sources...
diff --git a/lib/CodeGen/TargetRegisterInfo.cpp b/lib/CodeGen/TargetRegisterInfo.cpp
index 2532eaf2ca0..baa62a216ea 100644
--- a/lib/CodeGen/TargetRegisterInfo.cpp
+++ b/lib/CodeGen/TargetRegisterInfo.cpp
@@ -265,6 +265,47 @@ getCommonSuperRegClass(const TargetRegisterClass *RCA, unsigned SubA,
   return BestRC;
 }
 
+/// \brief Check if the registers defined by the pair (RegisterClass, SubReg)
+/// share the same register file.
+static bool shareSameRegisterFile(const TargetRegisterInfo &TRI,
+                                  const TargetRegisterClass *DefRC,
+                                  unsigned DefSubReg,
+                                  const TargetRegisterClass *SrcRC,
+                                  unsigned SrcSubReg) {
+  // Same register class.
+  if (DefRC == SrcRC)
+    return true;
+
+  // Both operands are sub registers. Check if they share a register class.
+  unsigned SrcIdx, DefIdx;
+  if (SrcSubReg && DefSubReg) {
+    return TRI.getCommonSuperRegClass(SrcRC, SrcSubReg, DefRC, DefSubReg,
+                                      SrcIdx, DefIdx) != nullptr;
+  }
+
+  // At most one of the registers is a sub register; make it Src to avoid
+  // duplicating the test.
+  if (!SrcSubReg) {
+    std::swap(DefSubReg, SrcSubReg);
+    std::swap(DefRC, SrcRC);
+  }
+
+  // One of the registers is a sub register; check if we can get a superclass.
+  if (SrcSubReg)
+    return TRI.getMatchingSuperRegClass(SrcRC, DefRC, SrcSubReg) != nullptr;
+
+  // Plain copy.
+  return TRI.getCommonSubClass(DefRC, SrcRC) != nullptr;
+}
+
+bool TargetRegisterInfo::shouldRewriteCopySrc(const TargetRegisterClass *DefRC,
+                                              unsigned DefSubReg,
+                                              const TargetRegisterClass *SrcRC,
+                                              unsigned SrcSubReg) const {
+  // If this source does not incur a cross register bank copy, use it.
+  return shareSameRegisterFile(*this, DefRC, DefSubReg, SrcRC, SrcSubReg);
+}
+
 // Compute target-independent register allocator hints to help eliminate copies.
 void TargetRegisterInfo::getRegAllocationHints(unsigned VirtReg,
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp
index c89948bb3c2..380590fa920 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -392,6 +392,30 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
   }
 }
 
+bool SIRegisterInfo::shouldRewriteCopySrc(
+  const TargetRegisterClass *DefRC,
+  unsigned DefSubReg,
+  const TargetRegisterClass *SrcRC,
+  unsigned SrcSubReg) const {
+  // We want to prefer the smallest register class possible, so we don't want to
+  // stop and rewrite on anything that looks like a subregister
+  // extract. Operations mostly don't care about the super register class, so we
+  // only want to stop on the most basic of copies between the same register
+  // class.
+  //
+  // e.g. if we have something like
+  // vreg0 = ...
+  // vreg1 = ...
+  // vreg2 = REG_SEQUENCE vreg0, sub0, vreg1, sub1
+  // vreg3 = COPY vreg2:sub0
+  //
+  // We want to look through the COPY to find:
+  // => vreg3 = COPY vreg0
+
+  // Plain copy.
+  return getCommonSubClass(DefRC, SrcRC) != nullptr;
+}
+
 unsigned SIRegisterInfo::getPhysRegSubReg(unsigned Reg,
                                           const TargetRegisterClass *SubRC,
                                           unsigned Channel) const {
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h
index 84253bf47b6..a70d086cb52 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -75,6 +75,11 @@ public:
   const TargetRegisterClass *getSubRegClass(const TargetRegisterClass *RC,
                                             unsigned SubIdx) const;
 
+  bool shouldRewriteCopySrc(const TargetRegisterClass *DefRC,
+                            unsigned DefSubReg,
+                            const TargetRegisterClass *SrcRC,
+                            unsigned SrcSubReg) const override;
+
   /// \p Channel This is the register channel (e.g. a value from 0-16), not the
   /// SubReg index.
   /// \returns The sub-register of Reg that is in Channel.
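Aside (editorial note, not part of the patch): the hook above is the single point a target has to override to change this heuristic. A minimal sketch of how a hypothetical out-of-tree target might opt in follows; MyTargetRegisterInfo and its policy are invented for illustration, and only the shouldRewriteCopySrc signature, the base-class default, and getCommonSubClass come from the code above.

// Hypothetical illustration of the new target hook. The remaining pure
// virtual TargetRegisterInfo methods are omitted, and the class is never
// instantiated here; this only shows where the override plugs in.
#include "llvm/Target/TargetRegisterInfo.h"

namespace llvm {

class MyTargetRegisterInfo : public TargetRegisterInfo {
public:
  bool shouldRewriteCopySrc(const TargetRegisterClass *DefRC,
                            unsigned DefSubReg,
                            const TargetRegisterClass *SrcRC,
                            unsigned SrcSubReg) const override {
    // For plain (non-subregister) copies, accept any source sharing a
    // common subclass, as SIRegisterInfo does above, so the peephole keeps
    // searching back through subregister extracts.
    if (!DefSubReg && !SrcSubReg)
      return getCommonSubClass(DefRC, SrcRC) != nullptr;

    // Otherwise defer to the conservative default, i.e. the old
    // shareSameRegisterFile cross-register-bank check.
    return TargetRegisterInfo::shouldRewriteCopySrc(DefRC, DefSubReg,
                                                    SrcRC, SrcSubReg);
  }
};

} // end namespace llvm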
diff --git a/test/CodeGen/AMDGPU/and.ll b/test/CodeGen/AMDGPU/and.ll
index 8c7c1bc3626..83f473bacad 100644
--- a/test/CodeGen/AMDGPU/and.ll
+++ b/test/CodeGen/AMDGPU/and.ll
@@ -147,11 +147,24 @@ endif:
   ret void
 }
 
-; FIXME: and 0 should be replaced witht copy
 ; FUNC-LABEL: {{^}}v_and_constant_i64:
+; SI-DAG: s_mov_b32 [[KLO:s[0-9]+]], 0xab19b207
+; SI-DAG: s_movk_i32 [[KHI:s[0-9]+]], 0x11e{{$}}
+; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KLO]], {{v[0-9]+}}
+; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KHI]], {{v[0-9]+}}
+; SI: buffer_store_dwordx2
+define void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
+  %a = load i64, i64 addrspace(1)* %aptr, align 8
+  %and = and i64 %a, 1231231234567
+  store i64 %and, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FIXME: Should replace the 'and 0' with a copy
+; FUNC-LABEL: {{^}}v_and_i64_32_bit_constant:
 ; SI: v_and_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
 ; SI: v_and_b32_e32 {{v[0-9]+}}, 0, {{v[0-9]+}}
-define void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
+define void @v_and_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
   %a = load i64, i64 addrspace(1)* %aptr, align 8
   %and = and i64 %a, 1234567
   store i64 %and, i64 addrspace(1)* %out, align 8
diff --git a/test/CodeGen/AMDGPU/ds_read2_superreg.ll b/test/CodeGen/AMDGPU/ds_read2_superreg.ll
index 842c2d8bc33..6b95c031d04 100644
--- a/test/CodeGen/AMDGPU/ds_read2_superreg.ll
+++ b/test/CodeGen/AMDGPU/ds_read2_superreg.ll
@@ -35,14 +35,11 @@ define void @simple_read2_v2f32_superreg(<2 x float> addrspace(1)* %out) #0 {
   ret void
 }
 
-; FIXME: Shuffling to new superregister
 ; CI-LABEL: {{^}}simple_read2_v4f32_superreg_align4:
-; CI-DAG: ds_read2_b32 v{{\[}}[[REG_W:[0-9]+]]:[[REG_Z:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:3 offset1:2{{$}}
-; CI-DAG: ds_read2_b32 v{{\[}}[[REG_Y:[0-9]+]]:[[REG_X:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}}
-; CI-DAG: v_mov_b32_e32 v[[COPY_REG_Y:[0-9]+]], v[[REG_Y]]
-; CI-DAG: v_mov_b32_e32 v[[COPY_REG_Z:[0-9]+]], v[[REG_Z]]
-; CI-DAG: v_add_f32_e32 v[[ADD0:[0-9]+]], v[[COPY_REG_Z]], v[[REG_X]]
-; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[REG_W]], v[[COPY_REG_Y]]
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_X:[0-9]+]]:[[REG_Y:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}}
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_Z:[0-9]+]]:[[REG_W:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
+; CI-DAG: v_add_f32_e32 v[[ADD0:[0-9]+]], v[[REG_Z]], v[[REG_X]]
+; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[REG_W]], v[[REG_Y]]
 ; CI: v_add_f32_e32 v[[ADD2:[0-9]+]], v[[ADD1]], v[[ADD0]]
 ; CI: buffer_store_dword v[[ADD2]]
 ; CI: s_endpgm
@@ -64,11 +61,15 @@ define void @simple_read2_v4f32_superreg_align4(float addrspace(1)* %out) #0 {
   ret void
 }
 
+
+; FIXME: the v_lshr_b64 x, x, 32 is a bad way of doing a copy
+
 ; CI-LABEL: {{^}}simple_read2_v3f32_superreg_align4:
 ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_X:[0-9]+]]:[[REG_Y:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}}
 ; CI-DAG: ds_read_b32 v[[REG_Z:[0-9]+]], v{{[0-9]+}} offset:8{{$}}
+; CI: v_lshr_b64 v{{\[}}[[Y_COPY:[0-9]+]]:{{[0-9]+\]}}, v{{\[}}[[REG_X]]:[[REG_Y]]{{\]}}, 32
 ; CI-DAG: v_add_f32_e32 v[[ADD0:[0-9]+]], v[[REG_Z]], v[[REG_X]]
-; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[REG_Y]], v[[ADD0]]
+; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[Y_COPY]], v[[ADD0]]
 ; CI: buffer_store_dword v[[ADD1]]
 ; CI: s_endpgm
 define void @simple_read2_v3f32_superreg_align4(float addrspace(1)* %out) #0 {
@@ -140,13 +141,21 @@ define void @simple_read2_v8f32_superreg(<8 x float> addrspace(1)* %out) #0 {
 ; CI-LABEL: {{^}}simple_read2_v16f32_superreg:
 ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:15 offset1:14{{$}}
+; CI-NOT: v_mov_b32
 ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:13 offset1:12{{$}}
+; CI-NOT: v_mov_b32
 ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:11 offset1:10{{$}}
+; CI-NOT: v_mov_b32
 ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:9 offset1:8{{$}}
+; CI-NOT: v_mov_b32
 ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:7 offset1:6{{$}}
+; CI-NOT: v_mov_b32
 ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT5:[0-9]+]]:[[REG_ELT4:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:5 offset1:4{{$}}
+; CI-NOT: v_mov_b32
 ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT3:[0-9]+]]:[[REG_ELT2:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:3 offset1:2{{$}}
+; CI-NOT: v_mov_b32
 ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT1:[0-9]+]]:[[REG_ELT0:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}}
+; CI-NOT: v_mov_b32
 
 ; CI: s_waitcnt lgkmcnt(0)
 ; CI: buffer_store_dword
diff --git a/test/CodeGen/AMDGPU/half.ll b/test/CodeGen/AMDGPU/half.ll
index cd0b1b4e68d..4a3ea5e4a7e 100644
--- a/test/CodeGen/AMDGPU/half.ll
+++ b/test/CodeGen/AMDGPU/half.ll
@@ -391,13 +391,12 @@ define void @global_truncstore_v2f32_to_v2f16(<2 x half> addrspace(1)* %out, <2
   ret void
 }
 
-; FIXME: Shouldn't do 4th conversion
 ; GCN-LABEL: {{^}}global_truncstore_v3f32_to_v3f16:
 ; GCN: buffer_load_dwordx4
 ; GCN: v_cvt_f16_f32_e32
 ; GCN: v_cvt_f16_f32_e32
 ; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
+; GCN-NOT: v_cvt_f16_f32_e32
 ; GCN: buffer_store_short
 ; GCN: buffer_store_dword
 ; GCN: s_endpgm
@@ -476,38 +475,38 @@ define void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace(1)* %out, <8
 ; GCN: buffer_load_dword
 ; GCN: buffer_load_dword
 ; GCN: buffer_load_dword
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_short
 ; GCN: s_endpgm
 define void @global_truncstore_v16f32_to_v16f16(<16 x half> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 {
   %val = load <16 x float>, <16 x float> addrspace(1)* %in
diff --git a/test/CodeGen/AMDGPU/llvm.round.f64.ll b/test/CodeGen/AMDGPU/llvm.round.f64.ll
index 3d0f57e3328..6b365dc09e2 100644
--- a/test/CodeGen/AMDGPU/llvm.round.f64.ll
+++ b/test/CodeGen/AMDGPU/llvm.round.f64.ll
@@ -21,12 +21,9 @@ define void @round_f64(double addrspace(1)* %out, double %x) #0 {
 
 ; SI-DAG: v_cmp_eq_i32
 ; SI-DAG: s_mov_b32 [[BFIMASK:s[0-9]+]], 0x7fffffff
-; SI-DAG: v_cmp_gt_i32_e64
+; SI-DAG: v_cmp_gt_i32_e32
 ; SI-DAG: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[BFIMASK]]
-; SI-DAG: v_cmp_gt_i32_e64
-
-
 ; SI: buffer_store_dwordx2
 ; SI: s_endpgm
 define void @v_round_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
diff --git a/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll b/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll
new file mode 100644
index 00000000000..f4fa6211210
--- /dev/null
+++ b/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll
@@ -0,0 +1,36 @@
+; RUN: llc -march=amdgcn -mcpu=kaveri -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=GCN %s
+
+; Check that when a mubuf addr64 instruction is handled in moveToVALU
+; starting from its pointer operand, dead register writes are not emitted.
+
+; FIXME: We should be able to use the SGPR directly as src0 to v_add_i32
+
+; GCN-LABEL: {{^}}clobber_vgpr_pair_pointer_add:
+; GCN: s_load_dwordx2 s{{\[}}[[ARG1LO:[0-9]+]]:[[ARG1HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
+; GCN: buffer_load_dwordx2 v{{\[}}[[LDPTRLO:[0-9]+]]:[[LDPTRHI:[0-9]+]]{{\]}}
+
+; GCN-NOT: v_mov_b32
+; GCN: v_mov_b32_e32 v[[VARG1LO:[0-9]+]], s[[ARG1LO]]
+; GCN-NEXT: v_mov_b32_e32 v[[VARG1HI:[0-9]+]], s[[ARG1HI]]
+; GCN-NOT: v_mov_b32
+
+; GCN: v_add_i32_e32 v[[PTRLO:[0-9]+]], vcc, v[[LDPTRLO]], v[[VARG1LO]]
+; GCN: v_addc_u32_e32 v[[PTRHI:[0-9]+]], vcc, v[[LDPTRHI]], v[[VARG1HI]]
+; GCN: buffer_load_ubyte v{{[0-9]+}}, v{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}},
+
+define void @clobber_vgpr_pair_pointer_add(i64 %arg1, i8 addrspace(1)* addrspace(1)* %ptrarg, i32 %arg3) #0 {
+bb:
+  %tmp = icmp sgt i32 %arg3, 0
+  br i1 %tmp, label %bb4, label %bb17
+
+bb4:
+  %tmp14 = load volatile i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %ptrarg
+  %tmp15 = getelementptr inbounds i8, i8 addrspace(1)* %tmp14, i64 %arg1
+  %tmp16 = load volatile i8, i8 addrspace(1)* %tmp15
+  br label %bb17
+
+bb17:
+  ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/ARM/vcombine.ll b/test/CodeGen/ARM/vcombine.ll
index 9491c15aef5..fc171889f5f 100644
--- a/test/CodeGen/ARM/vcombine.ll
+++ b/test/CodeGen/ARM/vcombine.ll
@@ -2,11 +2,15 @@
 ; RUN: llc -mtriple=armeb-eabi -float-abi=soft -mattr=+neon %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-BE
 
 define <16 x i8> @vcombine8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
-; CHECK: vcombine8
-; CHECK-LE: vmov r0, r1, d16
-; CHECK-LE: vmov r2, r3, d17
-; CHECK-BE: vmov r1, r0, d16
-; CHECK-BE: vmov r3, r2, d17
+; CHECK-LABEL: vcombine8
+; CHECK-DAG: vldr [[LD0:d[0-9]+]], [r0]
+; CHECK-DAG: vldr [[LD1:d[0-9]+]], [r1]
+
+; CHECK-LE-DAG: vmov r0, r1, [[LD0]]
+; CHECK-LE-DAG: vmov r2, r3, [[LD1]]
+
+; CHECK-BE-DAG: vmov r1, r0, d16
+; CHECK-BE-DAG: vmov r3, r2, d17
 %tmp1 = load <8 x i8>, <8 x i8>* %A
 %tmp2 = load <8 x i8>, <8 x i8>* %B
 %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <16 x i32>
@@ -14,11 +18,15 @@ define <16 x i8> @vcombine8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
 }
 
 define <8 x i16> @vcombine16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
-; CHECK: vcombine16
-; CHECK-LE: vmov r0, r1, d16
-; CHECK-LE: vmov r2, r3, d17
-; CHECK-BE: vmov r1, r0, d16
-; CHECK-BE: vmov r3, r2, d17
+; CHECK-LABEL: vcombine16
+; CHECK-DAG: vldr [[LD0:d[0-9]+]], [r0]
+; CHECK-DAG: vldr [[LD1:d[0-9]+]], [r1]
+
+; CHECK-LE-DAG: vmov r0, r1, [[LD0]]
+; CHECK-LE-DAG: vmov r2, r3, [[LD1]]
+
+; CHECK-BE-DAG: vmov r1, r0, d16
+; CHECK-BE-DAG: vmov r3, r2, d17
 %tmp1 = load <4 x i16>, <4 x i16>* %A
 %tmp2 = load <4 x i16>, <4 x i16>* %B
 %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32>
@@ -26,9 +34,14 @@ define <8 x i16> @vcombine16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
 }
 
 define <4 x i32> @vcombine32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
-; CHECK: vcombine32
-; CHECK-LE: vmov r0, r1, d16
-; CHECK-LE: vmov r2, r3, d17
+; CHECK-LABEL: vcombine32
+
+; CHECK-DAG: vldr [[LD0:d[0-9]+]], [r0]
+; CHECK-DAG: vldr [[LD1:d[0-9]+]], [r1]
+
+; CHECK-LE: vmov r0, r1, [[LD0]]
+; CHECK-LE: vmov r2, r3, [[LD1]]
+
 ; CHECK-BE: vmov r1, r0, d16
 ; CHECK-BE: vmov r3, r2, d17
 %tmp1 = load <2 x i32>, <2 x i32>* %A
@@ -38,9 +51,14 @@ define <4 x i32> @vcombine32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
 }
 
 define <4 x float> @vcombinefloat(<2 x float>* %A, <2 x float>* %B) nounwind {
-; CHECK: vcombinefloat
-; CHECK-LE: vmov r0, r1, d16
-; CHECK-LE: vmov r2, r3, d17
+; CHECK-LABEL: vcombinefloat
+
+; CHECK-DAG: vldr [[LD0:d[0-9]+]], [r0]
+; CHECK-DAG: vldr [[LD1:d[0-9]+]], [r1]
+
+; CHECK-LE: vmov r0, r1, [[LD0]]
+; CHECK-LE: vmov r2, r3, [[LD1]]
+
 ; CHECK-BE: vmov r1, r0, d16
 ; CHECK-BE: vmov r3, r2, d17
 %tmp1 = load <2 x float>, <2 x float>* %A
@@ -50,11 +68,15 @@ define <4 x float> @vcombinefloat(<2 x float>* %A, <2 x float>* %B) nounwind {
 }
 
 define <2 x i64> @vcombine64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
-; CHECK: vcombine64
-; CHECK-LE: vmov r0, r1, d16
-; CHECK-LE: vmov r2, r3, d17
-; CHECK-BE: vmov r1, r0, d16
-; CHECK-BE: vmov r3, r2, d17
+; CHECK-LABEL: vcombine64
+; CHECK-DAG: vldr [[LD0:d[0-9]+]], [r0]
+; CHECK-DAG: vldr [[LD1:d[0-9]+]], [r1]
+
+; CHECK-LE: vmov r0, r1, [[LD0]]
+; CHECK-LE: vmov r2, r3, [[LD1]]
+
+; CHECK-BE: vmov r1, r0, [[LD0]]
+; CHECK-BE: vmov r3, r2, [[LD1]]
 %tmp1 = load <1 x i64>, <1 x i64>* %A
 %tmp2 = load <1 x i64>, <1 x i64>* %B
 %tmp3 = shufflevector <1 x i64> %tmp1, <1 x i64> %tmp2, <2 x i32>
diff --git a/test/CodeGen/ARM/vtrn.ll b/test/CodeGen/ARM/vtrn.ll
index 89af2318bfb..36bcde22731 100644
--- a/test/CodeGen/ARM/vtrn.ll
+++ b/test/CodeGen/ARM/vtrn.ll
@@ -20,11 +20,11 @@ define <8 x i8> @vtrni8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
 define <16 x i8> @vtrni8_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind {
 ; CHECK-LABEL: vtrni8_Qres:
 ; CHECK: @ BB#0:
-; CHECK-NEXT: vldr d17, [r1]
-; CHECK-NEXT: vldr d16, [r0]
-; CHECK-NEXT: vtrn.8 d16, d17
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: vldr [[LDR1:d[0-9]+]], [r1]
+; CHECK-NEXT: vldr [[LDR0:d[0-9]+]], [r0]
+; CHECK-NEXT: vtrn.8 [[LDR0]], [[LDR1]]
+; CHECK-NEXT: vmov r0, r1, [[LDR0]]
+; CHECK-NEXT: vmov r2, r3, [[LDR1]]
 ; CHECK-NEXT: mov pc, lr
 %tmp1 = load <8 x i8>, <8 x i8>* %A
 %tmp2 = load <8 x i8>, <8 x i8>* %B
@@ -52,11 +52,11 @@ define <4 x i16> @vtrni16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
 define <8 x i16> @vtrni16_Qres(<4 x i16>* %A, <4 x i16>* %B) nounwind {
 ; CHECK-LABEL: vtrni16_Qres:
 ; CHECK: @ BB#0:
-; CHECK-NEXT: vldr d17, [r1]
-; CHECK-NEXT: vldr d16, [r0]
-; CHECK-NEXT: vtrn.16 d16, d17
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: vldr [[LDR1:d[0-9]+]], [r1]
+; CHECK-NEXT: vldr [[LDR0:d[0-9]+]], [r0]
+; CHECK-NEXT: vtrn.16 [[LDR0]], [[LDR1]]
+; CHECK-NEXT: vmov r0, r1, [[LDR0]]
+; CHECK-NEXT: vmov r2, r3, [[LDR1]]
 ; CHECK-NEXT: mov pc, lr
 %tmp1 = load <4 x i16>, <4 x i16>* %A
 %tmp2 = load <4 x i16>, <4 x i16>* %B
@@ -84,11 +84,11 @@ define <2 x i32> @vtrni32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
 define <4 x i32> @vtrni32_Qres(<2 x i32>* %A, <2 x i32>* %B) nounwind {
 ; CHECK-LABEL: vtrni32_Qres:
 ; CHECK: @ BB#0:
-; CHECK-NEXT: vldr d17, [r1]
-; CHECK-NEXT: vldr d16, [r0]
-; CHECK-NEXT: vtrn.32 d16, d17
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: vldr [[LDR1:d[0-9]+]], [r1]
+; CHECK-NEXT: vldr [[LDR0:d[0-9]+]], [r0]
+; CHECK-NEXT: vtrn.32 [[LDR0]], [[LDR1]]
+; CHECK-NEXT: vmov r0, r1, [[LDR0]]
+; CHECK-NEXT: vmov r2, r3, [[LDR1]]
 ; CHECK-NEXT: mov pc, lr
 %tmp1 = load <2 x i32>, <2 x i32>* %A
 %tmp2 = load <2 x i32>, <2 x i32>* %B
@@ -116,11 +116,11 @@ define <2 x float> @vtrnf(<2 x float>* %A, <2 x float>* %B) nounwind {
 define <4 x float> @vtrnf_Qres(<2 x float>* %A, <2 x float>* %B) nounwind {
 ; CHECK-LABEL: vtrnf_Qres:
 ; CHECK: @ BB#0:
-; CHECK-NEXT: vldr d17, [r1]
-; CHECK-NEXT: vldr d16, [r0]
-; CHECK-NEXT: vtrn.32 d16, d17
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: vldr [[LDR1:d[0-9]+]], [r1]
+; CHECK-NEXT: vldr [[LDR0:d[0-9]+]], [r0]
+; CHECK-NEXT: vtrn.32 [[LDR0]], [[LDR1]]
+; CHECK-NEXT: vmov r0, r1, [[LDR0]]
+; CHECK-NEXT: vmov r2, r3, [[LDR1]]
 ; CHECK-NEXT: mov pc, lr
 %tmp1 = load <2 x float>, <2 x float>* %A
 %tmp2 = load <2 x float>, <2 x float>* %B
@@ -281,11 +281,11 @@ define <8 x i8> @vtrni8_undef(<8 x i8>* %A, <8 x i8>* %B) nounwind {
 define <16 x i8> @vtrni8_undef_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind {
 ; CHECK-LABEL: vtrni8_undef_Qres:
 ; CHECK: @ BB#0:
-; CHECK-NEXT: vldr d17, [r1]
-; CHECK-NEXT: vldr d16, [r0]
-; CHECK-NEXT: vtrn.8 d16, d17
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: vldr [[LDR1:d[0-9]+]], [r1]
+; CHECK-NEXT: vldr [[LDR0:d[0-9]+]], [r0]
+; CHECK-NEXT: vtrn.8 [[LDR0]], [[LDR1]]
+; CHECK-NEXT: vmov r0, r1, [[LDR0]]
+; CHECK-NEXT: vmov r2, r3, [[LDR1]]
 ; CHECK-NEXT: mov pc, lr
 %tmp1 = load <8 x i8>, <8 x i8>* %A
 %tmp2 = load <8 x i8>, <8 x i8>* %B
diff --git a/test/CodeGen/ARM/vuzp.ll b/test/CodeGen/ARM/vuzp.ll
index 6c24348b8cf..04499e77fde 100644
--- a/test/CodeGen/ARM/vuzp.ll
+++ b/test/CodeGen/ARM/vuzp.ll
@@ -20,11 +20,11 @@ define <8 x i8> @vuzpi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
 define <16 x i8> @vuzpi8_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind {
 ; CHECK-LABEL: vuzpi8_Qres:
 ; CHECK: @ BB#0:
-; CHECK-NEXT: vldr d17, [r1]
-; CHECK-NEXT: vldr d16, [r0]
-; CHECK-NEXT: vuzp.8 d16, d17
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: vldr [[LDR1:d[0-9]+]], [r1]
+; CHECK-NEXT: vldr [[LDR0:d[0-9]+]], [r0]
+; CHECK-NEXT: vuzp.8 [[LDR0]], [[LDR1]]
+; CHECK-NEXT: vmov r0, r1, [[LDR0]]
+; CHECK-NEXT: vmov r2, r3, [[LDR1]]
 ; CHECK-NEXT: mov pc, lr
 %tmp1 = load <8 x i8>, <8 x i8>* %A
 %tmp2 = load <8 x i8>, <8 x i8>* %B
@@ -52,11 +52,11 @@ define <4 x i16> @vuzpi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
 define <8 x i16> @vuzpi16_Qres(<4 x i16>* %A, <4 x i16>* %B) nounwind {
 ; CHECK-LABEL: vuzpi16_Qres:
 ; CHECK: @ BB#0:
-; CHECK-NEXT: vldr d17, [r1]
-; CHECK-NEXT: vldr d16, [r0]
-; CHECK-NEXT: vuzp.16 d16, d17
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: vldr [[LDR1:d[0-9]+]], [r1]
+; CHECK-NEXT: vldr [[LDR0:d[0-9]+]], [r0]
+; CHECK-NEXT: vuzp.16 [[LDR0]], [[LDR1]]
+; CHECK-NEXT: vmov r0, r1, [[LDR0]]
+; CHECK-NEXT: vmov r2, r3, [[LDR1]]
 ; CHECK-NEXT: mov pc, lr
 %tmp1 = load <4 x i16>, <4 x i16>* %A
 %tmp2 = load <4 x i16>, <4 x i16>* %B
@@ -220,11 +220,11 @@ define <8 x i8> @vuzpi8_undef(<8 x i8>* %A, <8 x i8>* %B) nounwind {
 define <16 x i8> @vuzpi8_undef_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind {
 ; CHECK-LABEL: vuzpi8_undef_Qres:
 ; CHECK: @ BB#0:
-; CHECK-NEXT: vldr d17, [r1]
-; CHECK-NEXT: vldr d16, [r0]
-; CHECK-NEXT: vuzp.8 d16, d17
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: vldr [[LDR1:d[0-9]+]], [r1]
+; CHECK-NEXT: vldr [[LDR0:d[0-9]+]], [r0]
+; CHECK-NEXT: vuzp.8 [[LDR0]], [[LDR1]]
+; CHECK-NEXT: vmov r0, r1, [[LDR0]]
+; CHECK-NEXT: vmov r2, r3, [[LDR1]]
 ; CHECK-NEXT: mov pc, lr
 %tmp1 = load <8 x i8>, <8 x i8>* %A
 %tmp2 = load <8 x i8>, <8 x i8>* %B
diff --git a/test/CodeGen/ARM/vzip.ll b/test/CodeGen/ARM/vzip.ll
index 29385d0cc9d..259b484f5f8 100644
--- a/test/CodeGen/ARM/vzip.ll
+++ b/test/CodeGen/ARM/vzip.ll
@@ -20,11 +20,11 @@ define <8 x i8> @vzipi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
 define <16 x i8> @vzipi8_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind {
 ; CHECK-LABEL: vzipi8_Qres:
 ; CHECK: @ BB#0:
-; CHECK-NEXT: vldr d17, [r1]
-; CHECK-NEXT: vldr d16, [r0]
-; CHECK-NEXT: vzip.8 d16, d17
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: vldr [[LDR1:d[0-9]+]], [r1]
+; CHECK-NEXT: vldr [[LDR0:d[0-9]+]], [r0]
+; CHECK-NEXT: vzip.8 [[LDR0]], [[LDR1]]
+; CHECK-NEXT: vmov r0, r1, [[LDR0]]
+; CHECK-NEXT: vmov r2, r3, [[LDR1]]
 ; CHECK-NEXT: mov pc, lr
 %tmp1 = load <8 x i8>, <8 x i8>* %A
 %tmp2 = load <8 x i8>, <8 x i8>* %B
@@ -52,11 +52,11 @@ define <4 x i16> @vzipi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
 define <8 x i16> @vzipi16_Qres(<4 x i16>* %A, <4 x i16>* %B) nounwind {
 ; CHECK-LABEL: vzipi16_Qres:
 ; CHECK: @ BB#0:
-; CHECK-NEXT: vldr d17, [r1]
-; CHECK-NEXT: vldr d16, [r0]
-; CHECK-NEXT: vzip.16 d16, d17
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: vldr [[LDR1:d[0-9]+]], [r1]
+; CHECK-NEXT: vldr [[LDR0:d[0-9]+]], [r0]
+; CHECK-NEXT: vzip.16 [[LDR0]], [[LDR1]]
+; CHECK-NEXT: vmov r0, r1, [[LDR0]]
+; CHECK-NEXT: vmov r2, r3, [[LDR1]]
 ; CHECK-NEXT: mov pc, lr
 %tmp1 = load <4 x i16>, <4 x i16>* %A
 %tmp2 = load <4 x i16>, <4 x i16>* %B
@@ -220,11 +220,11 @@ define <8 x i8> @vzipi8_undef(<8 x i8>* %A, <8 x i8>* %B) nounwind {
 define <16 x i8> @vzipi8_undef_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind {
 ; CHECK-LABEL: vzipi8_undef_Qres:
 ; CHECK: @ BB#0:
-; CHECK-NEXT: vldr d17, [r1]
-; CHECK-NEXT: vldr d16, [r0]
-; CHECK-NEXT: vzip.8 d16, d17
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: vldr [[LDR1:d[0-9]+]], [r1]
+; CHECK-NEXT: vldr [[LDR0:d[0-9]+]], [r0]
+; CHECK-NEXT: vzip.8 [[LDR0]], [[LDR1]]
+; CHECK-NEXT: vmov r0, r1, [[LDR0]]
+; CHECK-NEXT: vmov r2, r3, [[LDR1]]
 ; CHECK-NEXT: mov pc, lr
 %tmp1 = load <8 x i8>, <8 x i8>* %A
 %tmp2 = load <8 x i8>, <8 x i8>* %B