From c55d376d9d8a591578e2377526cf0648a3b21709 Mon Sep 17 00:00:00 2001 From: David Green Date: Mon, 27 Jan 2020 13:59:29 +0000 Subject: [PATCH] [MVE] Fixup order of gather writeback intrinsic outputs The MVE_VLDRWU32_qi_pre gather loads, like the other _pre/_post MVE loads, return the writeback as result 0 and the value as result 1. The LLVM IR intrinsic seems to have this the other way around though, and so when lowering from one to the other we need to switch the first two outputs. I've also fixed up the types of _pre/_post on normal MVE loads. There we were already getting the values the right way around, just not for the types. I don't believe this was causing anything to go wrong, but it was very confusing to read in the debug output. Differential Revision: https://reviews.llvm.org/D73370 --- lib/Target/ARM/ARMISelDAGToDAG.cpp | 15 +++++-- .../Thumb2/mve-intrinsics/scatter-gather.ll | 40 +++++++++---------- test/CodeGen/Thumb2/mve-intrinsics/vldr.ll | 12 +++--- 3 files changed, 38 insertions(+), 29 deletions(-) diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp index 8fd55d2a0dd..b628d19b5a9 100644 --- a/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -1791,8 +1791,8 @@ bool ARMDAGToDAGISel::tryMVEIndexedLoad(SDNode *N) { SDValue Ops[] = {Base, NewOffset, CurDAG->getTargetConstant(Pred, SDLoc(N), MVT::i32), PredReg, Chain}; - SDNode *New = CurDAG->getMachineNode(Opcode, SDLoc(N), N->getValueType(0), - MVT::i32, MVT::Other, Ops); + SDNode *New = CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i32, + N->getValueType(0), MVT::Other, Ops); transferMemOperands(N, New); ReplaceUses(SDValue(N, 0), SDValue(New, 1)); ReplaceUses(SDValue(N, 1), SDValue(New, 0)); @@ -2514,7 +2514,16 @@ void ARMDAGToDAGISel::SelectMVE_WB(SDNode *N, const uint16_t *Opcodes, Ops.push_back(N->getOperand(0)); // chain - CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), makeArrayRef(Ops)); + SmallVector<EVT, 8> VTs; + 
VTs.push_back(N->getValueType(1)); + VTs.push_back(N->getValueType(0)); + VTs.push_back(N->getValueType(2)); + + SDNode *New = CurDAG->getMachineNode(Opcode, SDLoc(N), VTs, Ops); + ReplaceUses(SDValue(N, 0), SDValue(New, 1)); + ReplaceUses(SDValue(N, 1), SDValue(New, 0)); + ReplaceUses(SDValue(N, 2), SDValue(New, 2)); + CurDAG->RemoveDeadNode(N); } void ARMDAGToDAGISel::SelectMVE_LongShift(SDNode *N, uint16_t Opcode, diff --git a/test/CodeGen/Thumb2/mve-intrinsics/scatter-gather.ll b/test/CodeGen/Thumb2/mve-intrinsics/scatter-gather.ll index 7eac79094f5..a25dd721d54 100644 --- a/test/CodeGen/Thumb2/mve-intrinsics/scatter-gather.ll +++ b/test/CodeGen/Thumb2/mve-intrinsics/scatter-gather.ll @@ -202,8 +202,8 @@ entry: define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_base_wb_s64(<2 x i64>* %addr) { ; CHECK-LABEL: test_vldrdq_gather_base_wb_s64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrd.u64 q1, [q0, #576]! +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vldrd.u64 q0, [q1, #576]! ; CHECK-NEXT: vstrw.32 q1, [r0] ; CHECK-NEXT: bx lr entry: @@ -220,8 +220,8 @@ declare { <2 x i64>, <2 x i64> } @llvm.arm.mve.vldr.gather.base.wb.v2i64.v2i64(< define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_base_wb_u64(<2 x i64>* %addr) { ; CHECK-LABEL: test_vldrdq_gather_base_wb_u64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrd.u64 q1, [q0, #-328]! +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vldrd.u64 q0, [q1, #-328]! ; CHECK-NEXT: vstrw.32 q1, [r0] ; CHECK-NEXT: bx lr entry: @@ -237,9 +237,9 @@ define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_base_wb_z_s64(<2 x i64>* %a ; CHECK-LABEL: test_vldrdq_gather_base_wb_z_s64: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmsr p0, r1 -; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vpst -; CHECK-NEXT: vldrdt.u64 q1, [q0, #664]! +; CHECK-NEXT: vldrdt.u64 q0, [q1, #664]! 
; CHECK-NEXT: vstrw.32 q1, [r0] ; CHECK-NEXT: bx lr entry: @@ -259,9 +259,9 @@ define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_base_wb_z_u64(<2 x i64>* %a ; CHECK-LABEL: test_vldrdq_gather_base_wb_z_u64: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmsr p0, r1 -; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vpst -; CHECK-NEXT: vldrdt.u64 q1, [q0, #656]! +; CHECK-NEXT: vldrdt.u64 q0, [q1, #656]! ; CHECK-NEXT: vstrw.32 q1, [r0] ; CHECK-NEXT: bx lr entry: @@ -727,8 +727,8 @@ entry: define arm_aapcs_vfpcc <4 x float> @test_vldrwq_gather_base_wb_f32(<4 x i32>* %addr) { ; CHECK-LABEL: test_vldrwq_gather_base_wb_f32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q1, [q0, #-64]! +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vldrw.u32 q0, [q1, #-64]! ; CHECK-NEXT: vstrw.32 q1, [r0] ; CHECK-NEXT: bx lr entry: @@ -745,8 +745,8 @@ declare { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.v4f32.v4i32 define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_base_wb_s32(<4 x i32>* %addr) { ; CHECK-LABEL: test_vldrwq_gather_base_wb_s32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q1, [q0, #80]! +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vldrw.u32 q0, [q1, #80]! ; CHECK-NEXT: vstrw.32 q1, [r0] ; CHECK-NEXT: bx lr entry: @@ -763,8 +763,8 @@ declare { <4 x i32>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.v4i32.v4i32(< define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_base_wb_u32(<4 x i32>* %addr) { ; CHECK-LABEL: test_vldrwq_gather_base_wb_u32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q1, [q0, #480]! +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vldrw.u32 q0, [q1, #480]! 
; CHECK-NEXT: vstrw.32 q1, [r0] ; CHECK-NEXT: bx lr entry: @@ -780,9 +780,9 @@ define arm_aapcs_vfpcc <4 x float> @test_vldrwq_gather_base_wb_z_f32(<4 x i32>* ; CHECK-LABEL: test_vldrwq_gather_base_wb_z_f32: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmsr p0, r1 -; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vpst -; CHECK-NEXT: vldrwt.u32 q1, [q0, #-352]! +; CHECK-NEXT: vldrwt.u32 q0, [q1, #-352]! ; CHECK-NEXT: vstrw.32 q1, [r0] ; CHECK-NEXT: bx lr entry: @@ -802,9 +802,9 @@ define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_base_wb_z_s32(<4 x i32>* %a ; CHECK-LABEL: test_vldrwq_gather_base_wb_z_s32: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmsr p0, r1 -; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vpst -; CHECK-NEXT: vldrwt.u32 q1, [q0, #276]! +; CHECK-NEXT: vldrwt.u32 q0, [q1, #276]! ; CHECK-NEXT: vstrw.32 q1, [r0] ; CHECK-NEXT: bx lr entry: @@ -824,9 +824,9 @@ define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_base_wb_z_u32(<4 x i32>* %a ; CHECK-LABEL: test_vldrwq_gather_base_wb_z_u32: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmsr p0, r1 -; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vpst -; CHECK-NEXT: vldrwt.u32 q1, [q0, #88]! +; CHECK-NEXT: vldrwt.u32 q0, [q1, #88]! ; CHECK-NEXT: vstrw.32 q1, [r0] ; CHECK-NEXT: bx lr entry: diff --git a/test/CodeGen/Thumb2/mve-intrinsics/vldr.ll b/test/CodeGen/Thumb2/mve-intrinsics/vldr.ll index aa893abc057..a5490e1fc39 100644 --- a/test/CodeGen/Thumb2/mve-intrinsics/vldr.ll +++ b/test/CodeGen/Thumb2/mve-intrinsics/vldr.ll @@ -4,8 +4,8 @@ define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_base_wb_s32(<4 x i32>* %addr) { ; CHECK-LABEL: test_vldrwq_gather_base_wb_s32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q1, [q0, #80]! +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vldrw.u32 q0, [q1, #80]! 
; CHECK-NEXT: vstrw.32 q1, [r0] ; CHECK-NEXT: bx lr entry: @@ -22,8 +22,8 @@ declare { <4 x i32>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.v4i32.v4i32(< define arm_aapcs_vfpcc <4 x float> @test_vldrwq_gather_base_wb_f32(<4 x i32>* %addr) { ; CHECK-LABEL: test_vldrwq_gather_base_wb_f32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q1, [q0, #64]! +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vldrw.u32 q0, [q1, #64]! ; CHECK-NEXT: vstrw.32 q1, [r0] ; CHECK-NEXT: bx lr entry: @@ -41,9 +41,9 @@ define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_base_wb_z_u64(<2 x i64>* %a ; CHECK-LABEL: test_vldrdq_gather_base_wb_z_u64: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmsr p0, r1 -; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vpst -; CHECK-NEXT: vldrdt.u64 q1, [q0, #656]! +; CHECK-NEXT: vldrdt.u64 q0, [q1, #656]! ; CHECK-NEXT: vstrw.32 q1, [r0] ; CHECK-NEXT: bx lr entry: