[ARM] Fix ReconstructShuffle for bigendian
Simon pointed out that this function is doing a bitcast, which can be incorrect for big endian. That makes the lowering of VMOVN in MVE wrong, but the function is shared between Neon and MVE so both can be incorrect.

This attempts to fix things by using the newly added VECTOR_REG_CAST instead of the BITCAST. As it may now be used on Neon, I've added the relevant patterns for it there too. I've also added a quick dag combine for it to remove them where possible.

Differential Revision: https://reviews.llvm.org/D74485
commit 61eab1d530
parent aa89daf1c2
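For orientation, the VMOVN lowering mentioned above is reached for truncating interleave shuffles. Below is a scalar C++ sketch of roughly the operation such a shuffle-plus-truncate expresses (illustrative assumption only; the function name and loop are not taken from the test files touched by this commit):

#include <cstdint>

// Interleave the truncated lanes of two i32 vectors into one i16 vector.
// In vector IR this is roughly a shufflevector of the two sources followed
// by a trunc, which MVE can narrow with VMOVNB/VMOVNT.
void trunc_interleave(const int32_t a[4], const int32_t b[4], int16_t out[8]) {
  for (int i = 0; i < 4; ++i) {
    out[2 * i]     = static_cast<int16_t>(a[i]); // bottom (even) lanes
    out[2 * i + 1] = static_cast<int16_t>(b[i]); // top (odd) lanes
  }
}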
@@ -7526,7 +7526,7 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
     if (SrcEltTy == SmallestEltTy)
       continue;
     assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
-    Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
+    Src.ShuffleVec = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, ShuffleVT, Src.ShuffleVec);
     Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
     Src.WindowBase *= Src.WindowScale;
   }

@@ -7578,7 +7578,7 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
                                             ShuffleOps[1], Mask, DAG);
   if (!Shuffle)
     return SDValue();
-  return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
+  return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Shuffle);
 }

 enum ShuffleOpCodes {

@@ -12952,6 +12952,28 @@ PerformPREDICATE_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
   return SDValue();
 }

+static SDValue
+PerformVECTOR_REG_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+                              const ARMSubtarget *ST) {
+  EVT VT = N->getValueType(0);
+  SDValue Op = N->getOperand(0);
+  SDLoc dl(N);
+
+  // Under Little endian, a VECTOR_REG_CAST is equivalent to a BITCAST
+  if (ST->isLittle())
+    return DCI.DAG.getNode(ISD::BITCAST, dl, VT, Op);
+
+  // VECTOR_REG_CAST(VECTOR_REG_CAST(x)) == VECTOR_REG_CAST(x)
+  if (Op->getOpcode() == ARMISD::VECTOR_REG_CAST) {
+    // If the valuetypes are the same, we can remove the cast entirely.
+    if (Op->getOperand(0).getValueType() == VT)
+      return Op->getOperand(0);
+    return DCI.DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Op->getOperand(0));
+  }
+
+  return SDValue();
+}
+
 static SDValue PerformVCMPCombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const ARMSubtarget *Subtarget) {

@@ -14787,6 +14809,8 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
     return PerformARMBUILD_VECTORCombine(N, DCI);
   case ARMISD::PREDICATE_CAST:
     return PerformPREDICATE_CASTCombine(N, DCI);
+  case ARMISD::VECTOR_REG_CAST:
+    return PerformVECTOR_REG_CASTCombine(N, DCI, Subtarget);
   case ARMISD::VCMP:
     return PerformVCMPCombine(N, DCI, Subtarget);
   case ARMISD::SMULWB: {
@@ -294,6 +294,21 @@ def ARMWLS : SDNode<"ARMISD::WLS", SDT_ARMLoLoop, [SDNPHasChain]>;
 def ARMLE : SDNode<"ARMISD::LE", SDT_ARMLoLoop, [SDNPHasChain]>;
 def ARMLoopDec : SDNode<"ARMISD::LOOP_DEC", SDTIntBinOp, [SDNPHasChain]>;

+// 'VECTOR_REG_CAST' is an operation that reinterprets the contents of a
+// vector register as a different vector type, without changing the contents of
+// the register. It differs from 'bitconvert' in that bitconvert reinterprets
+// the _memory_ storage format of the vector, whereas VECTOR_REG_CAST
+// reinterprets the _register_ format - and in big-endian, the memory and
+// register formats are different, so they are different operations.
+//
+// For example, 'VECTOR_REG_CAST' between v8i16 and v16i8 will map the LSB of
+// the zeroth i16 lane to the zeroth i8 lane, regardless of system endianness,
+// whereas 'bitconvert' will map it to the high byte in big-endian mode,
+// because that's what (MVE) VSTRH.16 followed by VLDRB.8 would do. So the
+// bitconvert would have to emit a VREV16.8 instruction, whereas the
+// VECTOR_REG_CAST emits no code at all if the vector is already in a register.
+def ARMVectorRegCast : SDNode<"ARMISD::VECTOR_REG_CAST", SDTUnaryOp>;
+
 //===----------------------------------------------------------------------===//
 // ARM Flag Definitions.
 //===----------------------------------------------------------------------===//
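To make the lane-mapping difference described in the comment above concrete, here is a small, host-independent C++ model of a v8i16 -> v16i8 reinterpretation (a minimal sketch, not LLVM code; the byte values are made up):

#include <cstdint>
#include <cstdio>

int main() {
  // Eight i16 lanes; lane 0 is 0x1100, so its LSB is 0x00 and its MSB is 0x11.
  uint16_t lanes16[8] = {0x1100, 0x3322, 0x5544, 0x7766,
                         0x9988, 0xBBAA, 0xDDCC, 0xFFEE};

  // VECTOR_REG_CAST view as 16 x i8: the register bits are reused unchanged,
  // so i8 lane 2*i is the LSB of i16 lane i and i8 lane 2*i+1 its MSB,
  // regardless of memory endianness.
  uint8_t reg_view[16];
  for (int i = 0; i < 8; ++i) {
    reg_view[2 * i]     = uint8_t(lanes16[i] & 0xFF);
    reg_view[2 * i + 1] = uint8_t(lanes16[i] >> 8);
  }

  // bitconvert on big-endian is defined via memory: each i16 lane is stored
  // MSB-first (what VSTRH.16 does in big-endian mode) and reloaded as bytes
  // (VLDRB.8), so i8 lane 0 picks up the *high* byte of i16 lane 0.
  uint8_t be_mem_view[16];
  for (int i = 0; i < 8; ++i) {
    be_mem_view[2 * i]     = uint8_t(lanes16[i] >> 8);
    be_mem_view[2 * i + 1] = uint8_t(lanes16[i] & 0xFF);
  }

  printf("i8 lane 0 via VECTOR_REG_CAST:    0x%02x\n", reg_view[0]);    // 0x00
  printf("i8 lane 0 via big-endian bitcast: 0x%02x\n", be_mem_view[0]); // 0x11
  return 0;
}

The two views differ by a byte swap within each 16-bit element, which is why a big-endian bitconvert between these types needs a VREV16.8 while VECTOR_REG_CAST is free once the value is already in a register.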
@@ -3990,21 +3990,6 @@ let Predicates = [HasMVEInt] in {
 // vector types (v4i1<>v8i1, etc.) also as part of lowering vector shuffles.
 def predicate_cast : SDNode<"ARMISD::PREDICATE_CAST", SDTUnaryOp>;

-// 'vector_reg_cast' is an operation that reinterprets the contents of an MVE
-// vector register as a different vector type, without changing the contents of
-// the register. It differs from 'bitconvert' in that bitconvert reinterprets
-// the _memory_ storage format of the vector, whereas vector_reg_cast
-// reinterprets the _register_ format - and in big-endian, the memory and
-// register formats are different, so they are different operations.
-//
-// For example, 'vector_reg_cast' between v8i16 and v16i8 will map the LSB of
-// the zeroth i16 lane to the zeroth i8 lane, regardless of system endianness,
-// whereas 'bitconvert' will map it to the high byte in big-endian mode,
-// because that's what VSTRH.16 followed by VLDRB.8 would do. So the bitconvert
-// would have to emit a VREV16.8 instruction, whereas the vector_reg_cast emits
-// no code at all if the vector is already in a register.
-def vector_reg_cast : SDNode<"ARMISD::VECTOR_REG_CAST", SDTUnaryOp>;
-
 let Predicates = [HasMVEInt] in {
   foreach VT = [ v4i1, v8i1, v16i1 ] in {
     def : Pat<(i32 (predicate_cast (VT VCCR:$src))),

@@ -4019,7 +4004,7 @@ let Predicates = [HasMVEInt] in {

   foreach VT = [ v16i8, v8i16, v8f16, v4i32, v4f32, v2i64, v2f64 ] in
     foreach VT2 = [ v16i8, v8i16, v8f16, v4i32, v4f32, v2i64, v2f64 ] in
-      def : Pat<(VT (vector_reg_cast (VT2 MQPR:$src))), (VT MQPR:$src)>;
+      def : Pat<(VT (ARMVectorRegCast (VT2 MQPR:$src))), (VT MQPR:$src)>;
 }

 // end of MVE compares
@@ -7531,6 +7531,16 @@ let Predicates = [IsBE,HasNEON] in {
   def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (VREV16q8 QPR:$src)>;
 }

+let Predicates = [HasNEON] in {
+  foreach VT = [ v16i8, v8i16, v8f16, v4i32, v4f32, v2i64, v2f64 ] in
+    foreach VT2 = [ v16i8, v8i16, v8f16, v4i32, v4f32, v2i64, v2f64 ] in
+      def : Pat<(VT (ARMVectorRegCast (VT2 QPR:$src))), (VT QPR:$src)>;
+
+  foreach VT = [ v8i8, v4i16, v4f16, v2i32, v2f32, v1i64, f64 ] in
+    foreach VT2 = [ v8i8, v4i16, v4f16, v2i32, v2f32, v1i64, f64 ] in
+      def : Pat<(VT (ARMVectorRegCast (VT2 DPR:$src))), (VT DPR:$src)>;
+}
+
 // Use VLD1/VST1 + VREV for non-word-aligned v2f64 load/store on Big Endian
 let Predicates = [IsBE,HasNEON] in {
   def : Pat<(v2f64 (byte_alignedload addrmode6:$addr)),
@@ -732,8 +732,8 @@ define arm_aapcs_vfpcc <16 x i8> @test(<8 x i16> %src1, <8 x i16> %src2) {
 ;
 ; CHECKBE-LABEL: test:
 ; CHECKBE: @ %bb.0: @ %entry
-; CHECKBE-NEXT: vrev64.8 q8, q1
-; CHECKBE-NEXT: vrev64.8 q9, q0
+; CHECKBE-NEXT: vrev64.16 q8, q1
+; CHECKBE-NEXT: vrev64.16 q9, q0
 ; CHECKBE-NEXT: vtrn.8 q9, q8
 ; CHECKBE-NEXT: vrev64.8 q0, q9
 ; CHECKBE-NEXT: bx lr

@@ -10,8 +10,8 @@ define arm_aapcs_vfpcc <8 x i16> @vmovn32_trunc1(<4 x i32> %src1, <4 x i32> %src
 ;
 ; CHECKBE-LABEL: vmovn32_trunc1:
 ; CHECKBE: @ %bb.0: @ %entry
-; CHECKBE-NEXT: vrev64.16 q2, q1
-; CHECKBE-NEXT: vrev64.16 q1, q0
+; CHECKBE-NEXT: vrev64.32 q2, q1
+; CHECKBE-NEXT: vrev64.32 q1, q0
 ; CHECKBE-NEXT: vmovnt.i32 q1, q2
 ; CHECKBE-NEXT: vrev64.16 q0, q1
 ; CHECKBE-NEXT: bx lr

@@ -30,8 +30,8 @@ define arm_aapcs_vfpcc <8 x i16> @vmovn32_trunc2(<4 x i32> %src1, <4 x i32> %src
 ;
 ; CHECKBE-LABEL: vmovn32_trunc2:
 ; CHECKBE: @ %bb.0: @ %entry
-; CHECKBE-NEXT: vrev64.16 q2, q0
-; CHECKBE-NEXT: vrev64.16 q3, q1
+; CHECKBE-NEXT: vrev64.32 q2, q0
+; CHECKBE-NEXT: vrev64.32 q3, q1
 ; CHECKBE-NEXT: vmovnt.i32 q3, q2
 ; CHECKBE-NEXT: vrev64.16 q0, q3
 ; CHECKBE-NEXT: bx lr

@@ -49,8 +49,8 @@ define arm_aapcs_vfpcc <16 x i8> @vmovn16_trunc1(<8 x i16> %src1, <8 x i16> %src
 ;
 ; CHECKBE-LABEL: vmovn16_trunc1:
 ; CHECKBE: @ %bb.0: @ %entry
-; CHECKBE-NEXT: vrev64.8 q2, q1
-; CHECKBE-NEXT: vrev64.8 q1, q0
+; CHECKBE-NEXT: vrev64.16 q2, q1
+; CHECKBE-NEXT: vrev64.16 q1, q0
 ; CHECKBE-NEXT: vmovnt.i16 q1, q2
 ; CHECKBE-NEXT: vrev64.8 q0, q1
 ; CHECKBE-NEXT: bx lr

@@ -69,8 +69,8 @@ define arm_aapcs_vfpcc <16 x i8> @vmovn16_trunc2(<8 x i16> %src1, <8 x i16> %src
 ;
 ; CHECKBE-LABEL: vmovn16_trunc2:
 ; CHECKBE: @ %bb.0: @ %entry
-; CHECKBE-NEXT: vrev64.8 q2, q0
-; CHECKBE-NEXT: vrev64.8 q3, q1
+; CHECKBE-NEXT: vrev64.16 q2, q0
+; CHECKBE-NEXT: vrev64.16 q3, q1
 ; CHECKBE-NEXT: vmovnt.i16 q3, q2
 ; CHECKBE-NEXT: vrev64.8 q0, q3
 ; CHECKBE-NEXT: bx lr