
[ARM] Fix ReconstructShuffle for bigendian

Simon pointed out that ReconstructShuffle is doing a bitcast, which can be
incorrect for big endian. That makes the lowering of VMOVN in MVE wrong, but
as the function is shared between Neon and MVE, both can be incorrect.

This attempts to fix things by using the newly added VECTOR_REG_CAST
instead of the BITCAST. As the node may now be used on Neon, I've added the
relevant patterns for it there too. I've also added a quick DAG combine for
it to remove the casts where possible.

Differential Revision: https://reviews.llvm.org/D74485
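
The gist of the change, as a minimal hedged sketch (the helper name
castToShuffleVT is illustrative, not code from the patch):

static SDValue castToShuffleVT(SelectionDAG &DAG, const SDLoc &dl,
                               EVT ShuffleVT, SDValue Vec) {
  // ISD::BITCAST models a store in one type and a reload in another, which
  // reorders bytes on big-endian targets. ARMISD::VECTOR_REG_CAST instead
  // reinterprets the lanes of the register in place, so the lane numbering
  // is the same for either endianness.
  return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, ShuffleVT, Vec);
}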
David Green 2020-02-12 20:51:39 +00:00
parent aa89daf1c2
commit 61eab1d530
6 changed files with 62 additions and 28 deletions


@@ -7526,7 +7526,7 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
     if (SrcEltTy == SmallestEltTy)
       continue;
     assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
-    Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
+    Src.ShuffleVec = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, ShuffleVT, Src.ShuffleVec);
     Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
     Src.WindowBase *= Src.WindowScale;
   }
@@ -7578,7 +7578,7 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
                                             ShuffleOps[1], Mask, DAG);
   if (!Shuffle)
     return SDValue();
-  return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
+  return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Shuffle);
 }

 enum ShuffleOpCodes {
@@ -12952,6 +12952,28 @@ PerformPREDICATE_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
   return SDValue();
 }

+static SDValue
+PerformVECTOR_REG_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+                              const ARMSubtarget *ST) {
+  EVT VT = N->getValueType(0);
+  SDValue Op = N->getOperand(0);
+  SDLoc dl(N);
+
+  // Under little endian, a VECTOR_REG_CAST is equivalent to a BITCAST.
+  if (ST->isLittle())
+    return DCI.DAG.getNode(ISD::BITCAST, dl, VT, Op);
+
+  // VECTOR_REG_CAST(VECTOR_REG_CAST(x)) == VECTOR_REG_CAST(x)
+  if (Op->getOpcode() == ARMISD::VECTOR_REG_CAST) {
+    // If the value types are the same, we can remove the cast entirely.
+    if (Op->getOperand(0).getValueType() == VT)
+      return Op->getOperand(0);
+    return DCI.DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Op->getOperand(0));
+  }
+
+  return SDValue();
+}
+
 static SDValue PerformVCMPCombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const ARMSubtarget *Subtarget) {
@@ -14787,6 +14809,8 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
     return PerformARMBUILD_VECTORCombine(N, DCI);
   case ARMISD::PREDICATE_CAST:
     return PerformPREDICATE_CASTCombine(N, DCI);
+  case ARMISD::VECTOR_REG_CAST:
+    return PerformVECTOR_REG_CASTCombine(N, DCI, Subtarget);
   case ARMISD::VCMP:
     return PerformVCMPCombine(N, DCI, Subtarget);
   case ARMISD::SMULWB: {
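
As a hedged illustration of what the new combine does (buildCastChain is a
hypothetical snippet, not code from the patch):

// A chain of two register casts, as might be left over after lowering.
SDValue buildCastChain(SelectionDAG &DAG, const SDLoc &dl, SDValue X) {
  // Assume X has type v4i32.
  SDValue A = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v8i16, X);
  SDValue B = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, A);
  // When PerformVECTOR_REG_CASTCombine visits B:
  //  - on big endian, the nested cast is folded and, since the outer type
  //    matches X's type, B is replaced by X itself;
  //  - on little endian, the cast is rewritten to an ISD::BITCAST, which
  //    the generic combiner can fold further.
  return B;
}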


@@ -294,6 +294,21 @@ def ARMWLS : SDNode<"ARMISD::WLS", SDT_ARMLoLoop, [SDNPHasChain]>;
 def ARMLE : SDNode<"ARMISD::LE", SDT_ARMLoLoop, [SDNPHasChain]>;
 def ARMLoopDec : SDNode<"ARMISD::LOOP_DEC", SDTIntBinOp, [SDNPHasChain]>;

+// 'VECTOR_REG_CAST' is an operation that reinterprets the contents of a
+// vector register as a different vector type, without changing the contents of
+// the register. It differs from 'bitconvert' in that bitconvert reinterprets
+// the _memory_ storage format of the vector, whereas VECTOR_REG_CAST
+// reinterprets the _register_ format - and in big-endian, the memory and
+// register formats are different, so they are different operations.
+//
+// For example, 'VECTOR_REG_CAST' between v8i16 and v16i8 will map the LSB of
+// the zeroth i16 lane to the zeroth i8 lane, regardless of system endianness,
+// whereas 'bitconvert' will map it to the high byte in big-endian mode,
+// because that's what (MVE) VSTRH.16 followed by VLDRB.8 would do. So the
+// bitconvert would have to emit a VREV16.8 instruction, whereas the
+// VECTOR_REG_CAST emits no code at all if the vector is already in a register.
+def ARMVectorRegCast : SDNode<"ARMISD::VECTOR_REG_CAST", SDTUnaryOp>;
+
 //===----------------------------------------------------------------------===//
 // ARM Flag Definitions.
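
The worked example in the comment above can be checked with a small
host-side simulation of the big-endian case (purely illustrative, not part
of the patch):

#include <cstdint>
#include <cstdio>

int main() {
  // i16 lane 0 of a v8i16 vector, holding 0x0102.
  uint16_t Lane0 = 0x0102;

  // A big-endian VSTRH.16 stores each i16 lane MSB first, so a following
  // VLDRB.8 sees the high byte in i8 lane 0: the 'bitconvert' mapping.
  uint8_t MemBytes[2] = {uint8_t(Lane0 >> 8), uint8_t(Lane0 & 0xff)};
  std::printf("bitconvert:      i8 lane 0 = 0x%02x\n", MemBytes[0]); // 0x01

  // VECTOR_REG_CAST reinterprets the register directly: i8 lane 0 is the
  // LSB of i16 lane 0 regardless of endianness.
  std::printf("VECTOR_REG_CAST: i8 lane 0 = 0x%02x\n",
              uint8_t(Lane0 & 0xff)); // 0x02
  return 0;
}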


@@ -3990,21 +3990,6 @@ let Predicates = [HasMVEInt] in {
 // vector types (v4i1<>v8i1, etc.) also as part of lowering vector shuffles.
 def predicate_cast : SDNode<"ARMISD::PREDICATE_CAST", SDTUnaryOp>;

-// 'vector_reg_cast' is an operation that reinterprets the contents of an MVE
-// vector register as a different vector type, without changing the contents of
-// the register. It differs from 'bitconvert' in that bitconvert reinterprets
-// the _memory_ storage format of the vector, whereas vector_reg_cast
-// reinterprets the _register_ format - and in big-endian, the memory and
-// register formats are different, so they are different operations.
-//
-// For example, 'vector_reg_cast' between v8i16 and v16i8 will map the LSB of
-// the zeroth i16 lane to the zeroth i8 lane, regardless of system endianness,
-// whereas 'bitconvert' will map it to the high byte in big-endian mode,
-// because that's what VSTRH.16 followed by VLDRB.8 would do. So the bitconvert
-// would have to emit a VREV16.8 instruction, whereas the vector_reg_cast emits
-// no code at all if the vector is already in a register.
-def vector_reg_cast : SDNode<"ARMISD::VECTOR_REG_CAST", SDTUnaryOp>;
-
 let Predicates = [HasMVEInt] in {
   foreach VT = [ v4i1, v8i1, v16i1 ] in {
     def : Pat<(i32 (predicate_cast (VT VCCR:$src))),
@@ -4019,7 +4004,7 @@ let Predicates = [HasMVEInt] in {
   foreach VT = [ v16i8, v8i16, v8f16, v4i32, v4f32, v2i64, v2f64 ] in
     foreach VT2 = [ v16i8, v8i16, v8f16, v4i32, v4f32, v2i64, v2f64 ] in
-      def : Pat<(VT (vector_reg_cast (VT2 MQPR:$src))), (VT MQPR:$src)>;
+      def : Pat<(VT (ARMVectorRegCast (VT2 MQPR:$src))), (VT MQPR:$src)>;
 }

 // end of MVE compares


@@ -7531,6 +7531,16 @@ let Predicates = [IsBE,HasNEON] in {
   def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (VREV16q8 QPR:$src)>;
 }

+let Predicates = [HasNEON] in {
+  foreach VT = [ v16i8, v8i16, v8f16, v4i32, v4f32, v2i64, v2f64 ] in
+    foreach VT2 = [ v16i8, v8i16, v8f16, v4i32, v4f32, v2i64, v2f64 ] in
+      def : Pat<(VT (ARMVectorRegCast (VT2 QPR:$src))), (VT QPR:$src)>;
+
+  foreach VT = [ v8i8, v4i16, v4f16, v2i32, v2f32, v1i64, f64 ] in
+    foreach VT2 = [ v8i8, v4i16, v4f16, v2i32, v2f32, v1i64, f64 ] in
+      def : Pat<(VT (ARMVectorRegCast (VT2 DPR:$src))), (VT DPR:$src)>;
+}
+
 // Use VLD1/VST1 + VREV for non-word-aligned v2f64 load/store on Big Endian
 let Predicates = [IsBE,HasNEON] in {
   def : Pat<(v2f64 (byte_alignedload addrmode6:$addr)),


@@ -732,8 +732,8 @@ define arm_aapcs_vfpcc <16 x i8> @test(<8 x i16> %src1, <8 x i16> %src2) {
 ;
 ; CHECKBE-LABEL: test:
 ; CHECKBE:       @ %bb.0: @ %entry
-; CHECKBE-NEXT:    vrev64.8 q8, q1
-; CHECKBE-NEXT:    vrev64.8 q9, q0
+; CHECKBE-NEXT:    vrev64.16 q8, q1
+; CHECKBE-NEXT:    vrev64.16 q9, q0
 ; CHECKBE-NEXT:    vtrn.8 q9, q8
 ; CHECKBE-NEXT:    vrev64.8 q0, q9
 ; CHECKBE-NEXT:    bx lr


@@ -10,8 +10,8 @@ define arm_aapcs_vfpcc <8 x i16> @vmovn32_trunc1(<4 x i32> %src1, <4 x i32> %src
 ;
 ; CHECKBE-LABEL: vmovn32_trunc1:
 ; CHECKBE:       @ %bb.0: @ %entry
-; CHECKBE-NEXT:    vrev64.16 q2, q1
-; CHECKBE-NEXT:    vrev64.16 q1, q0
+; CHECKBE-NEXT:    vrev64.32 q2, q1
+; CHECKBE-NEXT:    vrev64.32 q1, q0
 ; CHECKBE-NEXT:    vmovnt.i32 q1, q2
 ; CHECKBE-NEXT:    vrev64.16 q0, q1
 ; CHECKBE-NEXT:    bx lr
@@ -30,8 +30,8 @@ define arm_aapcs_vfpcc <8 x i16> @vmovn32_trunc2(<4 x i32> %src1, <4 x i32> %src
 ;
 ; CHECKBE-LABEL: vmovn32_trunc2:
 ; CHECKBE:       @ %bb.0: @ %entry
-; CHECKBE-NEXT:    vrev64.16 q2, q0
-; CHECKBE-NEXT:    vrev64.16 q3, q1
+; CHECKBE-NEXT:    vrev64.32 q2, q0
+; CHECKBE-NEXT:    vrev64.32 q3, q1
 ; CHECKBE-NEXT:    vmovnt.i32 q3, q2
 ; CHECKBE-NEXT:    vrev64.16 q0, q3
 ; CHECKBE-NEXT:    bx lr
@@ -49,8 +49,8 @@ define arm_aapcs_vfpcc <16 x i8> @vmovn16_trunc1(<8 x i16> %src1, <8 x i16> %src
 ;
 ; CHECKBE-LABEL: vmovn16_trunc1:
 ; CHECKBE:       @ %bb.0: @ %entry
-; CHECKBE-NEXT:    vrev64.8 q2, q1
-; CHECKBE-NEXT:    vrev64.8 q1, q0
+; CHECKBE-NEXT:    vrev64.16 q2, q1
+; CHECKBE-NEXT:    vrev64.16 q1, q0
 ; CHECKBE-NEXT:    vmovnt.i16 q1, q2
 ; CHECKBE-NEXT:    vrev64.8 q0, q1
 ; CHECKBE-NEXT:    bx lr
@@ -69,8 +69,8 @@ define arm_aapcs_vfpcc <16 x i8> @vmovn16_trunc2(<8 x i16> %src1, <8 x i16> %src
 ;
 ; CHECKBE-LABEL: vmovn16_trunc2:
 ; CHECKBE:       @ %bb.0: @ %entry
-; CHECKBE-NEXT:    vrev64.8 q2, q0
-; CHECKBE-NEXT:    vrev64.8 q3, q1
+; CHECKBE-NEXT:    vrev64.16 q2, q0
+; CHECKBE-NEXT:    vrev64.16 q3, q1
 ; CHECKBE-NEXT:    vmovnt.i16 q3, q2
 ; CHECKBE-NEXT:    vrev64.8 q0, q3
 ; CHECKBE-NEXT:    bx lr