1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-22 02:33:06 +01:00

[AArch64][SVE] Improve code generation for vector_splice for Imm == -1

This patch implements vector_splice in tablegen for:
  a) when the immediate is equal to -1 (Imm==1) and uses:
       INSR  +  LASTB
For instance :
@llvm.experimental.vector.splice(Vector_1, Vector_2, -1)
@llvm.experimental.vector.splice(<A,B,C,D>, <E,F,G,H>, 1) ==> <D, E, F, G>
    LAST   RegLast, Vector_1                 // RegLast = D
    INSR   Res, (Vector_1 >> 1), RegLast     // Res = D + E, F, G

Differential Revision: https://reviews.llvm.org/D105633
This commit is contained in:
Caroline Concatto 2021-07-06 09:19:16 +01:00
parent 96fce2fe63
commit d9b910d9d2
4 changed files with 189 additions and 127 deletions

View File

@ -895,6 +895,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::ANY_EXTEND);
setTargetDAGCombine(ISD::ZERO_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND);
setTargetDAGCombine(ISD::VECTOR_SPLICE);
setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
setTargetDAGCombine(ISD::TRUNCATE);
setTargetDAGCombine(ISD::CONCAT_VECTORS);
@ -1159,6 +1160,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::MULHS, VT, Custom);
setOperationAction(ISD::MULHU, VT, Custom);
setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::SDIV, VT, Custom);
@ -1282,6 +1284,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
setOperationAction(ISD::SELECT_CC, VT, Expand);
}
@ -1559,6 +1562,7 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
setOperationAction(ISD::SMAX, VT, Custom);
setOperationAction(ISD::SMIN, VT, Custom);
setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
setOperationAction(ISD::SRA, VT, Custom);
setOperationAction(ISD::SRL, VT, Custom);
setOperationAction(ISD::STORE, VT, Custom);
@ -4911,6 +4915,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
/*OverrideNEON=*/true);
case ISD::CTTZ:
return LowerCTTZ(Op, DAG);
case ISD::VECTOR_SPLICE:
return LowerVECTOR_SPLICE(Op, DAG);
}
}
@ -7417,6 +7423,14 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
return CS1;
}
SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op,
SelectionDAG &DAG) const {
if (Op.getConstantOperandAPInt(2) == -1)
return Op;
return SDValue();
}
SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
SelectionDAG &DAG) const {
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
@ -16487,6 +16501,28 @@ performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
return performPostLD1Combine(N, DCI, true);
}
SDValue performSVESpliceCombine(SDNode *N, SelectionDAG &DAG) {
EVT Ty = N->getValueType(0);
if (Ty.isInteger())
return SDValue();
EVT IntTy = Ty.changeVectorElementTypeToInteger();
EVT ExtIntTy = getPackedSVEVectorVT(IntTy.getVectorElementCount());
if (ExtIntTy.getVectorElementType().getScalarSizeInBits() <
IntTy.getVectorElementType().getScalarSizeInBits())
return SDValue();
SDLoc DL(N);
SDValue LHS = DAG.getAnyExtOrTrunc(DAG.getBitcast(IntTy, N->getOperand(0)),
DL, ExtIntTy);
SDValue RHS = DAG.getAnyExtOrTrunc(DAG.getBitcast(IntTy, N->getOperand(1)),
DL, ExtIntTy);
SDValue Idx = N->getOperand(2);
SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, ExtIntTy, LHS, RHS, Idx);
SDValue Trunc = DAG.getAnyExtOrTrunc(Splice, DL, IntTy);
return DAG.getBitcast(Ty, Trunc);
}
SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@ -16539,6 +16575,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
break;
case ISD::STORE:
return performSTORECombine(N, DCI, DAG, Subtarget);
case ISD::VECTOR_SPLICE:
return performSVESpliceCombine(N, DAG);
case AArch64ISD::BRCOND:
return performBRCONDCombine(N, DCI, DAG);
case AArch64ISD::TBNZ:

View File

@ -948,6 +948,7 @@ private:
SDValue LowerToPredicatedOp(SDValue Op, SelectionDAG &DAG, unsigned NewOp,
bool OverrideNEON = false) const;
SDValue LowerToScalableOp(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVECTOR_SPLICE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDIV(SDValue Op, SelectionDAG &DAG) const;

View File

@ -1226,6 +1226,20 @@ let Predicates = [HasSVE] in {
def : Pat<(nxv8bf16 (concat_vectors nxv4bf16:$v1, nxv4bf16:$v2)),
(UZP1_ZZZ_H $v1, $v2)>;
// Splice with lane equal to -1
def : Pat<(nxv16i8 (vector_splice (nxv16i8 ZPR:$Z1), (nxv16i8 ZPR:$Z2), (i64 -1))),
(INSR_ZV_B ZPR:$Z2, (INSERT_SUBREG (IMPLICIT_DEF),
(LASTB_VPZ_B (PTRUE_B 31), ZPR:$Z1), bsub))>;
def : Pat<(nxv8i16 (vector_splice (nxv8i16 ZPR:$Z1), (nxv8i16 ZPR:$Z2), (i64 -1))),
(INSR_ZV_H ZPR:$Z2, (INSERT_SUBREG (IMPLICIT_DEF),
(LASTB_VPZ_H (PTRUE_H 31), ZPR:$Z1), hsub))>;
def : Pat<(nxv4i32 (vector_splice (nxv4i32 ZPR:$Z1), (nxv4i32 ZPR:$Z2), (i64 -1))),
(INSR_ZV_S ZPR:$Z2, (INSERT_SUBREG (IMPLICIT_DEF),
(LASTB_VPZ_S (PTRUE_S 31), ZPR:$Z1), ssub))>;
def : Pat<(nxv2i64 (vector_splice (nxv2i64 ZPR:$Z1), (nxv2i64 ZPR:$Z2), (i64 -1))),
(INSR_ZV_D ZPR:$Z2, (INSERT_SUBREG (IMPLICIT_DEF),
(LASTB_VPZ_D (PTRUE_D 31), ZPR:$Z1), dsub))>;
defm CMPHS_PPzZZ : sve_int_cmp_0<0b000, "cmphs", SETUGE, SETULE>;
defm CMPHI_PPzZZ : sve_int_cmp_0<0b001, "cmphi", SETUGT, SETULT>;
defm CMPGE_PPzZZ : sve_int_cmp_0<0b100, "cmpge", SETGE, SETLE>;

View File

@ -239,6 +239,70 @@ define <vscale x 2 x i64> @splice_nxv2i64_clamped_idx(<vscale x 2 x i64> %a, <vs
ret <vscale x 2 x i64> %res
}
define <vscale x 2 x half> @splice_nxv2f16_neg_idx(<vscale x 2 x half> %a, <vscale x 2 x half> %b) #0 {
; CHECK-LABEL: splice_nxv2f16_neg_idx:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: lastb d0, p0, z0.d
; CHECK-NEXT: insr z1.d, d0
; CHECK-NEXT: mov z0.d, z1.d
; CHECK-NEXT: ret
%res = call <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, i32 -1)
ret <vscale x 2 x half> %res
}
define <vscale x 2 x half> @splice_nxv2f16_neg2_idx(<vscale x 2 x half> %a, <vscale x 2 x half> %b) #0 {
; CHECK-LABEL: splice_nxv2f16_neg2_idx:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: ptrue p1.b
; CHECK-NEXT: st1h { z0.h }, p0, [sp]
; CHECK-NEXT: st1h { z1.h }, p0, [x8, #1, mul vl]
; CHECK-NEXT: addvl x8, x8, #1
; CHECK-NEXT: mov x9, #-16
; CHECK-NEXT: ld1b { z0.b }, p1/z, [x8, x9]
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%res = call <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, i32 -2)
ret <vscale x 2 x half> %res
}
define <vscale x 4 x half> @splice_nxv4f16_neg_idx(<vscale x 4 x half> %a, <vscale x 4 x half> %b) #0 {
; CHECK-LABEL: splice_nxv4f16_neg_idx:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: lastb s0, p0, z0.s
; CHECK-NEXT: insr z1.s, s0
; CHECK-NEXT: mov z0.d, z1.d
; CHECK-NEXT: ret
%res = call <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, i32 -1)
ret <vscale x 4 x half> %res
}
define <vscale x 4 x half> @splice_nxv4f16_neg3_idx(<vscale x 4 x half> %a, <vscale x 4 x half> %b) #0 {
; CHECK-LABEL: splice_nxv4f16_neg3_idx:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: ptrue p1.b
; CHECK-NEXT: st1h { z0.h }, p0, [sp]
; CHECK-NEXT: st1h { z1.h }, p0, [x8, #1, mul vl]
; CHECK-NEXT: addvl x8, x8, #1
; CHECK-NEXT: mov x9, #-12
; CHECK-NEXT: ld1b { z0.b }, p1/z, [x8, x9]
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%res = call <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, i32 -3)
ret <vscale x 4 x half> %res
}
define <vscale x 8 x half> @splice_nxv8f16_first_idx(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
; CHECK-LABEL: splice_nxv8f16_first_idx:
; CHECK: // %bb.0:
@ -297,6 +361,38 @@ define <vscale x 8 x half> @splice_nxv8f16_clamped_idx(<vscale x 8 x half> %a, <
ret <vscale x 8 x half> %res
}
define <vscale x 2 x float> @splice_nxv2f32_neg_idx(<vscale x 2 x float> %a, <vscale x 2 x float> %b) #0 {
; CHECK-LABEL: splice_nxv2f32_neg_idx:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: lastb d0, p0, z0.d
; CHECK-NEXT: insr z1.d, d0
; CHECK-NEXT: mov z0.d, z1.d
; CHECK-NEXT: ret
%res = call <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, i32 -1)
ret <vscale x 2 x float> %res
}
define <vscale x 2 x float> @splice_nxv2f32_neg2_idx(<vscale x 2 x float> %a, <vscale x 2 x float> %b) #0 {
; CHECK-LABEL: splice_nxv2f32_neg2_idx:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: ptrue p1.b
; CHECK-NEXT: st1w { z0.s }, p0, [sp]
; CHECK-NEXT: st1w { z1.s }, p0, [x8, #1, mul vl]
; CHECK-NEXT: addvl x8, x8, #1
; CHECK-NEXT: mov x9, #-16
; CHECK-NEXT: ld1b { z0.b }, p1/z, [x8, x9]
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%res = call <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, i32 -2)
ret <vscale x 2 x float> %res
}
define <vscale x 4 x float> @splice_nxv4f32_first_idx(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
; CHECK-LABEL: splice_nxv4f32_first_idx:
; CHECK: // %bb.0:
@ -605,17 +701,10 @@ define <vscale x 16 x i8> @splice_nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x
define <vscale x 16 x i8> @splice_nxv16i8_1(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #0 {
; CHECK-LABEL: splice_nxv16i8_1:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: st1b { z0.b }, p0, [sp]
; CHECK-NEXT: st1b { z1.b }, p0, [x8, #1, mul vl]
; CHECK-NEXT: addvl x8, x8, #1
; CHECK-NEXT: mov x9, #-1
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x8, x9]
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: lastb b0, p0, z0.b
; CHECK-NEXT: insr z1.b, b0
; CHECK-NEXT: mov z0.d, z1.d
; CHECK-NEXT: ret
%res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -1)
ret <vscale x 16 x i8> %res
@ -668,18 +757,10 @@ define <vscale x 8 x i16> @splice_nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i
define <vscale x 8 x i16> @splice_nxv8i16_1(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) #0 {
; CHECK-LABEL: splice_nxv8i16_1:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: ptrue p1.b
; CHECK-NEXT: st1h { z0.h }, p0, [sp]
; CHECK-NEXT: st1h { z1.h }, p0, [x8, #1, mul vl]
; CHECK-NEXT: addvl x8, x8, #1
; CHECK-NEXT: mov x9, #-2
; CHECK-NEXT: ld1b { z0.b }, p1/z, [x8, x9]
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: lastb h0, p0, z0.h
; CHECK-NEXT: insr z1.h, h0
; CHECK-NEXT: mov z0.d, z1.d
; CHECK-NEXT: ret
%res = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 -1)
ret <vscale x 8 x i16> %res
@ -732,18 +813,10 @@ define <vscale x 4 x i32> @splice_nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i
define <vscale x 4 x i32> @splice_nxv4i32_1(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
; CHECK-LABEL: splice_nxv4i32_1:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: ptrue p1.b
; CHECK-NEXT: st1w { z0.s }, p0, [sp]
; CHECK-NEXT: st1w { z1.s }, p0, [x8, #1, mul vl]
; CHECK-NEXT: addvl x8, x8, #1
; CHECK-NEXT: mov x9, #-4
; CHECK-NEXT: ld1b { z0.b }, p1/z, [x8, x9]
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: lastb s0, p0, z0.s
; CHECK-NEXT: insr z1.s, s0
; CHECK-NEXT: mov z0.d, z1.d
; CHECK-NEXT: ret
%res = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 -1)
ret <vscale x 4 x i32> %res
@ -796,18 +869,10 @@ define <vscale x 2 x i64> @splice_nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i
define <vscale x 2 x i64> @splice_nxv2i64_1(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) #0 {
; CHECK-LABEL: splice_nxv2i64_1:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: ptrue p1.b
; CHECK-NEXT: st1d { z0.d }, p0, [sp]
; CHECK-NEXT: st1d { z1.d }, p0, [x8, #1, mul vl]
; CHECK-NEXT: addvl x8, x8, #1
; CHECK-NEXT: mov x9, #-8
; CHECK-NEXT: ld1b { z0.b }, p1/z, [x8, x9]
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: lastb d0, p0, z0.d
; CHECK-NEXT: insr z1.d, d0
; CHECK-NEXT: mov z0.d, z1.d
; CHECK-NEXT: ret
%res = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 -1)
ret <vscale x 2 x i64> %res
@ -860,18 +925,10 @@ define <vscale x 8 x half> @splice_nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x
define <vscale x 8 x half> @splice_nxv8f16_1(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
; CHECK-LABEL: splice_nxv8f16_1:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: ptrue p1.b
; CHECK-NEXT: st1h { z0.h }, p0, [sp]
; CHECK-NEXT: st1h { z1.h }, p0, [x8, #1, mul vl]
; CHECK-NEXT: addvl x8, x8, #1
; CHECK-NEXT: mov x9, #-2
; CHECK-NEXT: ld1b { z0.b }, p1/z, [x8, x9]
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: lastb h0, p0, z0.h
; CHECK-NEXT: insr z1.h, h0
; CHECK-NEXT: mov z0.d, z1.d
; CHECK-NEXT: ret
%res = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 -1)
ret <vscale x 8 x half> %res
@ -924,18 +981,10 @@ define <vscale x 4 x float> @splice_nxv4f32(<vscale x 4 x float> %a, <vscale x 4
define <vscale x 4 x float> @splice_nxv4f32_1(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
; CHECK-LABEL: splice_nxv4f32_1:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: ptrue p1.b
; CHECK-NEXT: st1w { z0.s }, p0, [sp]
; CHECK-NEXT: st1w { z1.s }, p0, [x8, #1, mul vl]
; CHECK-NEXT: addvl x8, x8, #1
; CHECK-NEXT: mov x9, #-4
; CHECK-NEXT: ld1b { z0.b }, p1/z, [x8, x9]
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: lastb s0, p0, z0.s
; CHECK-NEXT: insr z1.s, s0
; CHECK-NEXT: mov z0.d, z1.d
; CHECK-NEXT: ret
%res = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 -1)
ret <vscale x 4 x float> %res
@ -988,18 +1037,10 @@ define <vscale x 2 x double> @splice_nxv2f64(<vscale x 2 x double> %a, <vscale x
define <vscale x 2 x double> @splice_nxv2f64_1(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
; CHECK-LABEL: splice_nxv2f64_1:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: ptrue p1.b
; CHECK-NEXT: st1d { z0.d }, p0, [sp]
; CHECK-NEXT: st1d { z1.d }, p0, [x8, #1, mul vl]
; CHECK-NEXT: addvl x8, x8, #1
; CHECK-NEXT: mov x9, #-8
; CHECK-NEXT: ld1b { z0.b }, p1/z, [x8, x9]
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: lastb d0, p0, z0.d
; CHECK-NEXT: insr z1.d, d0
; CHECK-NEXT: mov z0.d, z1.d
; CHECK-NEXT: ret
%res = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 -1)
ret <vscale x 2 x double> %res
@ -1033,22 +1074,13 @@ define <vscale x 2 x double> @splice_nxv2f64_clamped(<vscale x 2 x double> %a, <
define <vscale x 2 x i1> @splice_nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b) #0 {
; CHECK-LABEL: splice_nxv2i1:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: mov z0.d, p0/z, #1 // =0x1
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: lastb d0, p0, z0.d
; CHECK-NEXT: mov z1.d, p1/z, #1 // =0x1
; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: ptrue p1.b
; CHECK-NEXT: st1d { z0.d }, p0, [sp]
; CHECK-NEXT: st1d { z1.d }, p0, [x8, #1, mul vl]
; CHECK-NEXT: addvl x8, x8, #1
; CHECK-NEXT: mov x9, #-8
; CHECK-NEXT: ld1b { z0.b }, p1/z, [x8, x9]
; CHECK-NEXT: and z0.d, z0.d, #0x1
; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: insr z1.d, d0
; CHECK-NEXT: and z1.d, z1.d, #0x1
; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0
; CHECK-NEXT: ret
%res = call <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b, i32 -1)
ret <vscale x 2 x i1> %res
@ -1058,22 +1090,13 @@ define <vscale x 2 x i1> @splice_nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1>
define <vscale x 4 x i1> @splice_nxv4i1(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b) #0 {
; CHECK-LABEL: splice_nxv4i1:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: mov z0.s, p0/z, #1 // =0x1
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: lastb s0, p0, z0.s
; CHECK-NEXT: mov z1.s, p1/z, #1 // =0x1
; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: ptrue p1.b
; CHECK-NEXT: st1w { z0.s }, p0, [sp]
; CHECK-NEXT: st1w { z1.s }, p0, [x8, #1, mul vl]
; CHECK-NEXT: addvl x8, x8, #1
; CHECK-NEXT: mov x9, #-4
; CHECK-NEXT: ld1b { z0.b }, p1/z, [x8, x9]
; CHECK-NEXT: and z0.s, z0.s, #0x1
; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: insr z1.s, s0
; CHECK-NEXT: and z1.s, z1.s, #0x1
; CHECK-NEXT: cmpne p0.s, p0/z, z1.s, #0
; CHECK-NEXT: ret
%res = call <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b, i32 -1)
ret <vscale x 4 x i1> %res
@ -1083,22 +1106,13 @@ define <vscale x 4 x i1> @splice_nxv4i1(<vscale x 4 x i1> %a, <vscale x 4 x i1>
define <vscale x 8 x i1> @splice_nxv8i1(<vscale x 8 x i1> %a, <vscale x 8 x i1> %b) #0 {
; CHECK-LABEL: splice_nxv8i1:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: mov z0.h, p0/z, #1 // =0x1
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: lastb h0, p0, z0.h
; CHECK-NEXT: mov z1.h, p1/z, #1 // =0x1
; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: ptrue p1.b
; CHECK-NEXT: st1h { z0.h }, p0, [sp]
; CHECK-NEXT: st1h { z1.h }, p0, [x8, #1, mul vl]
; CHECK-NEXT: addvl x8, x8, #1
; CHECK-NEXT: mov x9, #-2
; CHECK-NEXT: ld1b { z0.b }, p1/z, [x8, x9]
; CHECK-NEXT: and z0.h, z0.h, #0x1
; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: insr z1.h, h0
; CHECK-NEXT: and z1.h, z1.h, #0x1
; CHECK-NEXT: cmpne p0.h, p0/z, z1.h, #0
; CHECK-NEXT: ret
%res = call <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1> %a, <vscale x 8 x i1> %b, i32 -1)
ret <vscale x 8 x i1> %res
@ -1108,21 +1122,13 @@ define <vscale x 8 x i1> @splice_nxv8i1(<vscale x 8 x i1> %a, <vscale x 8 x i1>
define <vscale x 16 x i1> @splice_nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b) #0 {
; CHECK-LABEL: splice_nxv16i1:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: mov z0.b, p0/z, #1 // =0x1
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: lastb b0, p0, z0.b
; CHECK-NEXT: mov z1.b, p1/z, #1 // =0x1
; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: st1b { z0.b }, p0, [sp]
; CHECK-NEXT: st1b { z1.b }, p0, [x8, #1, mul vl]
; CHECK-NEXT: addvl x8, x8, #1
; CHECK-NEXT: mov x9, #-1
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x8, x9]
; CHECK-NEXT: and z0.b, z0.b, #0x1
; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: insr z1.b, b0
; CHECK-NEXT: and z1.b, z1.b, #0x1
; CHECK-NEXT: cmpne p0.b, p0/z, z1.b, #0
; CHECK-NEXT: ret
%res = call <vscale x 16 x i1> @llvm.experimental.vector.splice.nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, i32 -1)
ret <vscale x 16 x i1> %res
@ -1217,7 +1223,10 @@ declare <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8
declare <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, i32)
declare <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32>, <vscale x 8 x i32>, i32)
declare <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, i32)
declare <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half>, i32)
declare <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half>, i32)
declare <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, i32)
declare <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float>, i32)
declare <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, i32)
declare <vscale x 16 x float> @llvm.experimental.vector.splice.nxv16f32(<vscale x 16 x float>, <vscale x 16 x float>, i32)
declare <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, i32)