From b2ba90aa1572e8017214155162b0d28c163f24cc Mon Sep 17 00:00:00 2001
From: Fraser Cormack
Date: Wed, 17 Feb 2021 15:57:59 +0000
Subject: [PATCH] [RISCV] Begin to support more subvector inserts/extracts

This patch adds support for INSERT_SUBVECTOR and EXTRACT_SUBVECTOR
(nominally where both operands are scalable vector types) where the
vector, subvector, and index align sufficiently to allow decomposition
to subregister manipulation:

* For extracts, the extracted subvector must correctly align with the
  lower elements of a vector register.
* For inserts, the inserted subvector must be at least one full vector
  register, and correctly align as above.

This approach should work for fixed-length vector insertion/extraction
too, but that will come later.

Reviewed By: craig.topper, khchen, arcbbb

Differential Revision: https://reviews.llvm.org/D96873
---
 lib/Target/RISCV/RISCVISelDAGToDAG.cpp      | 199 +++++++++++++----
 test/CodeGen/RISCV/rvv/extract-subvector.ll | 226 ++++++++++++++++++++
 test/CodeGen/RISCV/rvv/insert-subvector.ll  | 206 ++++++++++++++++++
 3 files changed, 588 insertions(+), 43 deletions(-)
 create mode 100644 test/CodeGen/RISCV/rvv/extract-subvector.ll
 create mode 100644 test/CodeGen/RISCV/rvv/insert-subvector.ll

diff --git a/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index 2c82704428f..1c3d0cfc2fb 100644
--- a/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -382,6 +382,48 @@ void RISCVDAGToDAGISel::selectVSXSEG(SDNode *Node, unsigned IntNo,
   ReplaceNode(Node, Store);
 }
 
+static unsigned getRegClassIDForVecVT(MVT VT) {
+  if (VT.getVectorElementType() == MVT::i1)
+    return RISCV::VRRegClassID;
+  return getRegClassIDForLMUL(getLMUL(VT));
+}
+
+// Attempt to decompose a subvector insert/extract between VecVT and
+// SubVecVT via subregister indices. Returns the subregister index that
+// can perform the subvector insert/extract with the given element index, as
+// well as the index corresponding to any leftover subvectors that must be
+// further inserted/extracted within the register class for SubVecVT.
+static std::pair<unsigned, unsigned>
+decomposeSubvectorInsertExtractToSubRegs(MVT VecVT, MVT SubVecVT,
+                                         unsigned InsertExtractIdx,
+                                         const RISCVRegisterInfo *TRI) {
+  static_assert((RISCV::VRM8RegClassID > RISCV::VRM4RegClassID &&
+                 RISCV::VRM4RegClassID > RISCV::VRM2RegClassID &&
+                 RISCV::VRM2RegClassID > RISCV::VRRegClassID),
+                "Register classes not ordered");
+  unsigned VecRegClassID = getRegClassIDForVecVT(VecVT);
+  unsigned SubRegClassID = getRegClassIDForVecVT(SubVecVT);
+  // Try to compose a subregister index that takes us from the incoming
+  // LMUL>1 register class down to the outgoing one. At each step we halve
+  // the LMUL:
+  //   nxv16i32@12 -> nxv2i32: sub_vrm4_1_then_sub_vrm2_1_then_sub_vrm1_0
+  // Note that this is not guaranteed to find a subregister index, such as
+  // when we are extracting from one VR type to another.
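+  // As a worked example of the composition above, consider extracting
+  // nxv2i32 from nxv16i32 at element index 12 (LMUL=8 down to LMUL=1):
+  //   1) halve to nxv8i32: 12 >= 8, so pick sub_vrm4_1 and subtract 8 (Idx=4)
+  //   2) halve to nxv4i32:  4 >= 4, so compose sub_vrm2_1 and subtract 4 (Idx=0)
+  //   3) halve to nxv2i32:  0 <  2, so compose sub_vrm1_0 (Idx stays 0)
+  // giving sub_vrm4_1_then_sub_vrm2_1_then_sub_vrm1_0 and a leftover index
+  // of 0.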
+  unsigned SubRegIdx = RISCV::NoSubRegister;
+  for (const unsigned RCID :
+       {RISCV::VRM4RegClassID, RISCV::VRM2RegClassID, RISCV::VRRegClassID})
+    if (VecRegClassID > RCID && SubRegClassID <= RCID) {
+      VecVT = VecVT.getHalfNumVectorElementsVT();
+      bool IsHi =
+          InsertExtractIdx >= VecVT.getVectorElementCount().getKnownMinValue();
+      SubRegIdx = TRI->composeSubRegIndices(SubRegIdx,
+                                            getSubregIndexByMVT(VecVT, IsHi));
+      if (IsHi)
+        InsertExtractIdx -= VecVT.getVectorElementCount().getKnownMinValue();
+    }
+  return {SubRegIdx, InsertExtractIdx};
+}
+
 void RISCVDAGToDAGISel::Select(SDNode *Node) {
   // If we have a custom node, we have already selected.
   if (Node->isMachineOpcode()) {
@@ -704,56 +746,127 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
     break;
   }
   case ISD::INSERT_SUBVECTOR: {
-    // Bail when not a "cast" like insert_subvector.
-    if (Node->getConstantOperandVal(2) != 0)
-      break;
-    if (!Node->getOperand(0).isUndef())
-      break;
+    SDValue V = Node->getOperand(0);
+    SDValue SubV = Node->getOperand(1);
+    SDLoc DL(SubV);
+    auto Idx = Node->getConstantOperandVal(2);
+    MVT SubVecVT = Node->getOperand(1).getSimpleValueType();
 
-    // Bail when normal isel should do the job.
-    MVT InVT = Node->getOperand(1).getSimpleValueType();
-    if (VT.isFixedLengthVector() || InVT.isScalableVector())
-      break;
+    // TODO: This method of selecting INSERT_SUBVECTOR should work
+    // with any type of insertion (fixed <-> scalable) but we don't yet
+    // correctly identify the canonical register class for fixed-length types.
+    // For now, keep the two paths separate.
+    if (VT.isScalableVector() && SubVecVT.isScalableVector()) {
+      bool IsFullVecReg = false;
+      switch (getLMUL(SubVecVT)) {
+      default:
+        break;
+      case RISCVVLMUL::LMUL_1:
+      case RISCVVLMUL::LMUL_2:
+      case RISCVVLMUL::LMUL_4:
+      case RISCVVLMUL::LMUL_8:
+        IsFullVecReg = true;
+        break;
+      }
 
-    unsigned RegClassID;
-    if (VT.getVectorElementType() == MVT::i1)
-      RegClassID = RISCV::VRRegClassID;
-    else
-      RegClassID = getRegClassIDForLMUL(getLMUL(VT));
+      // If the subvector doesn't occupy a full vector register then we can't
+      // insert it purely using subregister manipulation. We must not clobber
+      // the untouched elements (say, in the upper half of the VR register).
+      if (!IsFullVecReg)
+        break;
 
-    SDValue V = Node->getOperand(1);
-    SDLoc DL(V);
-    SDValue RC =
-        CurDAG->getTargetConstant(RegClassID, DL, Subtarget->getXLenVT());
-    SDNode *NewNode =
-        CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, DL, VT, V, RC);
-    ReplaceNode(Node, NewNode);
-    return;
+      const auto *TRI = Subtarget->getRegisterInfo();
+      unsigned SubRegIdx;
+      std::tie(SubRegIdx, Idx) =
+          decomposeSubvectorInsertExtractToSubRegs(VT, SubVecVT, Idx, TRI);
+
+      // If the Idx hasn't been completely eliminated then this is a subvector
+      // insert which doesn't naturally align to a vector register. These must
+      // be handled using instructions to manipulate the vector registers.
+      if (Idx != 0)
+        break;
+
+      SDNode *NewNode = CurDAG->getMachineNode(
+          TargetOpcode::INSERT_SUBREG, DL, VT, V, SubV,
+          CurDAG->getTargetConstant(SubRegIdx, DL, Subtarget->getXLenVT()));
+      return ReplaceNode(Node, NewNode);
+    }
+
+    if (VT.isScalableVector() && SubVecVT.isFixedLengthVector()) {
+      // Bail when not a "cast" like insert_subvector.
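+      // For example (an illustrative case rather than one taken from the
+      // tests): inserting a fixed-length v2i32 into element 0 of an undef
+      // nxv2i32 only re-interprets the same register bits in the scalable
+      // register class, so the COPY_TO_REGCLASS emitted below is sufficient.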
+ if (Idx != 0) + break; + if (!Node->getOperand(0).isUndef()) + break; + + unsigned RegClassID = getRegClassIDForVecVT(VT); + + SDValue RC = + CurDAG->getTargetConstant(RegClassID, DL, Subtarget->getXLenVT()); + SDNode *NewNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, + DL, VT, SubV, RC); + ReplaceNode(Node, NewNode); + return; + } + break; } case ISD::EXTRACT_SUBVECTOR: { - // Bail when not a "cast" like extract_subvector. - if (Node->getConstantOperandVal(1) != 0) - break; - - // Bail when normal isel can do the job. - MVT InVT = Node->getOperand(0).getSimpleValueType(); - if (VT.isScalableVector() || InVT.isFixedLengthVector()) - break; - - unsigned RegClassID; - if (InVT.getVectorElementType() == MVT::i1) - RegClassID = RISCV::VRRegClassID; - else - RegClassID = getRegClassIDForLMUL(getLMUL(InVT)); - SDValue V = Node->getOperand(0); + auto Idx = Node->getConstantOperandVal(1); + MVT InVT = Node->getOperand(0).getSimpleValueType(); SDLoc DL(V); - SDValue RC = - CurDAG->getTargetConstant(RegClassID, DL, Subtarget->getXLenVT()); - SDNode *NewNode = - CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, DL, VT, V, RC); - ReplaceNode(Node, NewNode); - return; + + // TODO: This method of selecting EXTRACT_SUBVECTOR should work + // with any type of extraction (fixed <-> scalable) but we don't yet + // correctly identify the canonical register class for fixed-length types. + // For now, keep the two paths separate. + if (VT.isScalableVector() && InVT.isScalableVector()) { + const auto *TRI = Subtarget->getRegisterInfo(); + unsigned SubRegIdx; + std::tie(SubRegIdx, Idx) = + decomposeSubvectorInsertExtractToSubRegs(InVT, VT, Idx, TRI); + + // If the Idx hasn't been completely eliminated then this is a subvector + // extract which doesn't naturally align to a vector register. These must + // be handled using instructions to manipulate the vector registers. + if (Idx != 0) + break; + + // If we haven't set a SubRegIdx, then we must be going between LMUL<=1 + // types (VR -> VR). This can be done as a copy. + if (SubRegIdx == RISCV::NoSubRegister) { + unsigned RegClassID = getRegClassIDForVecVT(VT); + unsigned InRegClassID = getRegClassIDForVecVT(InVT); + assert(RegClassID == InRegClassID && + RegClassID == RISCV::VRRegClassID && + "Unexpected subvector extraction"); + SDValue RC = + CurDAG->getTargetConstant(InRegClassID, DL, Subtarget->getXLenVT()); + SDNode *NewNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, + DL, VT, V, RC); + return ReplaceNode(Node, NewNode); + } + SDNode *NewNode = CurDAG->getMachineNode( + TargetOpcode::EXTRACT_SUBREG, DL, VT, V, + CurDAG->getTargetConstant(SubRegIdx, DL, Subtarget->getXLenVT())); + return ReplaceNode(Node, NewNode); + } + + if (VT.isFixedLengthVector() && InVT.isScalableVector()) { + // Bail when not a "cast" like extract_subvector. 
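+      // For example (likewise illustrative): extracting a fixed-length v2i32
+      // from element 0 of an nxv2i32 reads the low elements of the same
+      // register, so the COPY_TO_REGCLASS emitted below is sufficient.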
+ if (Idx != 0) + break; + + unsigned InRegClassID = getRegClassIDForVecVT(InVT); + + SDValue RC = + CurDAG->getTargetConstant(InRegClassID, DL, Subtarget->getXLenVT()); + SDNode *NewNode = + CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, DL, VT, V, RC); + ReplaceNode(Node, NewNode); + return; + } + break; } } diff --git a/test/CodeGen/RISCV/rvv/extract-subvector.ll b/test/CodeGen/RISCV/rvv/extract-subvector.ll new file mode 100644 index 00000000000..c14abab5440 --- /dev/null +++ b/test/CodeGen/RISCV/rvv/extract-subvector.ll @@ -0,0 +1,226 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple riscv64 -mattr=+experimental-v -verify-machineinstrs < %s | FileCheck %s + +define @extract_nxv8i32_nxv4i32_0( %vec) { +; CHECK-LABEL: extract_nxv8i32_nxv4i32_0: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $v8m2 killed $v8m2 killed $v8m4 +; CHECK-NEXT: ret + %c = call @llvm.experimental.vector.extract.nxv4i32.nxv8i32( %vec, i64 0) + ret %c +} + +define @extract_nxv8i32_nxv4i32_4( %vec) { +; CHECK-LABEL: extract_nxv8i32_nxv4i32_4: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv2r.v v8, v10 +; CHECK-NEXT: ret + %c = call @llvm.experimental.vector.extract.nxv4i32.nxv8i32( %vec, i64 4) + ret %c +} + +define @extract_nxv8i32_nxv2i32_0( %vec) { +; CHECK-LABEL: extract_nxv8i32_nxv2i32_0: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $v8 killed $v8 killed $v8m4 +; CHECK-NEXT: ret + %c = call @llvm.experimental.vector.extract.nxv2i32.nxv8i32( %vec, i64 0) + ret %c +} + +define @extract_nxv8i32_nxv2i32_2( %vec) { +; CHECK-LABEL: extract_nxv8i32_nxv2i32_2: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret + %c = call @llvm.experimental.vector.extract.nxv2i32.nxv8i32( %vec, i64 2) + ret %c +} + +define @extract_nxv8i32_nxv2i32_4( %vec) { +; CHECK-LABEL: extract_nxv8i32_nxv2i32_4: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret + %c = call @llvm.experimental.vector.extract.nxv2i32.nxv8i32( %vec, i64 4) + ret %c +} + +define @extract_nxv8i32_nxv2i32_6( %vec) { +; CHECK-LABEL: extract_nxv8i32_nxv2i32_6: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: ret + %c = call @llvm.experimental.vector.extract.nxv2i32.nxv8i32( %vec, i64 6) + ret %c +} + +define @extract_nxv16i32_nxv8i32_0( %vec) { +; CHECK-LABEL: extract_nxv16i32_nxv8i32_0: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $v8m4 killed $v8m4 killed $v8m8 +; CHECK-NEXT: ret + %c = call @llvm.experimental.vector.extract.nxv8i32.nxv16i32( %vec, i64 0) + ret %c +} + +define @extract_nxv16i32_nxv8i32_8( %vec) { +; CHECK-LABEL: extract_nxv16i32_nxv8i32_8: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv4r.v v8, v12 +; CHECK-NEXT: ret + %c = call @llvm.experimental.vector.extract.nxv8i32.nxv16i32( %vec, i64 8) + ret %c +} + +define @extract_nxv16i32_nxv4i32_0( %vec) { +; CHECK-LABEL: extract_nxv16i32_nxv4i32_0: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $v8m2 killed $v8m2 killed $v8m8 +; CHECK-NEXT: ret + %c = call @llvm.experimental.vector.extract.nxv4i32.nxv16i32( %vec, i64 0) + ret %c +} + +define @extract_nxv16i32_nxv4i32_4( %vec) { +; CHECK-LABEL: extract_nxv16i32_nxv4i32_4: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv2r.v v8, v10 +; CHECK-NEXT: ret + %c = call @llvm.experimental.vector.extract.nxv4i32.nxv16i32( %vec, i64 4) + ret %c +} + +define @extract_nxv16i32_nxv4i32_8( %vec) { +; CHECK-LABEL: extract_nxv16i32_nxv4i32_8: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: ret + %c = call @llvm.experimental.vector.extract.nxv4i32.nxv16i32( %vec, i64 8) + ret %c 
+} + +define @extract_nxv16i32_nxv4i32_12( %vec) { +; CHECK-LABEL: extract_nxv16i32_nxv4i32_12: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv2r.v v8, v14 +; CHECK-NEXT: ret + %c = call @llvm.experimental.vector.extract.nxv4i32.nxv16i32( %vec, i64 12) + ret %c +} + +define @extract_nxv16i32_nxv2i32_0( %vec) { +; CHECK-LABEL: extract_nxv16i32_nxv2i32_0: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $v8 killed $v8 killed $v8m8 +; CHECK-NEXT: ret + %c = call @llvm.experimental.vector.extract.nxv2i32.nxv16i32( %vec, i64 0) + ret %c +} + +define @extract_nxv16i32_nxv2i32_2( %vec) { +; CHECK-LABEL: extract_nxv16i32_nxv2i32_2: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret + %c = call @llvm.experimental.vector.extract.nxv2i32.nxv16i32( %vec, i64 2) + ret %c +} + +define @extract_nxv16i32_nxv2i32_4( %vec) { +; CHECK-LABEL: extract_nxv16i32_nxv2i32_4: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret + %c = call @llvm.experimental.vector.extract.nxv2i32.nxv16i32( %vec, i64 4) + ret %c +} + +define @extract_nxv16i32_nxv2i32_6( %vec) { +; CHECK-LABEL: extract_nxv16i32_nxv2i32_6: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: ret + %c = call @llvm.experimental.vector.extract.nxv2i32.nxv16i32( %vec, i64 6) + ret %c +} + +define @extract_nxv16i32_nxv2i32_8( %vec) { +; CHECK-LABEL: extract_nxv16i32_nxv2i32_8: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v8, v12 +; CHECK-NEXT: ret + %c = call @llvm.experimental.vector.extract.nxv2i32.nxv16i32( %vec, i64 8) + ret %c +} + +define @extract_nxv16i32_nxv2i32_10( %vec) { +; CHECK-LABEL: extract_nxv16i32_nxv2i32_10: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v8, v13 +; CHECK-NEXT: ret + %c = call @llvm.experimental.vector.extract.nxv2i32.nxv16i32( %vec, i64 10) + ret %c +} + +define @extract_nxv16i32_nxv2i32_12( %vec) { +; CHECK-LABEL: extract_nxv16i32_nxv2i32_12: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v8, v14 +; CHECK-NEXT: ret + %c = call @llvm.experimental.vector.extract.nxv2i32.nxv16i32( %vec, i64 12) + ret %c +} + +define @extract_nxv16i32_nxv2i32_14( %vec) { +; CHECK-LABEL: extract_nxv16i32_nxv2i32_14: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v8, v15 +; CHECK-NEXT: ret + %c = call @llvm.experimental.vector.extract.nxv2i32.nxv16i32( %vec, i64 14) + ret %c +} + +define @extract_nxv16i32_nxv1i32_0( %vec) { +; CHECK-LABEL: extract_nxv16i32_nxv1i32_0: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $v8 killed $v8 killed $v8m8 +; CHECK-NEXT: ret + %c = call @llvm.experimental.vector.extract.nxv1i32.nxv16i32( %vec, i64 0) + ret %c +} + +; TODO: Extracts that don't align to a vector register are not yet supported. +; In this case we want to extract the upper half of the lowest VR subregister +; in the LMUL group. 
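+; One plausible lowering for such cases (not implemented by this patch) is a
+; vslidedown by the element offset followed by extracting the low VR
+; subregister as above.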
+; define @extract_nxv16i32_nxv1i32_1( %vec) { +; %c = call @llvm.experimental.vector.extract.nxv1i32.nxv16i32( %vec, i64 1) +; ret %c +; } + +define @extract_nxv16i32_nxv1i32_2( %vec) { +; CHECK-LABEL: extract_nxv16i32_nxv1i32_2: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret + %c = call @llvm.experimental.vector.extract.nxv1i32.nxv16i32( %vec, i64 2) + ret %c +} + +define @extract_nxv2i32_nxv1i32_0( %vec) { +; CHECK-LABEL: extract_nxv2i32_nxv1i32_0: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %c = call @llvm.experimental.vector.extract.nxv1i32.nxv2i32( %vec, i64 0) + ret %c +} + +declare @llvm.experimental.vector.extract.nxv1i32.nxv2i32( %vec, i64 %idx) + +declare @llvm.experimental.vector.extract.nxv2i32.nxv8i32( %vec, i64 %idx) +declare @llvm.experimental.vector.extract.nxv4i32.nxv8i32( %vec, i64 %idx) + +declare @llvm.experimental.vector.extract.nxv1i32.nxv16i32( %vec, i64 %idx) +declare @llvm.experimental.vector.extract.nxv2i32.nxv16i32( %vec, i64 %idx) +declare @llvm.experimental.vector.extract.nxv4i32.nxv16i32( %vec, i64 %idx) +declare @llvm.experimental.vector.extract.nxv8i32.nxv16i32( %vec, i64 %idx) diff --git a/test/CodeGen/RISCV/rvv/insert-subvector.ll b/test/CodeGen/RISCV/rvv/insert-subvector.ll new file mode 100644 index 00000000000..6538ec5dd06 --- /dev/null +++ b/test/CodeGen/RISCV/rvv/insert-subvector.ll @@ -0,0 +1,206 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple riscv64 -mattr=+experimental-v -verify-machineinstrs < %s | FileCheck %s + +define @insert_nxv8i32_nxv4i32_0( %vec, %subvec) { +; CHECK-LABEL: insert_nxv8i32_nxv4i32_0: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: ret + %v = call @llvm.experimental.vector.insert.nxv4i32.nxv8i32( %vec, %subvec, i64 0) + ret %v +} + +define @insert_nxv8i32_nxv4i32_4( %vec, %subvec) { +; CHECK-LABEL: insert_nxv8i32_nxv4i32_4: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv2r.v v10, v12 +; CHECK-NEXT: ret + %v = call @llvm.experimental.vector.insert.nxv4i32.nxv8i32( %vec, %subvec, i64 4) + ret %v +} + +define @insert_nxv8i32_nxv2i32_0( %vec, %subvec) { +; CHECK-LABEL: insert_nxv8i32_nxv2i32_0: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v8, v12 +; CHECK-NEXT: ret + %v = call @llvm.experimental.vector.insert.nxv2i32.nxv8i32( %vec, %subvec, i64 0) + ret %v +} + +define @insert_nxv8i32_nxv2i32_2( %vec, %subvec) { +; CHECK-LABEL: insert_nxv8i32_nxv2i32_2: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v9, v12 +; CHECK-NEXT: ret + %v = call @llvm.experimental.vector.insert.nxv2i32.nxv8i32( %vec, %subvec, i64 2) + ret %v +} + +define @insert_nxv8i32_nxv2i32_4( %vec, %subvec) { +; CHECK-LABEL: insert_nxv8i32_nxv2i32_4: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v10, v12 +; CHECK-NEXT: ret + %v = call @llvm.experimental.vector.insert.nxv2i32.nxv8i32( %vec, %subvec, i64 4) + ret %v +} + +define @insert_nxv8i32_nxv2i32_6( %vec, %subvec) { +; CHECK-LABEL: insert_nxv8i32_nxv2i32_6: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v11, v12 +; CHECK-NEXT: ret + %v = call @llvm.experimental.vector.insert.nxv2i32.nxv8i32( %vec, %subvec, i64 6) + ret %v +} + +define @insert_nxv16i32_nxv8i32_0( %vec, %subvec) { +; CHECK-LABEL: insert_nxv16i32_nxv8i32_0: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv4r.v v8, v16 +; CHECK-NEXT: ret + %v = call @llvm.experimental.vector.insert.nxv8i32.nxv16i32( %vec, %subvec, i64 0) + ret %v +} + +define @insert_nxv16i32_nxv8i32_8( %vec, %subvec) { +; CHECK-LABEL: insert_nxv16i32_nxv8i32_8: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv4r.v v12, v16 +; CHECK-NEXT: ret + 
%v = call @llvm.experimental.vector.insert.nxv8i32.nxv16i32( %vec, %subvec, i64 8) + ret %v +} + +define @insert_nxv16i32_nxv4i32_0( %vec, %subvec) { +; CHECK-LABEL: insert_nxv16i32_nxv4i32_0: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv2r.v v8, v16 +; CHECK-NEXT: ret + %v = call @llvm.experimental.vector.insert.nxv4i32.nxv16i32( %vec, %subvec, i64 0) + ret %v +} + +define @insert_nxv16i32_nxv4i32_4( %vec, %subvec) { +; CHECK-LABEL: insert_nxv16i32_nxv4i32_4: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv2r.v v10, v16 +; CHECK-NEXT: ret + %v = call @llvm.experimental.vector.insert.nxv4i32.nxv16i32( %vec, %subvec, i64 4) + ret %v +} + +define @insert_nxv16i32_nxv4i32_8( %vec, %subvec) { +; CHECK-LABEL: insert_nxv16i32_nxv4i32_8: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv2r.v v12, v16 +; CHECK-NEXT: ret + %v = call @llvm.experimental.vector.insert.nxv4i32.nxv16i32( %vec, %subvec, i64 8) + ret %v +} + +define @insert_nxv16i32_nxv4i32_12( %vec, %subvec) { +; CHECK-LABEL: insert_nxv16i32_nxv4i32_12: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv2r.v v14, v16 +; CHECK-NEXT: ret + %v = call @llvm.experimental.vector.insert.nxv4i32.nxv16i32( %vec, %subvec, i64 12) + ret %v +} + +define @insert_nxv16i32_nxv2i32_0( %vec, %subvec) { +; CHECK-LABEL: insert_nxv16i32_nxv2i32_0: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v8, v16 +; CHECK-NEXT: ret + %v = call @llvm.experimental.vector.insert.nxv2i32.nxv16i32( %vec, %subvec, i64 0) + ret %v +} + +define @insert_nxv16i32_nxv2i32_2( %vec, %subvec) { +; CHECK-LABEL: insert_nxv16i32_nxv2i32_2: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v9, v16 +; CHECK-NEXT: ret + %v = call @llvm.experimental.vector.insert.nxv2i32.nxv16i32( %vec, %subvec, i64 2) + ret %v +} + +define @insert_nxv16i32_nxv2i32_4( %vec, %subvec) { +; CHECK-LABEL: insert_nxv16i32_nxv2i32_4: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v10, v16 +; CHECK-NEXT: ret + %v = call @llvm.experimental.vector.insert.nxv2i32.nxv16i32( %vec, %subvec, i64 4) + ret %v +} + +define @insert_nxv16i32_nxv2i32_6( %vec, %subvec) { +; CHECK-LABEL: insert_nxv16i32_nxv2i32_6: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v11, v16 +; CHECK-NEXT: ret + %v = call @llvm.experimental.vector.insert.nxv2i32.nxv16i32( %vec, %subvec, i64 6) + ret %v +} + +define @insert_nxv16i32_nxv2i32_8( %vec, %subvec) { +; CHECK-LABEL: insert_nxv16i32_nxv2i32_8: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v12, v16 +; CHECK-NEXT: ret + %v = call @llvm.experimental.vector.insert.nxv2i32.nxv16i32( %vec, %subvec, i64 8) + ret %v +} + +define @insert_nxv16i32_nxv2i32_10( %vec, %subvec) { +; CHECK-LABEL: insert_nxv16i32_nxv2i32_10: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v13, v16 +; CHECK-NEXT: ret + %v = call @llvm.experimental.vector.insert.nxv2i32.nxv16i32( %vec, %subvec, i64 10) + ret %v +} + +define @insert_nxv16i32_nxv2i32_12( %vec, %subvec) { +; CHECK-LABEL: insert_nxv16i32_nxv2i32_12: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v14, v16 +; CHECK-NEXT: ret + %v = call @llvm.experimental.vector.insert.nxv2i32.nxv16i32( %vec, %subvec, i64 12) + ret %v +} + +define @insert_nxv16i32_nxv2i32_14( %vec, %subvec) { +; CHECK-LABEL: insert_nxv16i32_nxv2i32_14: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v15, v16 +; CHECK-NEXT: ret + %v = call @llvm.experimental.vector.insert.nxv2i32.nxv16i32( %vec, %subvec, i64 14) + ret %v +} + +; TODO: Inserts that are less than LMUL=1 are not yet supported. 
In this case
+; we need to mask out the unaffected elements (top half of the VR %subvec
+; register).
+;define <vscale x 16 x i32> @insert_nxv16i32_nxv1i32_0(<vscale x 16 x i32> %vec, <vscale x 1 x i32> %subvec) {
+;  %v = call <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv1i32.nxv16i32(<vscale x 16 x i32> %vec, <vscale x 1 x i32> %subvec, i64 0)
+;  ret <vscale x 16 x i32> %v
+;}
+
+; TODO: Inserts that don't align to a vector register are not yet supported.
+; In this case we want to insert the subvector into the upper half of the
+; lowest VR subregister in the LMUL group.
+;define <vscale x 16 x i32> @insert_nxv16i32_nxv1i32_1(<vscale x 16 x i32> %vec, <vscale x 1 x i32> %subvec) {
+;  %v = call <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv1i32.nxv16i32(<vscale x 16 x i32> %vec, <vscale x 1 x i32> %subvec, i64 1)
+;  ret <vscale x 16 x i32> %v
+;}
+
+declare <vscale x 8 x i32> @llvm.experimental.vector.insert.nxv2i32.nxv8i32(<vscale x 8 x i32>, <vscale x 2 x i32>, i64 %idx)
+declare <vscale x 8 x i32> @llvm.experimental.vector.insert.nxv4i32.nxv8i32(<vscale x 8 x i32>, <vscale x 4 x i32>, i64 %idx)
+
+declare <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv1i32.nxv16i32(<vscale x 16 x i32>, <vscale x 1 x i32>, i64 %idx)
+declare <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv2i32.nxv16i32(<vscale x 16 x i32>, <vscale x 2 x i32>, i64 %idx)
+declare <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv4i32.nxv16i32(<vscale x 16 x i32>, <vscale x 4 x i32>, i64 %idx)
+declare <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv8i32.nxv16i32(<vscale x 16 x i32>, <vscale x 8 x i32>, i64 %idx)