diff --git a/lib/Target/BPF/BPFISelLowering.cpp b/lib/Target/BPF/BPFISelLowering.cpp
index 5c9f51a3924..9272cf692dc 100644
--- a/lib/Target/BPF/BPFISelLowering.cpp
+++ b/lib/Target/BPF/BPFISelLowering.cpp
@@ -33,6 +33,10 @@ using namespace llvm;
 
 #define DEBUG_TYPE "bpf-lower"
 
+static cl::opt<bool> BPFExpandMemcpyInOrder("bpf-expand-memcpy-in-order",
+  cl::Hidden, cl::init(false),
+  cl::desc("Expand memcpy into load/store pairs in order"));
+
 static void fail(const SDLoc &DL, SelectionDAG &DAG, const Twine &Msg) {
   MachineFunction &MF = DAG.getMachineFunction();
   DAG.getContext()->diagnose(
@@ -132,10 +136,30 @@ BPFTargetLowering::BPFTargetLowering(const TargetMachine &TM,
   setMinFunctionAlignment(3);
   setPrefFunctionAlignment(3);
 
-  // inline memcpy() for kernel to see explicit copy
-  MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 128;
-  MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 128;
-  MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 128;
+  if (BPFExpandMemcpyInOrder) {
+    // LLVM generic code will try to expand memcpy into load/store pairs at
+    // this stage, which is before quite a few optimization passes, so the
+    // loads and stores could potentially be moved apart from each other,
+    // which would cause trouble for the memcpy pattern matchers inside
+    // kernel eBPF JIT compilers.
+    //
+    // When -bpf-expand-memcpy-in-order is specified, we want to defer the
+    // expansion of memcpy to a later stage so those load/store pairs won't
+    // be touched and can be kept in order. Hence, we set MaxStoresPerMem*
+    // to zero to disable the generic getMemcpyLoadsAndStores code path, and
+    // ask LLVM to use the target expander EmitTargetCodeForMemcpy.
+    MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 0;
+    MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 0;
+    MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 0;
+  } else {
+    // inline memcpy() for kernel to see explicit copy
+    unsigned CommonMaxStores =
+        STI.getSelectionDAGInfo()->getCommonMaxStoresPerMemFunc();
+
+    MaxStoresPerMemset = MaxStoresPerMemsetOptSize = CommonMaxStores;
+    MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = CommonMaxStores;
+    MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = CommonMaxStores;
+  }
 
   // CPU/Feature control
   HasAlu32 = STI.getHasAlu32();
@@ -518,6 +542,8 @@ const char *BPFTargetLowering::getTargetNodeName(unsigned Opcode) const {
     return "BPFISD::BR_CC";
   case BPFISD::Wrapper:
     return "BPFISD::Wrapper";
+  case BPFISD::MEMCPY:
+    return "BPFISD::MEMCPY";
   }
   return nullptr;
 }
@@ -556,6 +582,37 @@ BPFTargetLowering::EmitSubregExt(MachineInstr &MI, MachineBasicBlock *BB,
   return PromotedReg2;
 }
 
+MachineBasicBlock *
+BPFTargetLowering::EmitInstrWithCustomInserterMemcpy(MachineInstr &MI,
+                                                     MachineBasicBlock *BB)
+    const {
+  MachineFunction *MF = MI.getParent()->getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+  MachineInstrBuilder MIB(*MF, MI);
+  unsigned ScratchReg;
+
+  // This function does custom insertion while lowering BPFISD::MEMCPY, which
+  // from the memcpy semantics has only two register operands: the copy source
+  // address and the copy destination address.
+  //
+  // Because we will expand BPFISD::MEMCPY into load/store pairs, we need a
+  // third scratch register to serve as the destination register of the loads
+  // and the source register of the stores.
+  //
+  // The scratch register is added with the Define | Dead | EarlyClobber
+  // flags. The EarlyClobber flag has the semantic property that the operand
+  // it is attached to is clobbered before the rest of the inputs are read.
+  // Hence it must be unique among the operands to the instruction. The
+  // Define flag is needed to convince the machine verifier that an undef
+  // value isn't a problem, since we load from memory into the register
+  // anyway. The Dead flag is needed because the value in the scratch
+  // register isn't meant to be used by any other instruction.
+  ScratchReg = MRI.createVirtualRegister(&BPF::GPRRegClass);
+  MIB.addReg(ScratchReg,
+             RegState::Define | RegState::Dead | RegState::EarlyClobber);
+
+  return BB;
+}
+
 MachineBasicBlock *
 BPFTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
                                                MachineBasicBlock *BB) const {
@@ -567,6 +624,8 @@ BPFTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
                        Opc == BPF::Select_32 ||
                        Opc == BPF::Select_32_64);
 
+  bool isMemcpyOp = Opc == BPF::MEMCPY;
+
 #ifndef NDEBUG
   bool isSelectRIOp = (Opc == BPF::Select_Ri ||
                        Opc == BPF::Select_Ri_64_32 ||
@@ -574,9 +633,13 @@ BPFTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
                        Opc == BPF::Select_Ri_32 ||
                        Opc == BPF::Select_Ri_32_64);
 
-  assert((isSelectRROp || isSelectRIOp) && "Unexpected instr type to insert");
+  assert((isSelectRROp || isSelectRIOp || isMemcpyOp) &&
+         "Unexpected instr type to insert");
 #endif
 
+  if (isMemcpyOp)
+    return EmitInstrWithCustomInserterMemcpy(MI, BB);
+
   bool is32BitCmp = (Opc == BPF::Select_32 ||
                      Opc == BPF::Select_32_64 ||
                      Opc == BPF::Select_Ri_32 ||
diff --git a/lib/Target/BPF/BPFISelLowering.h b/lib/Target/BPF/BPFISelLowering.h
index 3eb099cf369..0aa8b9ac57a 100644
--- a/lib/Target/BPF/BPFISelLowering.h
+++ b/lib/Target/BPF/BPFISelLowering.h
@@ -28,7 +28,8 @@ enum NodeType : unsigned {
   CALL,
   SELECT_CC,
   BR_CC,
-  Wrapper
+  Wrapper,
+  MEMCPY
 };
 }
@@ -110,6 +111,11 @@ private:
   unsigned EmitSubregExt(MachineInstr &MI, MachineBasicBlock *BB, unsigned Reg,
                          bool isSigned) const;
+
+  MachineBasicBlock *EmitInstrWithCustomInserterMemcpy(MachineInstr &MI,
+                                                       MachineBasicBlock *BB)
+      const;
+
 };
 }
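A note on the interaction above: for a constant-size memcpy, SelectionDAG tries the generic load/store expansion (bounded by MaxStoresPerMemcpy) before it consults the target's EmitTargetCodeForMemcpy hook, which is why zeroing the thresholds is enough to route every constant-size copy to the BPF expander added below. A minimal standalone sketch of that assumed two-step fallback (illustrative only, not part of the patch):

    #include <cstdio>

    enum class MemcpyLowering { GenericLoadStorePairs, TargetMEMCPYNode, LibCall };

    // Models the assumed fallback order for a constant-size memcpy:
    // generic load/store expansion first (capped by MaxStoresPerMemcpy),
    // then the target hook (capped here by the common 128-store budget),
    // and finally a library call.
    MemcpyLowering chooseLowering(unsigned NumStores, unsigned MaxStoresPerMemcpy,
                                  unsigned CommonMaxStores) {
      if (NumStores <= MaxStoresPerMemcpy)
        return MemcpyLowering::GenericLoadStorePairs;
      if (NumStores <= CommonMaxStores)
        return MemcpyLowering::TargetMEMCPYNode;
      return MemcpyLowering::LibCall;
    }

    int main() {
      // Default path: thresholds at 128, generic expansion wins.
      printf("%d\n", static_cast<int>(chooseLowering(5, 128, 128))); // 0
      // With -bpf-expand-memcpy-in-order: thresholds zeroed, target hook wins.
      printf("%d\n", static_cast<int>(chooseLowering(5, 0, 128)));   // 1
    }

Non-constant sizes fail both steps and fall through to a library call, which matches the ConstantSDNode check in EmitTargetCodeForMemcpy further below.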
diff --git a/lib/Target/BPF/BPFInstrInfo.cpp b/lib/Target/BPF/BPFInstrInfo.cpp
index ab6d7c84cd1..d453a7d1d1f 100644
--- a/lib/Target/BPF/BPFInstrInfo.cpp
+++ b/lib/Target/BPF/BPFInstrInfo.cpp
@@ -43,6 +43,83 @@ void BPFInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     llvm_unreachable("Impossible reg-to-reg copy");
 }
 
+void BPFInstrInfo::expandMEMCPY(MachineBasicBlock::iterator MI) const {
+  unsigned DstReg = MI->getOperand(0).getReg();
+  unsigned SrcReg = MI->getOperand(1).getReg();
+  uint64_t CopyLen = MI->getOperand(2).getImm();
+  uint64_t Alignment = MI->getOperand(3).getImm();
+  unsigned ScratchReg = MI->getOperand(4).getReg();
+  MachineBasicBlock *BB = MI->getParent();
+  DebugLoc dl = MI->getDebugLoc();
+  unsigned LdOpc, StOpc;
+
+  switch (Alignment) {
+  case 1:
+    LdOpc = BPF::LDB;
+    StOpc = BPF::STB;
+    break;
+  case 2:
+    LdOpc = BPF::LDH;
+    StOpc = BPF::STH;
+    break;
+  case 4:
+    LdOpc = BPF::LDW;
+    StOpc = BPF::STW;
+    break;
+  case 8:
+    LdOpc = BPF::LDD;
+    StOpc = BPF::STD;
+    break;
+  default:
+    llvm_unreachable("unsupported memcpy alignment");
+  }
+
+  unsigned IterationNum = CopyLen >> Log2_64(Alignment);
+  for (unsigned I = 0; I < IterationNum; ++I) {
+    BuildMI(*BB, MI, dl, get(LdOpc))
+        .addReg(ScratchReg).addReg(SrcReg).addImm(I * Alignment);
+    BuildMI(*BB, MI, dl, get(StOpc))
+        .addReg(ScratchReg).addReg(DstReg).addImm(I * Alignment);
+  }
+
+  unsigned BytesLeft = CopyLen & (Alignment - 1);
+  unsigned Offset = IterationNum * Alignment;
+  bool Hanging4Byte = BytesLeft & 0x4;
+  bool Hanging2Byte = BytesLeft & 0x2;
+  bool Hanging1Byte = BytesLeft & 0x1;
+  if (Hanging4Byte) {
+    BuildMI(*BB, MI, dl, get(BPF::LDW))
+        .addReg(ScratchReg).addReg(SrcReg).addImm(Offset);
+    BuildMI(*BB, MI, dl, get(BPF::STW))
+        .addReg(ScratchReg).addReg(DstReg).addImm(Offset);
+    Offset += 4;
+  }
+  if (Hanging2Byte) {
+    BuildMI(*BB, MI, dl, get(BPF::LDH))
+        .addReg(ScratchReg).addReg(SrcReg).addImm(Offset);
+    BuildMI(*BB, MI, dl, get(BPF::STH))
+        .addReg(ScratchReg).addReg(DstReg).addImm(Offset);
+    Offset += 2;
+  }
+  if (Hanging1Byte) {
+    BuildMI(*BB, MI, dl, get(BPF::LDB))
+        .addReg(ScratchReg).addReg(SrcReg).addImm(Offset);
+    BuildMI(*BB, MI, dl, get(BPF::STB))
+        .addReg(ScratchReg).addReg(DstReg).addImm(Offset);
+  }
+
+  BB->erase(MI);
+}
+
+bool BPFInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
+  if (MI.getOpcode() == BPF::MEMCPY) {
+    expandMEMCPY(MI);
+    return true;
+  }
+
+  return false;
+}
+
 void BPFInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                        MachineBasicBlock::iterator I,
                                        unsigned SrcReg, bool IsKill, int FI,
diff --git a/lib/Target/BPF/BPFInstrInfo.h b/lib/Target/BPF/BPFInstrInfo.h
index f591f48a89a..fb65a86a6d1 100644
--- a/lib/Target/BPF/BPFInstrInfo.h
+++ b/lib/Target/BPF/BPFInstrInfo.h
@@ -34,6 +34,8 @@ public:
                    const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
                    bool KillSrc) const override;
 
+  bool expandPostRAPseudo(MachineInstr &MI) const override;
+
   void storeRegToStackSlot(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator MBBI, unsigned SrcReg,
                            bool isKill, int FrameIndex,
@@ -55,6 +57,9 @@ public:
                         MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
                         const DebugLoc &DL,
                         int *BytesAdded = nullptr) const override;
+private:
+  void expandMEMCPY(MachineBasicBlock::iterator) const;
+
 };
 }
diff --git a/lib/Target/BPF/BPFInstrInfo.td b/lib/Target/BPF/BPFInstrInfo.td
index dc4fdc571ab..aaef5fb706e 100644
--- a/lib/Target/BPF/BPFInstrInfo.td
+++ b/lib/Target/BPF/BPFInstrInfo.td
@@ -28,6 +28,10 @@ def SDT_BPFBrCC : SDTypeProfile<0, 4, [SDTCisSameAs<0, 1>,
                                        SDTCisVT<3, OtherVT>]>;
 def SDT_BPFWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
                                           SDTCisPtrTy<0>]>;
+def SDT_BPFMEMCPY : SDTypeProfile<0, 4, [SDTCisVT<0, i64>,
+                                         SDTCisVT<1, i64>,
+                                         SDTCisVT<2, i64>,
+                                         SDTCisVT<3, i64>]>;
 
 def BPFcall : SDNode<"BPFISD::CALL", SDT_BPFCall,
                      [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
@@ -43,6 +47,9 @@ def BPFbrcc : SDNode<"BPFISD::BR_CC", SDT_BPFBrCC,
 def BPFselectcc : SDNode<"BPFISD::SELECT_CC", SDT_BPFSelectCC, [SDNPInGlue]>;
 def BPFWrapper : SDNode<"BPFISD::Wrapper", SDT_BPFWrapper>;
+def BPFmemcpy : SDNode<"BPFISD::MEMCPY", SDT_BPFMEMCPY,
+                       [SDNPHasChain, SDNPInGlue, SDNPOutGlue,
+                        SDNPMayStore, SDNPMayLoad]>;
 
 def BPFIsLittleEndian : Predicate<"CurDAG->getDataLayout().isLittleEndian()">;
 def BPFIsBigEndian : Predicate<"!CurDAG->getDataLayout().isLittleEndian()">;
 def BPFHasALU32 : Predicate<"Subtarget->getHasAlu32()">;
@@ -714,3 +721,11 @@ let Predicates = [BPFHasALU32] in {
   def : Pat<(i64 (extloadi32 ADDRri:$src)),
             (SUBREG_TO_REG (i64 0), (LDW32 ADDRri:$src), sub_32)>;
 }
+
+let usesCustomInserter = 1, isCodeGenOnly = 1 in {
+  def MEMCPY : Pseudo<
+    (outs),
+    (ins GPR:$dst, GPR:$src, i64imm:$len, i64imm:$align, variable_ops),
+    "#memcpy dst: $dst, src: $src, len: $len, align: $align",
+    [(BPFmemcpy GPR:$dst, GPR:$src, imm:$len, imm:$align)]>;
+}
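The offset arithmetic in expandMEMCPY packs a lot into a few lines: CopyLen >> Log2_64(Alignment) full-width copies, then a remainder of at most 7 bytes handled by at most one 4-, one 2- and one 1-byte copy. A standalone sketch of the same schedule (illustrative, not part of the patch), worked for the 27-byte, 8-byte-aligned case exercised by cal_align8 in the test below:

    #include <cstdint>
    #include <cstdio>

    // Mirrors expandMEMCPY's decomposition: CopyLen / Alignment full-width
    // copies, then at most one hanging 4-, 2- and 1-byte copy. Alignment is
    // 1, 2, 4 or 8, so the remainder BytesLeft <= 7 is always fully covered.
    static void printSchedule(uint64_t CopyLen, uint64_t Alignment) {
      uint64_t IterationNum = CopyLen / Alignment; // CopyLen >> Log2_64(Alignment)
      for (uint64_t I = 0; I < IterationNum; ++I)
        printf("copy %llu bytes at offset %llu\n", (unsigned long long)Alignment,
               (unsigned long long)(I * Alignment));

      uint64_t BytesLeft = CopyLen & (Alignment - 1);
      uint64_t Offset = IterationNum * Alignment;
      for (uint64_t Size : {4, 2, 1}) {
        if (BytesLeft & Size) {
          printf("copy %llu bytes at offset %llu\n", (unsigned long long)Size,
                 (unsigned long long)Offset);
          Offset += Size;
        }
      }
    }

    int main() {
      printSchedule(27, 8); // cal_align8: len 27, align 8
    }

Output: three 8-byte copies at offsets 0/8/16, then a 2-byte copy at 24 and a 1-byte copy at 26 (27 & 7 == 3, so no hanging 4-byte copy), matching the cal_align8 CHECK lines in the test below.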
diff --git a/lib/Target/BPF/BPFSelectionDAGInfo.cpp b/lib/Target/BPF/BPFSelectionDAGInfo.cpp
new file mode 100644
index 00000000000..24d5f59bbfd
--- /dev/null
+++ b/lib/Target/BPF/BPFSelectionDAGInfo.cpp
@@ -0,0 +1,43 @@
+//===-- BPFSelectionDAGInfo.cpp - BPF SelectionDAG Info -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the BPFSelectionDAGInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BPFTargetMachine.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/IR/DerivedTypes.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "bpf-selectiondag-info"
+
+SDValue BPFSelectionDAGInfo::EmitTargetCodeForMemcpy(
+    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
+    SDValue Size, unsigned Align, bool isVolatile, bool AlwaysInline,
+    MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
+  // Requires the copy size to be a constant.
+  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+  if (!ConstantSize)
+    return SDValue();
+
+  unsigned CopyLen = ConstantSize->getZExtValue();
+  unsigned StoresNumEstimate = alignTo(CopyLen, Align) >> Log2_32(Align);
+  // Impose the same copy length limit as MaxStoresPerMemcpy.
+  if (StoresNumEstimate > getCommonMaxStoresPerMemFunc())
+    return SDValue();
+
+  SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
+
+  Dst = DAG.getNode(BPFISD::MEMCPY, dl, VTs, Chain, Dst, Src,
+                    DAG.getConstant(CopyLen, dl, MVT::i64),
+                    DAG.getConstant(Align, dl, MVT::i64));
+
+  return Dst.getValue(0);
+}
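As a sanity check on the guard above: StoresNumEstimate rounds CopyLen up to a multiple of Align and counts Align-sized stores, so all four copies in the test below sit far under the 128-store cap. A standalone sketch of the arithmetic (illustrative only; Align is assumed a power of two, as in the switch in expandMEMCPY):

    #include <cstdio>

    // Mirrors: StoresNumEstimate = alignTo(CopyLen, Align) >> Log2_32(Align),
    // i.e. the number of Align-sized stores needed to cover CopyLen bytes,
    // rounding up.
    static unsigned storesNumEstimate(unsigned CopyLen, unsigned Align) {
      unsigned RoundedUp = (CopyLen + Align - 1) & ~(Align - 1); // alignTo
      return RoundedUp / Align;
    }

    int main() {
      // The (len, align) pairs from memcpy-expand-in-order.ll below:
      printf("%u\n", storesNumEstimate(9, 1));  // 9
      printf("%u\n", storesNumEstimate(9, 2));  // 5
      printf("%u\n", storesNumEstimate(19, 4)); // 5
      printf("%u\n", storesNumEstimate(27, 8)); // 4
    }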
diff --git a/lib/Target/BPF/BPFSelectionDAGInfo.h b/lib/Target/BPF/BPFSelectionDAGInfo.h
new file mode 100644
index 00000000000..19d3c576957
--- /dev/null
+++ b/lib/Target/BPF/BPFSelectionDAGInfo.h
@@ -0,0 +1,36 @@
+//===-- BPFSelectionDAGInfo.h - BPF SelectionDAG Info -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the BPF subclass for SelectionDAGTargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_BPF_BPFSELECTIONDAGINFO_H
+#define LLVM_LIB_TARGET_BPF_BPFSELECTIONDAGINFO_H
+
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+
+namespace llvm {
+
+class BPFSelectionDAGInfo : public SelectionDAGTargetInfo {
+public:
+  SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl,
+                                  SDValue Chain, SDValue Dst, SDValue Src,
+                                  SDValue Size, unsigned Align, bool isVolatile,
+                                  bool AlwaysInline,
+                                  MachinePointerInfo DstPtrInfo,
+                                  MachinePointerInfo SrcPtrInfo) const override;
+
+  unsigned getCommonMaxStoresPerMemFunc() const { return 128; }
+
+};
+
+}
+
+#endif
diff --git a/lib/Target/BPF/BPFSubtarget.h b/lib/Target/BPF/BPFSubtarget.h
index 067b22ad3ce..60e56435fe4 100644
--- a/lib/Target/BPF/BPFSubtarget.h
+++ b/lib/Target/BPF/BPFSubtarget.h
@@ -17,6 +17,7 @@
 #include "BPFFrameLowering.h"
 #include "BPFISelLowering.h"
 #include "BPFInstrInfo.h"
+#include "BPFSelectionDAGInfo.h"
 #include "llvm/CodeGen/SelectionDAGTargetInfo.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/IR/DataLayout.h"
@@ -33,7 +34,7 @@ class BPFSubtarget : public BPFGenSubtargetInfo {
   BPFInstrInfo InstrInfo;
   BPFFrameLowering FrameLowering;
   BPFTargetLowering TLInfo;
-  SelectionDAGTargetInfo TSInfo;
+  BPFSelectionDAGInfo TSInfo;
 
 private:
   void initializeEnvironment();
@@ -75,7 +76,7 @@ public:
   const BPFTargetLowering *getTargetLowering() const override {
     return &TLInfo;
   }
-  const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
+  const BPFSelectionDAGInfo *getSelectionDAGInfo() const override {
     return &TSInfo;
   }
   const TargetRegisterInfo *getRegisterInfo() const override {
diff --git a/lib/Target/BPF/CMakeLists.txt b/lib/Target/BPF/CMakeLists.txt
index 7e53c6c4396..ee01b4b7b80 100644
--- a/lib/Target/BPF/CMakeLists.txt
+++ b/lib/Target/BPF/CMakeLists.txt
@@ -20,6 +20,7 @@ add_llvm_target(BPFCodeGen
   BPFISelLowering.cpp
   BPFMCInstLower.cpp
   BPFRegisterInfo.cpp
+  BPFSelectionDAGInfo.cpp
   BPFSubtarget.cpp
   BPFTargetMachine.cpp
   BPFMIPeephole.cpp
diff --git a/test/CodeGen/BPF/memcpy-expand-in-order.ll b/test/CodeGen/BPF/memcpy-expand-in-order.ll
new file mode 100644
index 00000000000..6ee31264c76
--- /dev/null
+++ b/test/CodeGen/BPF/memcpy-expand-in-order.ll
@@ -0,0 +1,116 @@
+; RUN: llc < %s -march=bpfel -bpf-expand-memcpy-in-order | FileCheck %s
+; RUN: llc < %s -march=bpfeb -bpf-expand-memcpy-in-order | FileCheck %s
+;
+; #define COPY_LEN 9
+;
+; void cal_align1(void *a, void *b)
+; {
+;   __builtin_memcpy(a, b, COPY_LEN);
+; }
+;
+; void cal_align2(short *a, short *b)
+; {
+;   __builtin_memcpy(a, b, COPY_LEN);
+; }
+;
+; #undef COPY_LEN
+; #define COPY_LEN 19
+; void cal_align4(int *a, int *b)
+; {
+;   __builtin_memcpy(a, b, COPY_LEN);
+; }
+;
+; #undef COPY_LEN
+; #define COPY_LEN 27
+; void cal_align8(long long *a, long long *b)
+; {
+;   __builtin_memcpy(a, b, COPY_LEN);
+; }
+
+; Function Attrs: nounwind
+define dso_local void @cal_align1(i8* nocapture %a, i8* nocapture readonly %b) local_unnamed_addr #0 {
+entry:
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %a, i8* align 1 %b, i64 9, i1 false)
+  ret void
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1) #1
+
+; CHECK: [[SCRATCH_REG:r[0-9]]] = *(u8 *)([[SRC_REG:r[0-9]]] + 0)
+; CHECK: *(u8 *)([[DST_REG:r[0-9]]] + 0) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u8 *)([[SRC_REG]] + 1)
+; CHECK: *(u8 *)([[DST_REG]] + 1) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u8 *)([[SRC_REG]] + 2)
+; CHECK: *(u8 *)([[DST_REG]] + 2) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u8 *)([[SRC_REG]] + 3)
+; CHECK: *(u8 *)([[DST_REG]] + 3) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u8 *)([[SRC_REG]] + 4)
+; CHECK: *(u8 *)([[DST_REG]] + 4) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u8 *)([[SRC_REG]] + 5)
+; CHECK: *(u8 *)([[DST_REG]] + 5) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u8 *)([[SRC_REG]] + 6)
+; CHECK: *(u8 *)([[DST_REG]] + 6) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u8 *)([[SRC_REG]] + 7)
+; CHECK: *(u8 *)([[DST_REG]] + 7) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u8 *)([[SRC_REG]] + 8)
+; CHECK: *(u8 *)([[DST_REG]] + 8) = [[SCRATCH_REG]]
+
+; Function Attrs: nounwind
+define dso_local void @cal_align2(i16* nocapture %a, i16* nocapture readonly %b) local_unnamed_addr #0 {
+entry:
+  %0 = bitcast i16* %a to i8*
+  %1 = bitcast i16* %b to i8*
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 2 %0, i8* align 2 %1, i64 9, i1 false)
+  ret void
+}
+; CHECK: [[SCRATCH_REG:r[0-9]]] = *(u16 *)([[SRC_REG:r[0-9]]] + 0)
+; CHECK: *(u16 *)([[DST_REG:r[0-9]]] + 0) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u16 *)([[SRC_REG]] + 2)
+; CHECK: *(u16 *)([[DST_REG]] + 2) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u16 *)([[SRC_REG]] + 4)
+; CHECK: *(u16 *)([[DST_REG]] + 4) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u16 *)([[SRC_REG]] + 6)
+; CHECK: *(u16 *)([[DST_REG]] + 6) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u8 *)([[SRC_REG]] + 8)
+; CHECK: *(u8 *)([[DST_REG]] + 8) = [[SCRATCH_REG]]
+
+; Function Attrs: nounwind
+define dso_local void @cal_align4(i32* nocapture %a, i32* nocapture readonly %b) local_unnamed_addr #0 {
+entry:
+  %0 = bitcast i32* %a to i8*
+  %1 = bitcast i32* %b to i8*
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %0, i8* align 4 %1, i64 19, i1 false)
+  ret void
+}
+; CHECK: [[SCRATCH_REG:r[0-9]]] = *(u32 *)([[SRC_REG:r[0-9]]] + 0)
+; CHECK: *(u32 *)([[DST_REG:r[0-9]]] + 0) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u32 *)([[SRC_REG]] + 4)
+; CHECK: *(u32 *)([[DST_REG]] + 4) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u32 *)([[SRC_REG]] + 8)
+; CHECK: *(u32 *)([[DST_REG]] + 8) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u32 *)([[SRC_REG]] + 12)
+; CHECK: *(u32 *)([[DST_REG]] + 12) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u16 *)([[SRC_REG]] + 16)
+; CHECK: *(u16 *)([[DST_REG]] + 16) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u8 *)([[SRC_REG]] + 18)
+; CHECK: *(u8 *)([[DST_REG]] + 18) = [[SCRATCH_REG]]
+
+; Function Attrs: nounwind
+define dso_local void @cal_align8(i64* nocapture %a, i64* nocapture readonly %b) local_unnamed_addr #0 {
+entry:
+  %0 = bitcast i64* %a to i8*
+  %1 = bitcast i64* %b to i8*
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %0, i8* align 8 %1, i64 27, i1 false)
+  ret void
+}
+; CHECK: [[SCRATCH_REG:r[0-9]]] = *(u64 *)([[SRC_REG:r[0-9]]] + 0)
+; CHECK: *(u64 *)([[DST_REG:r[0-9]]] + 0) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u64 *)([[SRC_REG]] + 8)
+; CHECK: *(u64 *)([[DST_REG]] + 8) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u64 *)([[SRC_REG]] + 16)
+; CHECK: *(u64 *)([[DST_REG]] + 16) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u16 *)([[SRC_REG]] + 24)
+; CHECK: *(u16 *)([[DST_REG]] + 24) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u8 *)([[SRC_REG]] + 26)
+; CHECK: *(u8 *)([[DST_REG]] + 26) = [[SCRATCH_REG]]
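One property of the test worth calling out: the same CHECK lines serve both the -march=bpfel and -march=bpfeb RUN lines, because each unit is copied through the scratch register whole and no byte order is ever reinterpreted. For readers who don't parse BPF assembler fluently, the cal_align2 sequence (9 bytes at alignment 2) corresponds to the following hand-unrolled equivalent; a sketch for illustration, not part of the patch:

    #include <cstdint>
    #include <cstring>

    // Hand-unrolled equivalent of the cal_align2 expansion: four halfword
    // copies at offsets 0/2/4/6, then one trailing byte at offset 8, all
    // through a single scratch value and at strictly increasing offsets,
    // which is the in-order shape a kernel eBPF JIT memcpy matcher expects.
    void cal_align2_expanded(void *a, const void *b) {
      auto *dst = static_cast<uint8_t *>(a);
      auto *src = static_cast<const uint8_t *>(b);
      uint16_t scratch;
      for (unsigned off = 0; off < 8; off += 2) {
        std::memcpy(&scratch, src + off, 2); // r = *(u16 *)(src + off)
        std::memcpy(dst + off, &scratch, 2); // *(u16 *)(dst + off) = r
      }
      dst[8] = src[8]; // trailing u8 copy at offset 8
    }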