diff --git a/lib/Target/BPF/BPFISelLowering.cpp b/lib/Target/BPF/BPFISelLowering.cpp
index 5c9f51a3924..9272cf692dc 100644
--- a/lib/Target/BPF/BPFISelLowering.cpp
+++ b/lib/Target/BPF/BPFISelLowering.cpp
@@ -33,6 +33,10 @@ using namespace llvm;
 
 #define DEBUG_TYPE "bpf-lower"
 
+static cl::opt<bool> BPFExpandMemcpyInOrder("bpf-expand-memcpy-in-order",
+  cl::Hidden, cl::init(false),
+  cl::desc("Expand memcpy into load/store pairs in order"));
+
 static void fail(const SDLoc &DL, SelectionDAG &DAG, const Twine &Msg) {
   MachineFunction &MF = DAG.getMachineFunction();
   DAG.getContext()->diagnose(
@@ -132,10 +136,30 @@ BPFTargetLowering::BPFTargetLowering(const TargetMachine &TM,
   setMinFunctionAlignment(3);
   setPrefFunctionAlignment(3);
 
-  // inline memcpy() for kernel to see explicit copy
-  MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 128;
-  MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 128;
-  MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 128;
+  if (BPFExpandMemcpyInOrder) {
+    // LLVM generic code will try to expand memcpy into load/store pairs at
+    // this stage, which is before quite a few optimization passes, so the
+    // loads and stores could potentially be moved apart from each other,
+    // which would cause trouble for the memcpy pattern matchers inside
+    // kernel eBPF JIT compilers.
+    //
+    // When -bpf-expand-memcpy-in-order is specified, we want to defer the
+    // expansion of memcpy to a later stage so those load/store pairs won't
+    // be touched and can be kept in order. Hence, we set MaxStoresPerMem*
+    // to zero to disable the generic getMemcpyLoadsAndStores code path, and
+    // ask LLVM to use the target expander EmitTargetCodeForMemcpy.
+    MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 0;
+    MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 0;
+    MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 0;
+  } else {
+    // inline memcpy() for kernel to see explicit copy
+    unsigned CommonMaxStores =
+        STI.getSelectionDAGInfo()->getCommonMaxStoresPerMemFunc();
+
+    MaxStoresPerMemset = MaxStoresPerMemsetOptSize = CommonMaxStores;
+    MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = CommonMaxStores;
+    MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = CommonMaxStores;
+  }
 
   // CPU/Feature control
   HasAlu32 = STI.getHasAlu32();
@@ -518,6 +542,8 @@ const char *BPFTargetLowering::getTargetNodeName(unsigned Opcode) const {
     return "BPFISD::BR_CC";
   case BPFISD::Wrapper:
     return "BPFISD::Wrapper";
+  case BPFISD::MEMCPY:
+    return "BPFISD::MEMCPY";
   }
   return nullptr;
 }
@@ -556,6 +582,37 @@ BPFTargetLowering::EmitSubregExt(MachineInstr &MI, MachineBasicBlock *BB,
   return PromotedReg2;
 }
 
+MachineBasicBlock *
+BPFTargetLowering::EmitInstrWithCustomInserterMemcpy(MachineInstr &MI,
+                                                     MachineBasicBlock *BB)
+    const {
+  MachineFunction *MF = MI.getParent()->getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+  MachineInstrBuilder MIB(*MF, MI);
+  unsigned ScratchReg;
+
+  // This function does custom insertion while lowering BPFISD::MEMCPY, which
+  // from the memcpy semantics has only two register operands: the copy source
+  // address and the copy destination address.
+  //
+  // Because we will expand BPFISD::MEMCPY into load/store pairs, we need a
+  // third scratch register to serve as the destination register of the loads
+  // and the source register of the stores.
+  //
+  // The scratch register is added with the Define | Dead | EarlyClobber
+  // flags. The EarlyClobber flag has the semantic property that the operand
+  // it is attached to is clobbered before the rest of the inputs are read.
+  // Hence it must be unique among the operands to the instruction. The
+  // Define flag is needed to convince the machine verifier that an undef
+  // value isn't a problem, since we load from memory into the register
+  // anyway. The Dead flag is needed because the value in the scratch
+  // register isn't meant to be used by any other instruction.
+  ScratchReg = MRI.createVirtualRegister(&BPF::GPRRegClass);
+  MIB.addReg(ScratchReg,
+             RegState::Define | RegState::Dead | RegState::EarlyClobber);
+
+  return BB;
+}
+
 MachineBasicBlock *
 BPFTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
                                                MachineBasicBlock *BB) const {
@@ -567,6 +624,8 @@ BPFTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
                        Opc == BPF::Select_32 ||
                        Opc == BPF::Select_32_64);
 
+  bool isMemcpyOp = Opc == BPF::MEMCPY;
+
 #ifndef NDEBUG
   bool isSelectRIOp = (Opc == BPF::Select_Ri ||
                        Opc == BPF::Select_Ri_64_32 ||
@@ -574,9 +633,13 @@ BPFTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
                        Opc == BPF::Select_Ri_32 ||
                        Opc == BPF::Select_Ri_32_64);
 
-  assert((isSelectRROp || isSelectRIOp) && "Unexpected instr type to insert");
+  assert((isSelectRROp || isSelectRIOp || isMemcpyOp) &&
+         "Unexpected instr type to insert");
 #endif
 
+  if (isMemcpyOp)
+    return EmitInstrWithCustomInserterMemcpy(MI, BB);
+
   bool is32BitCmp = (Opc == BPF::Select_32 ||
                      Opc == BPF::Select_32_64 ||
                      Opc == BPF::Select_Ri_32 ||
diff --git a/lib/Target/BPF/BPFISelLowering.h b/lib/Target/BPF/BPFISelLowering.h
index 3eb099cf369..0aa8b9ac57a 100644
--- a/lib/Target/BPF/BPFISelLowering.h
+++ b/lib/Target/BPF/BPFISelLowering.h
@@ -28,7 +28,8 @@ enum NodeType : unsigned {
   CALL,
   SELECT_CC,
   BR_CC,
-  Wrapper
+  Wrapper,
+  MEMCPY
 };
 }
@@ -110,6 +111,11 @@ private:
   unsigned EmitSubregExt(MachineInstr &MI, MachineBasicBlock *BB, unsigned Reg,
                          bool isSigned) const;
+
+  MachineBasicBlock *EmitInstrWithCustomInserterMemcpy(MachineInstr &MI,
+                                                       MachineBasicBlock *BB)
+      const;
+
 };
 }
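A note on the interaction above: for a constant-size memcpy, SelectionDAG tries the generic load/store expansion (bounded by MaxStoresPerMemcpy) before it consults the target's EmitTargetCodeForMemcpy hook, which is why zeroing the thresholds is enough to route every constant-size copy to the BPF expander added below. A minimal standalone sketch of that assumed two-step fallback (illustrative only, not part of the patch):

    #include <cstdio>

    enum class MemcpyLowering { GenericLoadStorePairs, TargetMEMCPYNode, LibCall };

    // Models the assumed fallback order for a constant-size memcpy:
    // generic load/store expansion first (capped by MaxStoresPerMemcpy),
    // then the target hook (capped here by the common 128-store budget),
    // and finally a library call.
    MemcpyLowering chooseLowering(unsigned NumStores, unsigned MaxStoresPerMemcpy,
                                  unsigned CommonMaxStores) {
      if (NumStores <= MaxStoresPerMemcpy)
        return MemcpyLowering::GenericLoadStorePairs;
      if (NumStores <= CommonMaxStores)
        return MemcpyLowering::TargetMEMCPYNode;
      return MemcpyLowering::LibCall;
    }

    int main() {
      // Default path: thresholds at 128, generic expansion wins.
      printf("%d\n", static_cast<int>(chooseLowering(5, 128, 128))); // 0
      // With -bpf-expand-memcpy-in-order: thresholds zeroed, target hook wins.
      printf("%d\n", static_cast<int>(chooseLowering(5, 0, 128)));   // 1
    }

Non-constant sizes fail both steps and fall through to a library call, which matches the ConstantSDNode check in EmitTargetCodeForMemcpy further below.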
diff --git a/lib/Target/BPF/BPFInstrInfo.cpp b/lib/Target/BPF/BPFInstrInfo.cpp
index ab6d7c84cd1..d453a7d1d1f 100644
--- a/lib/Target/BPF/BPFInstrInfo.cpp
+++ b/lib/Target/BPF/BPFInstrInfo.cpp
@@ -43,6 +43,83 @@ void BPFInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     llvm_unreachable("Impossible reg-to-reg copy");
 }
 
+void BPFInstrInfo::expandMEMCPY(MachineBasicBlock::iterator MI) const {
+  unsigned DstReg = MI->getOperand(0).getReg();
+  unsigned SrcReg = MI->getOperand(1).getReg();
+  uint64_t CopyLen = MI->getOperand(2).getImm();
+  uint64_t Alignment = MI->getOperand(3).getImm();
+  unsigned ScratchReg = MI->getOperand(4).getReg();
+  MachineBasicBlock *BB = MI->getParent();
+  DebugLoc dl = MI->getDebugLoc();
+  unsigned LdOpc, StOpc;
+
+  switch (Alignment) {
+  case 1:
+    LdOpc = BPF::LDB;
+    StOpc = BPF::STB;
+    break;
+  case 2:
+    LdOpc = BPF::LDH;
+    StOpc = BPF::STH;
+    break;
+  case 4:
+    LdOpc = BPF::LDW;
+    StOpc = BPF::STW;
+    break;
+  case 8:
+    LdOpc = BPF::LDD;
+    StOpc = BPF::STD;
+    break;
+  default:
+    llvm_unreachable("unsupported memcpy alignment");
+  }
+
+  unsigned IterationNum = CopyLen >> Log2_64(Alignment);
+  for (unsigned I = 0; I < IterationNum; ++I) {
+    BuildMI(*BB, MI, dl, get(LdOpc))
+        .addReg(ScratchReg).addReg(SrcReg).addImm(I * Alignment);
+    BuildMI(*BB, MI, dl, get(StOpc))
+        .addReg(ScratchReg).addReg(DstReg).addImm(I * Alignment);
+  }
+
+  unsigned BytesLeft = CopyLen & (Alignment - 1);
+  unsigned Offset = IterationNum * Alignment;
+  bool Hanging4Byte = BytesLeft & 0x4;
+  bool Hanging2Byte = BytesLeft & 0x2;
+  bool Hanging1Byte = BytesLeft & 0x1;
+  if (Hanging4Byte) {
+    BuildMI(*BB, MI, dl, get(BPF::LDW))
+        .addReg(ScratchReg).addReg(SrcReg).addImm(Offset);
+    BuildMI(*BB, MI, dl, get(BPF::STW))
+        .addReg(ScratchReg).addReg(DstReg).addImm(Offset);
+    Offset += 4;
+  }
+  if (Hanging2Byte) {
+    BuildMI(*BB, MI, dl, get(BPF::LDH))
+        .addReg(ScratchReg).addReg(SrcReg).addImm(Offset);
+    BuildMI(*BB, MI, dl, get(BPF::STH))
+        .addReg(ScratchReg).addReg(DstReg).addImm(Offset);
+    Offset += 2;
+  }
+  if (Hanging1Byte) {
+    BuildMI(*BB, MI, dl, get(BPF::LDB))
+        .addReg(ScratchReg).addReg(SrcReg).addImm(Offset);
+    BuildMI(*BB, MI, dl, get(BPF::STB))
+        .addReg(ScratchReg).addReg(DstReg).addImm(Offset);
+  }
+
+  BB->erase(MI);
+}
+
+bool BPFInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
+  if (MI.getOpcode() == BPF::MEMCPY) {
+    expandMEMCPY(MI);
+    return true;
+  }
+
+  return false;
+}
+
 void BPFInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                        MachineBasicBlock::iterator I,
                                        unsigned SrcReg, bool IsKill, int FI,
diff --git a/lib/Target/BPF/BPFInstrInfo.h b/lib/Target/BPF/BPFInstrInfo.h
index f591f48a89a..fb65a86a6d1 100644
--- a/lib/Target/BPF/BPFInstrInfo.h
+++ b/lib/Target/BPF/BPFInstrInfo.h
@@ -34,6 +34,8 @@ public:
                    const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
                    bool KillSrc) const override;
 
+  bool expandPostRAPseudo(MachineInstr &MI) const override;
+
   void storeRegToStackSlot(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator MBBI, unsigned SrcReg,
                            bool isKill, int FrameIndex,
@@ -55,6 +57,9 @@ public:
                         MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
                         const DebugLoc &DL,
                         int *BytesAdded = nullptr) const override;
+private:
+  void expandMEMCPY(MachineBasicBlock::iterator) const;
+
 };
 }
diff --git a/lib/Target/BPF/BPFInstrInfo.td b/lib/Target/BPF/BPFInstrInfo.td
index dc4fdc571ab..aaef5fb706e 100644
--- a/lib/Target/BPF/BPFInstrInfo.td
+++ b/lib/Target/BPF/BPFInstrInfo.td
@@ -28,6 +28,10 @@ def SDT_BPFBrCC : SDTypeProfile<0, 4, [SDTCisSameAs<0, 1>,
                                        SDTCisVT<3, OtherVT>]>;
 def SDT_BPFWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
                                           SDTCisPtrTy<0>]>;
+def SDT_BPFMEMCPY : SDTypeProfile<0, 4, [SDTCisVT<0, i64>,
+                                         SDTCisVT<1, i64>,
+                                         SDTCisVT<2, i64>,
+                                         SDTCisVT<3, i64>]>;
 
 def BPFcall : SDNode<"BPFISD::CALL", SDT_BPFCall,
                      [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
@@ -43,6 +47,9 @@ def BPFbrcc : SDNode<"BPFISD::BR_CC", SDT_BPFBrCC,
 def BPFselectcc : SDNode<"BPFISD::SELECT_CC", SDT_BPFSelectCC, [SDNPInGlue]>;
 def BPFWrapper : SDNode<"BPFISD::Wrapper", SDT_BPFWrapper>;
+def BPFmemcpy : SDNode<"BPFISD::MEMCPY", SDT_BPFMEMCPY,
+                       [SDNPHasChain, SDNPInGlue, SDNPOutGlue,
+                        SDNPMayStore, SDNPMayLoad]>;
 
 def BPFIsLittleEndian : Predicate<"CurDAG->getDataLayout().isLittleEndian()">;
 def BPFIsBigEndian : Predicate<"!CurDAG->getDataLayout().isLittleEndian()">;
 def BPFHasALU32 : Predicate<"Subtarget->getHasAlu32()">;
@@ -714,3 +721,11 @@ let Predicates = [BPFHasALU32] in {
   def : Pat<(i64 (extloadi32 ADDRri:$src)),
             (SUBREG_TO_REG (i64 0), (LDW32 ADDRri:$src), sub_32)>;
 }
+
+let usesCustomInserter = 1, isCodeGenOnly = 1 in {
+  def MEMCPY : Pseudo<
+    (outs),
+    (ins GPR:$dst, GPR:$src, i64imm:$len, i64imm:$align, variable_ops),
+    "#memcpy dst: $dst, src: $src, len: $len, align: $align",
+    [(BPFmemcpy GPR:$dst, GPR:$src, imm:$len, imm:$align)]>;
+}
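The offset arithmetic in expandMEMCPY packs a lot into a few lines: CopyLen >> Log2_64(Alignment) full-width copies, then a remainder of at most 7 bytes handled by at most one 4-, one 2- and one 1-byte copy. A standalone sketch of the same schedule (illustrative, not part of the patch), worked for the 27-byte, 8-byte-aligned case exercised by cal_align8 in the test below:

    #include <cstdint>
    #include <cstdio>

    // Mirrors expandMEMCPY's decomposition: CopyLen / Alignment full-width
    // copies, then at most one hanging 4-, 2- and 1-byte copy. Alignment is
    // 1, 2, 4 or 8, so the remainder BytesLeft <= 7 is always fully covered.
    static void printSchedule(uint64_t CopyLen, uint64_t Alignment) {
      uint64_t IterationNum = CopyLen / Alignment; // CopyLen >> Log2_64(Alignment)
      for (uint64_t I = 0; I < IterationNum; ++I)
        printf("copy %llu bytes at offset %llu\n", (unsigned long long)Alignment,
               (unsigned long long)(I * Alignment));

      uint64_t BytesLeft = CopyLen & (Alignment - 1);
      uint64_t Offset = IterationNum * Alignment;
      for (uint64_t Size : {4, 2, 1}) {
        if (BytesLeft & Size) {
          printf("copy %llu bytes at offset %llu\n", (unsigned long long)Size,
                 (unsigned long long)Offset);
          Offset += Size;
        }
      }
    }

    int main() {
      printSchedule(27, 8); // cal_align8: len 27, align 8
    }

Output: three 8-byte copies at offsets 0/8/16, then a 2-byte copy at 24 and a 1-byte copy at 26 (27 & 7 == 3, so no hanging 4-byte copy), matching the cal_align8 CHECK lines in the test below.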
diff --git a/lib/Target/BPF/BPFSelectionDAGInfo.cpp b/lib/Target/BPF/BPFSelectionDAGInfo.cpp
new file mode 100644
index 00000000000..24d5f59bbfd
--- /dev/null
+++ b/lib/Target/BPF/BPFSelectionDAGInfo.cpp
@@ -0,0 +1,43 @@
+//===-- BPFSelectionDAGInfo.cpp - BPF SelectionDAG Info -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the BPFSelectionDAGInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BPFTargetMachine.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/IR/DerivedTypes.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "bpf-selectiondag-info"
+
+SDValue BPFSelectionDAGInfo::EmitTargetCodeForMemcpy(
+    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
+    SDValue Size, unsigned Align, bool isVolatile, bool AlwaysInline,
+    MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
+  // Requires the copy size to be a constant.
+  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+  if (!ConstantSize)
+    return SDValue();
+
+  unsigned CopyLen = ConstantSize->getZExtValue();
+  unsigned StoresNumEstimate = alignTo(CopyLen, Align) >> Log2_32(Align);
+  // Impose the same copy length limit as MaxStoresPerMemcpy.
+  if (StoresNumEstimate > getCommonMaxStoresPerMemFunc())
+    return SDValue();
+
+  SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
+
+  Dst = DAG.getNode(BPFISD::MEMCPY, dl, VTs, Chain, Dst, Src,
+                    DAG.getConstant(CopyLen, dl, MVT::i64),
+                    DAG.getConstant(Align, dl, MVT::i64));
+
+  return Dst.getValue(0);
+}
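As a sanity check on the guard above: StoresNumEstimate rounds CopyLen up to a multiple of Align and counts Align-sized stores, so all four copies in the test below sit far under the 128-store cap. A standalone sketch of the arithmetic (illustrative only; Align is assumed a power of two, as in the switch in expandMEMCPY):

    #include <cstdio>

    // Mirrors: StoresNumEstimate = alignTo(CopyLen, Align) >> Log2_32(Align),
    // i.e. the number of Align-sized stores needed to cover CopyLen bytes,
    // rounding up.
    static unsigned storesNumEstimate(unsigned CopyLen, unsigned Align) {
      unsigned RoundedUp = (CopyLen + Align - 1) & ~(Align - 1); // alignTo
      return RoundedUp / Align;
    }

    int main() {
      // The (len, align) pairs from memcpy-expand-in-order.ll below:
      printf("%u\n", storesNumEstimate(9, 1));  // 9
      printf("%u\n", storesNumEstimate(9, 2));  // 5
      printf("%u\n", storesNumEstimate(19, 4)); // 5
      printf("%u\n", storesNumEstimate(27, 8)); // 4
    }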
diff --git a/lib/Target/BPF/BPFSelectionDAGInfo.h b/lib/Target/BPF/BPFSelectionDAGInfo.h
new file mode 100644
index 00000000000..19d3c576957
--- /dev/null
+++ b/lib/Target/BPF/BPFSelectionDAGInfo.h
@@ -0,0 +1,36 @@
+//===-- BPFSelectionDAGInfo.h - BPF SelectionDAG Info -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the BPF subclass for SelectionDAGTargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_BPF_BPFSELECTIONDAGINFO_H
+#define LLVM_LIB_TARGET_BPF_BPFSELECTIONDAGINFO_H
+
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+
+namespace llvm {
+
+class BPFSelectionDAGInfo : public SelectionDAGTargetInfo {
+public:
+  SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl,
+                                  SDValue Chain, SDValue Dst, SDValue Src,
+                                  SDValue Size, unsigned Align, bool isVolatile,
+                                  bool AlwaysInline,
+                                  MachinePointerInfo DstPtrInfo,
+                                  MachinePointerInfo SrcPtrInfo) const override;
+
+  unsigned getCommonMaxStoresPerMemFunc() const { return 128; }
+
+};
+
+}
+
+#endif
diff --git a/lib/Target/BPF/BPFSubtarget.h b/lib/Target/BPF/BPFSubtarget.h
index 067b22ad3ce..60e56435fe4 100644
--- a/lib/Target/BPF/BPFSubtarget.h
+++ b/lib/Target/BPF/BPFSubtarget.h
@@ -17,6 +17,7 @@
 #include "BPFFrameLowering.h"
 #include "BPFISelLowering.h"
 #include "BPFInstrInfo.h"
+#include "BPFSelectionDAGInfo.h"
 #include "llvm/CodeGen/SelectionDAGTargetInfo.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/IR/DataLayout.h"
@@ -33,7 +34,7 @@ class BPFSubtarget : public BPFGenSubtargetInfo {
   BPFInstrInfo InstrInfo;
   BPFFrameLowering FrameLowering;
   BPFTargetLowering TLInfo;
-  SelectionDAGTargetInfo TSInfo;
+  BPFSelectionDAGInfo TSInfo;
 
 private:
   void initializeEnvironment();
@@ -75,7 +76,7 @@ public:
   const BPFTargetLowering *getTargetLowering() const override {
     return &TLInfo;
   }
-  const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
+  const BPFSelectionDAGInfo *getSelectionDAGInfo() const override {
     return &TSInfo;
   }
   const TargetRegisterInfo *getRegisterInfo() const override {
diff --git a/lib/Target/BPF/CMakeLists.txt b/lib/Target/BPF/CMakeLists.txt
index 7e53c6c4396..ee01b4b7b80 100644
--- a/lib/Target/BPF/CMakeLists.txt
+++ b/lib/Target/BPF/CMakeLists.txt
@@ -20,6 +20,7 @@ add_llvm_target(BPFCodeGen
   BPFISelLowering.cpp
   BPFMCInstLower.cpp
   BPFRegisterInfo.cpp
+  BPFSelectionDAGInfo.cpp
   BPFSubtarget.cpp
   BPFTargetMachine.cpp
   BPFMIPeephole.cpp
diff --git a/test/CodeGen/BPF/memcpy-expand-in-order.ll b/test/CodeGen/BPF/memcpy-expand-in-order.ll
new file mode 100644
index 00000000000..6ee31264c76
--- /dev/null
+++ b/test/CodeGen/BPF/memcpy-expand-in-order.ll
@@ -0,0 +1,116 @@
+; RUN: llc < %s -march=bpfel -bpf-expand-memcpy-in-order | FileCheck %s
+; RUN: llc < %s -march=bpfeb -bpf-expand-memcpy-in-order | FileCheck %s
+;
+; #define COPY_LEN 9
+;
+; void cal_align1(void *a, void *b)
+; {
+;   __builtin_memcpy(a, b, COPY_LEN);
+; }
+;
+; void cal_align2(short *a, short *b)
+; {
+;   __builtin_memcpy(a, b, COPY_LEN);
+; }
+;
+; #undef COPY_LEN
+; #define COPY_LEN 19
+; void cal_align4(int *a, int *b)
+; {
+;   __builtin_memcpy(a, b, COPY_LEN);
+; }
+;
+; #undef COPY_LEN
+; #define COPY_LEN 27
+; void cal_align8(long long *a, long long *b)
+; {
+;   __builtin_memcpy(a, b, COPY_LEN);
+; }
+
+; Function Attrs: nounwind
+define dso_local void @cal_align1(i8* nocapture %a, i8* nocapture readonly %b) local_unnamed_addr #0 {
+entry:
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %a, i8* align 1 %b, i64 9, i1 false)
+  ret void
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1) #1
+
+; CHECK: [[SCRATCH_REG:r[0-9]]] = *(u8 *)([[SRC_REG:r[0-9]]] + 0)
+; CHECK: *(u8 *)([[DST_REG:r[0-9]]] + 0) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u8 *)([[SRC_REG]] + 1)
+; CHECK: *(u8 *)([[DST_REG]] + 1) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u8 *)([[SRC_REG]] + 2)
+; CHECK: *(u8 *)([[DST_REG]] + 2) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u8 *)([[SRC_REG]] + 3)
+; CHECK: *(u8 *)([[DST_REG]] + 3) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u8 *)([[SRC_REG]] + 4)
+; CHECK: *(u8 *)([[DST_REG]] + 4) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u8 *)([[SRC_REG]] + 5)
+; CHECK: *(u8 *)([[DST_REG]] + 5) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u8 *)([[SRC_REG]] + 6)
+; CHECK: *(u8 *)([[DST_REG]] + 6) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u8 *)([[SRC_REG]] + 7)
+; CHECK: *(u8 *)([[DST_REG]] + 7) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u8 *)([[SRC_REG]] + 8)
+; CHECK: *(u8 *)([[DST_REG]] + 8) = [[SCRATCH_REG]]
+
+; Function Attrs: nounwind
+define dso_local void @cal_align2(i16* nocapture %a, i16* nocapture readonly %b) local_unnamed_addr #0 {
+entry:
+  %0 = bitcast i16* %a to i8*
+  %1 = bitcast i16* %b to i8*
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 2 %0, i8* align 2 %1, i64 9, i1 false)
+  ret void
+}
+; CHECK: [[SCRATCH_REG:r[0-9]]] = *(u16 *)([[SRC_REG:r[0-9]]] + 0)
+; CHECK: *(u16 *)([[DST_REG:r[0-9]]] + 0) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u16 *)([[SRC_REG]] + 2)
+; CHECK: *(u16 *)([[DST_REG]] + 2) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u16 *)([[SRC_REG]] + 4)
+; CHECK: *(u16 *)([[DST_REG]] + 4) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u16 *)([[SRC_REG]] + 6)
+; CHECK: *(u16 *)([[DST_REG]] + 6) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u8 *)([[SRC_REG]] + 8)
+; CHECK: *(u8 *)([[DST_REG]] + 8) = [[SCRATCH_REG]]
+
+; Function Attrs: nounwind
+define dso_local void @cal_align4(i32* nocapture %a, i32* nocapture readonly %b) local_unnamed_addr #0 {
+entry:
+  %0 = bitcast i32* %a to i8*
+  %1 = bitcast i32* %b to i8*
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %0, i8* align 4 %1, i64 19, i1 false)
+  ret void
+}
+; CHECK: [[SCRATCH_REG:r[0-9]]] = *(u32 *)([[SRC_REG:r[0-9]]] + 0)
+; CHECK: *(u32 *)([[DST_REG:r[0-9]]] + 0) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u32 *)([[SRC_REG]] + 4)
+; CHECK: *(u32 *)([[DST_REG]] + 4) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u32 *)([[SRC_REG]] + 8)
+; CHECK: *(u32 *)([[DST_REG]] + 8) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u32 *)([[SRC_REG]] + 12)
+; CHECK: *(u32 *)([[DST_REG]] + 12) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u16 *)([[SRC_REG]] + 16)
+; CHECK: *(u16 *)([[DST_REG]] + 16) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u8 *)([[SRC_REG]] + 18)
+; CHECK: *(u8 *)([[DST_REG]] + 18) = [[SCRATCH_REG]]
+
+; Function Attrs: nounwind
+define dso_local void @cal_align8(i64* nocapture %a, i64* nocapture readonly %b) local_unnamed_addr #0 {
+entry:
+  %0 = bitcast i64* %a to i8*
+  %1 = bitcast i64* %b to i8*
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %0, i8* align 8 %1, i64 27, i1 false)
+  ret void
+}
+; CHECK: [[SCRATCH_REG:r[0-9]]] = *(u64 *)([[SRC_REG:r[0-9]]] + 0)
+; CHECK: *(u64 *)([[DST_REG:r[0-9]]] + 0) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u64 *)([[SRC_REG]] + 8)
+; CHECK: *(u64 *)([[DST_REG]] + 8) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u64 *)([[SRC_REG]] + 16)
+; CHECK: *(u64 *)([[DST_REG]] + 16) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u16 *)([[SRC_REG]] + 24)
+; CHECK: *(u16 *)([[DST_REG]] + 24) = [[SCRATCH_REG]]
+; CHECK: [[SCRATCH_REG]] = *(u8 *)([[SRC_REG]] + 26)
+; CHECK: *(u8 *)([[DST_REG]] + 26) = [[SCRATCH_REG]]
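One property of the test worth calling out: the same CHECK lines serve both the -march=bpfel and -march=bpfeb RUN lines, because each unit is copied through the scratch register whole and no byte order is ever reinterpreted. For readers who don't parse BPF assembler fluently, the cal_align2 sequence (9 bytes at alignment 2) corresponds to the following hand-unrolled equivalent; a sketch for illustration, not part of the patch:

    #include <cstdint>
    #include <cstring>

    // Hand-unrolled equivalent of the cal_align2 expansion: four halfword
    // copies at offsets 0/2/4/6, then one trailing byte at offset 8, all
    // through a single scratch value and at strictly increasing offsets,
    // which is the in-order shape a kernel eBPF JIT memcpy matcher expects.
    void cal_align2_expanded(void *a, const void *b) {
      auto *dst = static_cast<uint8_t *>(a);
      auto *src = static_cast<const uint8_t *>(b);
      uint16_t scratch;
      for (unsigned off = 0; off < 8; off += 2) {
        std::memcpy(&scratch, src + off, 2); // r = *(u16 *)(src + off)
        std::memcpy(dst + off, &scratch, 2); // *(u16 *)(dst + off) = r
      }
      dst[8] = src[8]; // trailing u8 copy at offset 8
    }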