diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index f6884d5a271..b20516e3a4c 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -2433,6 +2433,7 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
                                       SmallVectorImpl<SDValue> &InVals) const {
   // TODO: add description of PPC stack frame format, or at least some docs.
   //
+  bool isLittleEndian = Subtarget.isLittleEndian();
   MachineFunction &MF = DAG.getMachineFunction();
   MachineFrameInfo *MFI = MF.getFrameInfo();
   PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
@@ -2533,7 +2534,7 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
       }
 
       // All aggregates smaller than 8 bytes must be passed right-justified.
-      if (ObjSize < PtrByteSize)
+      if (ObjSize < PtrByteSize && !isLittleEndian)
         CurArgOffset = CurArgOffset + (PtrByteSize - ObjSize);
       // The value of the object is its address.
       int FI = MFI->CreateFixedObject(ObjSize, CurArgOffset, true);
@@ -2683,9 +2684,9 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
     // We need to load the argument to a virtual register if we determined
     // above that we ran out of physical registers of the appropriate type.
     if (needsLoad) {
-      int FI = MFI->CreateFixedObject(ObjSize,
-                                      CurArgOffset + (ArgSize - ObjSize),
-                                      isImmutable);
+      if (ObjSize < ArgSize && !isLittleEndian)
+        CurArgOffset += ArgSize - ObjSize;
+      int FI = MFI->CreateFixedObject(ObjSize, CurArgOffset, isImmutable);
       SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
       ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo(),
                            false, false, false, 0);
@@ -4034,6 +4035,7 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
                                     SDLoc dl, SelectionDAG &DAG,
                                     SmallVectorImpl<SDValue> &InVals) const {
 
+  bool isLittleEndian = Subtarget.isLittleEndian();
   unsigned NumOps = Outs.size();
 
   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
@@ -4177,9 +4179,12 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
       }
 
       if (GPR_idx == NumGPRs && Size < 8) {
-        SDValue Const = DAG.getConstant(PtrByteSize - Size,
-                                        PtrOff.getValueType());
-        SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);
+        SDValue AddPtr = PtrOff;
+        if (!isLittleEndian) {
+          SDValue Const = DAG.getConstant(PtrByteSize - Size,
+                                          PtrOff.getValueType());
+          AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);
+        }
         Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr,
                                                           CallSeqStart,
                                                           Flags, DAG, dl);
@@ -4214,8 +4219,11 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
         // small aggregates, particularly for packed ones.
         // FIXME: It would be preferable to use the slot in the
         // parameter save area instead of a new local variable.
-        SDValue Const = DAG.getConstant(8 - Size, PtrOff.getValueType());
-        SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);
+        SDValue AddPtr = PtrOff;
+        if (!isLittleEndian) {
+          SDValue Const = DAG.getConstant(8 - Size, PtrOff.getValueType());
+          AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);
+        }
         Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr,
                                                           CallSeqStart,
                                                           Flags, DAG, dl);
@@ -4276,7 +4284,8 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
           // must be passed right-justified in the stack doubleword, and
           // in the GPR, if one is available.
           SDValue StoreOff;
-          if (Arg.getSimpleValueType().SimpleTy == MVT::f32) {
+          if (Arg.getSimpleValueType().SimpleTy == MVT::f32 &&
+              !isLittleEndian) {
             SDValue ConstFour = DAG.getConstant(4, PtrOff.getValueType());
             StoreOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour);
           } else
@@ -4300,7 +4309,7 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
       } else {
         // Single-precision floating-point values are mapped to the
         // second (rightmost) word of the stack doubleword.
-        if (Arg.getValueType() == MVT::f32) {
+        if (Arg.getValueType() == MVT::f32 && !isLittleEndian) {
           SDValue ConstFour = DAG.getConstant(4, PtrOff.getValueType());
           PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour);
         }
diff --git a/test/CodeGen/PowerPC/ppc64-smallarg.ll b/test/CodeGen/PowerPC/ppc64-smallarg.ll
new file mode 100644
index 00000000000..0d5b078e217
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc64-smallarg.ll
@@ -0,0 +1,59 @@
+; Verify that on big-endian PPC64, small structures and float arguments are
+; right-justified (least significant part) within a stack slot doubleword.
+
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+%struct.large_arg = type { [8 x i64] }
+%struct.small_arg = type { i16, i8 }
+
+@gl = common global %struct.large_arg zeroinitializer, align 8
+@gs = common global %struct.small_arg zeroinitializer, align 2
+@gf = common global float 0.000000e+00, align 4
+
+define void @callee1(%struct.small_arg* noalias nocapture sret %agg.result, %struct.large_arg* byval nocapture readnone %pad, %struct.small_arg* byval nocapture readonly %x) {
+entry:
+  %0 = bitcast %struct.small_arg* %x to i32*
+  %1 = bitcast %struct.small_arg* %agg.result to i32*
+  %2 = load i32* %0, align 2
+  store i32 %2, i32* %1, align 2
+  ret void
+}
+; CHECK: @callee1
+; CHECK: lwz {{[0-9]+}}, 124(1)
+; CHECK: blr
+
+define void @caller1() {
+entry:
+  %tmp = alloca %struct.small_arg, align 2
+  call void @test1(%struct.small_arg* sret %tmp, %struct.large_arg* byval @gl, %struct.small_arg* byval @gs)
+  ret void
+}
+; CHECK: @caller1
+; CHECK: stw {{[0-9]+}}, 124(1)
+; CHECK: bl test1
+
+declare void @test1(%struct.small_arg* sret, %struct.large_arg* byval, %struct.small_arg* byval)
+
+define float @callee2(float %pad1, float %pad2, float %pad3, float %pad4, float %pad5, float %pad6, float %pad7, float %pad8, float %pad9, float %pad10, float %pad11, float %pad12, float %pad13, float %x) {
+entry:
+  ret float %x
+}
+; CHECK: @callee2
+; CHECK: lfs {{[0-9]+}}, 156(1)
+; CHECK: blr
+
+define void @caller2() {
+entry:
+  %0 = load float* @gf, align 4
+  %call = tail call float @test2(float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float %0)
+  ret void
+}
+; CHECK: @caller2
+; CHECK: stfs {{[0-9]+}}, 156(1)
+; CHECK: bl test2
+
+declare float @test2(float, float, float, float, float, float, float, float, float, float, float, float, float, float)
+
diff --git a/test/CodeGen/PowerPC/ppc64le-smallarg.ll b/test/CodeGen/PowerPC/ppc64le-smallarg.ll
new file mode 100644
index 00000000000..fcb1e9200dd
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc64le-smallarg.ll
@@ -0,0 +1,59 @@
+; Verify that on little-endian PPC64, small structures and float arguments
+; sit at the start (least significant part) of a stack slot doubleword.
+
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+%struct.large_arg = type { [8 x i64] }
+%struct.small_arg = type { i16, i8 }
+
+@gl = common global %struct.large_arg zeroinitializer, align 8
+@gs = common global %struct.small_arg zeroinitializer, align 2
+@gf = common global float 0.000000e+00, align 4
+
+define void @callee1(%struct.small_arg* noalias nocapture sret %agg.result, %struct.large_arg* byval nocapture readnone %pad, %struct.small_arg* byval nocapture readonly %x) {
+entry:
+  %0 = bitcast %struct.small_arg* %x to i32*
+  %1 = bitcast %struct.small_arg* %agg.result to i32*
+  %2 = load i32* %0, align 2
+  store i32 %2, i32* %1, align 2
+  ret void
+}
+; CHECK: @callee1
+; CHECK: lwz {{[0-9]+}}, 120(1)
+; CHECK: blr
+
+define void @caller1() {
+entry:
+  %tmp = alloca %struct.small_arg, align 2
+  call void @test1(%struct.small_arg* sret %tmp, %struct.large_arg* byval @gl, %struct.small_arg* byval @gs)
+  ret void
+}
+; CHECK: @caller1
+; CHECK: stw {{[0-9]+}}, 120(1)
+; CHECK: bl test1
+
+declare void @test1(%struct.small_arg* sret, %struct.large_arg* byval, %struct.small_arg* byval)
+
+define float @callee2(float %pad1, float %pad2, float %pad3, float %pad4, float %pad5, float %pad6, float %pad7, float %pad8, float %pad9, float %pad10, float %pad11, float %pad12, float %pad13, float %x) {
+entry:
+  ret float %x
+}
+; CHECK: @callee2
+; CHECK: lfs {{[0-9]+}}, 152(1)
+; CHECK: blr
+
+define void @caller2() {
+entry:
+  %0 = load float* @gf, align 4
+  %call = tail call float @test2(float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float %0)
+  ret void
+}
+; CHECK: @caller2
+; CHECK: stfs {{[0-9]+}}, 152(1)
+; CHECK: bl test2
+
+declare float @test2(float, float, float, float, float, float, float, float, float, float, float, float, float, float)
+