
[X86] Disable copy elision in LowerMemArgument for scalarized vectors when the loc VT is a different size than the original element.

For example, a v4f16 argument is scalarized to 4 i32 values, so the
values are spread out in memory instead of being packed tightly as
they are in the original vector.

Fixes PR47000.
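
As a standalone illustration of the layout mismatch described above (a sketch added for clarity, not part of the commit): the elements of a packed <4 x half> sit 2 bytes apart, while the same elements scalarized to four i32 parts each occupy their own 4-byte slot, so reading at the packed offsets lands in the padding between parts.

// Standalone sketch (not from the commit): byte offsets of each element in
// a packed <4 x half> versus the four i32 argument slots it is scalarized
// into. Copy elision would read at the packed offsets, which no longer
// match where the widened parts actually live.
#include <cstdio>

int main() {
  for (int Elt = 0; Elt < 4; ++Elt)
    std::printf("element %d: packed offset %d, i32-slot offset %d\n",
                Elt, Elt * 2, Elt * 4);
  return 0;
}
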
Author: Craig Topper
Date:   2020-08-05 15:35:16 -07:00
Commit: a04fa612fc (parent ce39c28b26)

2 changed files with 18 additions and 10 deletions

llvm/lib/Target/X86/X86ISelLowering.cpp

@@ -3203,13 +3203,23 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
     return DAG.getFrameIndex(FI, PtrVT);
   }
 
+  EVT ArgVT = Ins[i].ArgVT;
+
+  // If this is a vector that has been split into multiple parts, and the
+  // scalar size of the parts don't match the vector element size, then we can't
+  // elide the copy. The parts will have padding between them instead of being
+  // packed like a vector.
+  bool ScalarizedAndExtendedVector =
+      ArgVT.isVector() && !VA.getLocVT().isVector() &&
+      VA.getLocVT().getSizeInBits() != ArgVT.getScalarSizeInBits();
+
   // This is an argument in memory. We might be able to perform copy elision.
   // If the argument is passed directly in memory without any extension, then we
   // can perform copy elision. Large vector types, for example, may be passed
   // indirectly by pointer.
   if (Flags.isCopyElisionCandidate() &&
-      VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem) {
-    EVT ArgVT = Ins[i].ArgVT;
+      VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem &&
+      !ScalarizedAndExtendedVector) {
     SDValue PartAddr;
     if (Ins[i].PartOffset == 0) {
       // If this is a one-part value or the first part of a multi-part value,
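
The ScalarizedAndExtendedVector check added above reduces to comparing the location size of each part with the original element size. A plain-integer mirror of that condition for the PR47000 case (an illustrative sketch, not LLVM's EVT/MVT types):

// Hypothetical stand-in values for the PR47000 case: ArgVT = v4f16,
// LocVT = i32 for each scalarized part. With a 32-bit location holding a
// 16-bit element, the sizes differ and copy elision must be skipped.
#include <cstdio>

int main() {
  const bool ArgVTIsVector = true;     // ArgVT.isVector()
  const bool LocVTIsVector = false;    // VA.getLocVT().isVector()
  const unsigned ArgScalarBits = 16;   // ArgVT.getScalarSizeInBits() for f16
  const unsigned LocSizeBits = 32;     // VA.getLocVT().getSizeInBits() for i32

  const bool ScalarizedAndExtendedVector =
      ArgVTIsVector && !LocVTIsVector && LocSizeBits != ArgScalarBits;

  std::printf("skip copy elision: %s\n",
              ScalarizedAndExtendedVector ? "yes" : "no"); // prints "yes"
  return 0;
}
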

llvm/test/CodeGen/X86/pr47000.ll

@@ -16,17 +16,15 @@ define <4 x half> @doTheTestMod(<4 x half> %0, <4 x half> %1) nounwind {
; CHECK-NEXT: movl %eax, %ecx
; CHECK-NEXT: movw 176(%esp), %dx
; CHECK-NEXT: movw 172(%esp), %si
; CHECK-NEXT: movw 164(%esp), %di
; CHECK-NEXT: movw 166(%esp), %bx
; CHECK-NEXT: movw 168(%esp), %di
; CHECK-NEXT: movw 164(%esp), %bx
; CHECK-NEXT: movw 160(%esp), %bp
; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: movw 156(%esp), %ax
; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
; CHECK-NEXT: movw 152(%esp), %ax
; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
; CHECK-NEXT: movw 148(%esp), %ax
; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
; CHECK-NEXT: movw 150(%esp), %ax
; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
; CHECK-NEXT: movw {{[-0-9]+}}(%e{{[sb]}}p), %ax # 2-byte Reload
; CHECK-NEXT: movw %ax, 112(%esp)
; CHECK-NEXT: movw {{[-0-9]+}}(%e{{[sb]}}p), %ax # 2-byte Reload
; CHECK-NEXT: movw %ax, 114(%esp)
@@ -35,8 +33,8 @@ define <4 x half> @doTheTestMod(<4 x half> %0, <4 x half> %1) nounwind {
; CHECK-NEXT: movw %bp, 118(%esp)
; CHECK-NEXT: movw %dx, 110(%esp)
; CHECK-NEXT: movw %si, 108(%esp)
; CHECK-NEXT: movw %bx, 106(%esp)
; CHECK-NEXT: movw %di, 104(%esp)
; CHECK-NEXT: movw %di, 106(%esp)
; CHECK-NEXT: movw %bx, 104(%esp)
; CHECK-NEXT: movzwl 118(%esp), %edx
; CHECK-NEXT: movzwl 116(%esp), %esi
; CHECK-NEXT: movzwl 114(%esp), %edi