From 257fa900018e20955270caea393a3470b98926a6 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Wed, 2 Jan 2019 23:24:08 +0000
Subject: [PATCH] [X86] Add load folding support to the custom isel we do for
 X86ISD::UMUL/SMUL.

The peephole pass isn't always able to fold the load because it can't
commute the implicit usage of AL/AX/EAX/RAX.

llvm-svn: 350272
---
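Illustration (adapted from the smuloi8_load diff below; the IR here is a
simplified sketch of that test, not its exact body). For IR along these
lines, where %ptr1 arrives in %rdi and %v2 in %esi:

    %v1 = load i8, i8* %ptr1
    %t  = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 %v1, i8 %v2)

the old selection used the register form and left the load to the peephole
pass, which cannot fold it because that would require commuting the operand
that is implicitly tied to AL:

    movb    (%rdi), %al         ## load stays a separate instruction
    imulb   %sil                ## AL * SIL

Folding the load during isel instead puts the register operand in AL and
makes the loaded value the explicit memory operand:

    movl    %esi, %eax          ## register operand goes to AL
    imulb   (%rdi)              ## AL * mem, load folded
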
 lib/Target/X86/X86ISelDAGToDAG.cpp     | 70 ++++++++++++++++++++------
 test/CodeGen/X86/umul-with-overflow.ll |  5 +-
 test/CodeGen/X86/xmulo.ll              | 38 +++++++-------
 3 files changed, 76 insertions(+), 37 deletions(-)

diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 72439946771..22ef9b08a76 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -3454,31 +3454,73 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
     SDValue N0 = Node->getOperand(0);
     SDValue N1 = Node->getOperand(1);
 
-    unsigned LoReg, Opc;
+    unsigned LoReg, ROpc, MOpc;
     switch (NVT.SimpleTy) {
     default: llvm_unreachable("Unsupported VT!");
     case MVT::i8:
       LoReg = X86::AL;
-      Opc = Opcode == X86ISD::SMUL ? X86::IMUL8r : X86::MUL8r;
+      ROpc = Opcode == X86ISD::SMUL ? X86::IMUL8r : X86::MUL8r;
+      MOpc = Opcode == X86ISD::SMUL ? X86::IMUL8m : X86::MUL8m;
       break;
-    case MVT::i16: LoReg = X86::AX;  Opc = X86::MUL16r; break;
-    case MVT::i32: LoReg = X86::EAX; Opc = X86::MUL32r; break;
-    case MVT::i64: LoReg = X86::RAX; Opc = X86::MUL64r; break;
+    case MVT::i16:
+      LoReg = X86::AX;
+      ROpc = X86::MUL16r;
+      MOpc = X86::MUL16m;
+      break;
+    case MVT::i32:
+      LoReg = X86::EAX;
+      ROpc = X86::MUL32r;
+      MOpc = X86::MUL32m;
+      break;
+    case MVT::i64:
+      LoReg = X86::RAX;
+      ROpc = X86::MUL64r;
+      MOpc = X86::MUL64m;
+      break;
+    }
+
+    SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+    bool FoldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
+    // Multiply is commmutative.
+    if (!FoldedLoad) {
+      FoldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
+      if (FoldedLoad)
+        std::swap(N0, N1);
     }
 
     SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
                                           N0, SDValue()).getValue(1);
 
-    // i16/i32/i64 use an instruction that produces a low and high result even
-    // though only the low result is used.
-    SDVTList VTs;
-    if (NVT == MVT::i8)
-      VTs = CurDAG->getVTList(NVT, MVT::i32);
-    else
-      VTs = CurDAG->getVTList(NVT, NVT, MVT::i32);
+    MachineSDNode *CNode;
+    if (FoldedLoad) {
+      // i16/i32/i64 use an instruction that produces a low and high result even
+      // though only the low result is used.
+      SDVTList VTs;
+      if (NVT == MVT::i8)
+        VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
+      else
+        VTs = CurDAG->getVTList(NVT, NVT, MVT::i32, MVT::Other);
+
+      SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
+                        InFlag };
+      CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
+
+      // Update the chain.
+      ReplaceUses(N1.getValue(1), SDValue(CNode, NVT == MVT::i8 ? 2 : 3));
+      // Record the mem-refs
+      CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
+    } else {
+      // i16/i32/i64 use an instruction that produces a low and high result even
+      // though only the low result is used.
+      SDVTList VTs;
+      if (NVT == MVT::i8)
+        VTs = CurDAG->getVTList(NVT, MVT::i32);
+      else
+        VTs = CurDAG->getVTList(NVT, NVT, MVT::i32);
+
+      CNode = CurDAG->getMachineNode(ROpc, dl, VTs, {N1, InFlag});
+    }
 
-    SDValue Ops[] = {N1, InFlag};
-    SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
     ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
     ReplaceUses(SDValue(Node, 1), SDValue(CNode, NVT == MVT::i8 ? 1 : 2));
     CurDAG->RemoveDeadNode(Node);
diff --git a/test/CodeGen/X86/umul-with-overflow.ll b/test/CodeGen/X86/umul-with-overflow.ll
index 72b1fcceb92..64a8933346e 100644
--- a/test/CodeGen/X86/umul-with-overflow.ll
+++ b/test/CodeGen/X86/umul-with-overflow.ll
@@ -7,9 +7,8 @@ declare {i32, i1} @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
 define zeroext i1 @a(i32 %x) nounwind {
 ; X86-LABEL: a:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl $3, %ecx
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl $3, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    seto %al
 ; X86-NEXT:    retl
 ;
diff --git a/test/CodeGen/X86/xmulo.ll b/test/CodeGen/X86/xmulo.ll
index 87dba8c87b5..86b15cce036 100644
--- a/test/CodeGen/X86/xmulo.ll
+++ b/test/CodeGen/X86/xmulo.ll
@@ -725,8 +725,9 @@ define i1 @bug27873(i64 %c1, i1 %c2) {
 define zeroext i1 @smuloi8_load(i8* %ptr1, i8 %v2, i8* %res) {
 ; SDAG-LABEL: smuloi8_load:
 ; SDAG:       ## %bb.0:
-; SDAG-NEXT:    movb (%rdi), %al
-; SDAG-NEXT:    imulb %sil
+; SDAG-NEXT:    movl %esi, %eax
+; SDAG-NEXT:    ## kill: def $al killed $al killed $eax
+; SDAG-NEXT:    imulb (%rdi)
 ; SDAG-NEXT:    seto %cl
 ; SDAG-NEXT:    movb %al, (%rdx)
 ; SDAG-NEXT:    movl %ecx, %eax
@@ -753,9 +754,8 @@ define zeroext i1 @smuloi8_load2(i8 %v1, i8* %ptr2, i8* %res) {
 ; SDAG-LABEL: smuloi8_load2:
 ; SDAG:       ## %bb.0:
 ; SDAG-NEXT:    movl %edi, %eax
-; SDAG-NEXT:    movb (%rsi), %cl
 ; SDAG-NEXT:    ## kill: def $al killed $al killed $eax
-; SDAG-NEXT:    imulb %cl
+; SDAG-NEXT:    imulb (%rsi)
 ; SDAG-NEXT:    seto %cl
 ; SDAG-NEXT:    movb %al, (%rdx)
 ; SDAG-NEXT:    movl %ecx, %eax
@@ -926,8 +926,9 @@ define zeroext i1 @smuloi64_load2(i64 %v1, i64* %ptr2, i64* %res) {
 define zeroext i1 @umuloi8_load(i8* %ptr1, i8 %v2, i8* %res) {
 ; SDAG-LABEL: umuloi8_load:
 ; SDAG:       ## %bb.0:
-; SDAG-NEXT:    movb (%rdi), %al
-; SDAG-NEXT:    mulb %sil
+; SDAG-NEXT:    movl %esi, %eax
+; SDAG-NEXT:    ## kill: def $al killed $al killed $eax
+; SDAG-NEXT:    mulb (%rdi)
 ; SDAG-NEXT:    seto %cl
 ; SDAG-NEXT:    movb %al, (%rdx)
 ; SDAG-NEXT:    movl %ecx, %eax
@@ -954,9 +955,8 @@ define zeroext i1 @umuloi8_load2(i8 %v1, i8* %ptr2, i8* %res) {
 ; SDAG-LABEL: umuloi8_load2:
 ; SDAG:       ## %bb.0:
 ; SDAG-NEXT:    movl %edi, %eax
-; SDAG-NEXT:    movb (%rsi), %cl
 ; SDAG-NEXT:    ## kill: def $al killed $al killed $eax
-; SDAG-NEXT:    mulb %cl
+; SDAG-NEXT:    mulb (%rsi)
 ; SDAG-NEXT:    seto %cl
 ; SDAG-NEXT:    movb %al, (%rdx)
 ; SDAG-NEXT:    movl %ecx, %eax
@@ -984,8 +984,9 @@ define zeroext i1 @umuloi16_load(i16* %ptr1, i16 %v2, i16* %res) {
 ; SDAG-LABEL: umuloi16_load:
 ; SDAG:       ## %bb.0:
 ; SDAG-NEXT:    movq %rdx, %rcx
-; SDAG-NEXT:    movzwl (%rdi), %eax
-; SDAG-NEXT:    mulw %si
+; SDAG-NEXT:    movl %esi, %eax
+; SDAG-NEXT:    ## kill: def $ax killed $ax killed $eax
+; SDAG-NEXT:    mulw (%rdi)
 ; SDAG-NEXT:    seto %dl
 ; SDAG-NEXT:    movw %ax, (%rcx)
 ; SDAG-NEXT:    movl %edx, %eax
@@ -1014,9 +1015,8 @@ define zeroext i1 @umuloi16_load2(i16 %v1, i16* %ptr2, i16* %res) {
 ; SDAG:       ## %bb.0:
 ; SDAG-NEXT:    movq %rdx, %rcx
 ; SDAG-NEXT:    movl %edi, %eax
-; SDAG-NEXT:    movzwl (%rsi), %edx
 ; SDAG-NEXT:    ## kill: def $ax killed $ax killed $eax
-; SDAG-NEXT:    mulw %dx
+; SDAG-NEXT:    mulw (%rsi)
 ; SDAG-NEXT:    seto %dl
 ; SDAG-NEXT:    movw %ax, (%rcx)
 ; SDAG-NEXT:    movl %edx, %eax
@@ -1045,8 +1045,8 @@ define zeroext i1 @umuloi32_load(i32* %ptr1, i32 %v2, i32* %res) {
 ; SDAG-LABEL: umuloi32_load:
 ; SDAG:       ## %bb.0:
 ; SDAG-NEXT:    movq %rdx, %rcx
-; SDAG-NEXT:    movl (%rdi), %eax
-; SDAG-NEXT:    mull %esi
+; SDAG-NEXT:    movl %esi, %eax
+; SDAG-NEXT:    mull (%rdi)
 ; SDAG-NEXT:    seto %dl
 ; SDAG-NEXT:    movl %eax, (%rcx)
 ; SDAG-NEXT:    movl %edx, %eax
@@ -1075,8 +1075,7 @@ define zeroext i1 @umuloi32_load2(i32 %v1, i32* %ptr2, i32* %res) {
 ; SDAG:       ## %bb.0:
 ; SDAG-NEXT:    movq %rdx, %rcx
 ; SDAG-NEXT:    movl %edi, %eax
-; SDAG-NEXT:    movl (%rsi), %edx
-; SDAG-NEXT:    mull %edx
+; SDAG-NEXT:    mull (%rsi)
 ; SDAG-NEXT:    seto %dl
 ; SDAG-NEXT:    movl %eax, (%rcx)
 ; SDAG-NEXT:    movl %edx, %eax
@@ -1104,8 +1103,8 @@ define zeroext i1 @umuloi64_load(i64* %ptr1, i64 %v2, i64* %res) {
 ; SDAG-LABEL: umuloi64_load:
 ; SDAG:       ## %bb.0:
 ; SDAG-NEXT:    movq %rdx, %rcx
-; SDAG-NEXT:    movq (%rdi), %rax
-; SDAG-NEXT:    mulq %rsi
+; SDAG-NEXT:    movq %rsi, %rax
+; SDAG-NEXT:    mulq (%rdi)
 ; SDAG-NEXT:    seto %dl
 ; SDAG-NEXT:    movq %rax, (%rcx)
 ; SDAG-NEXT:    movl %edx, %eax
@@ -1134,8 +1133,7 @@ define zeroext i1 @umuloi64_load2(i64 %v1, i64* %ptr2, i64* %res) {
 ; SDAG:       ## %bb.0:
 ; SDAG-NEXT:    movq %rdx, %rcx
 ; SDAG-NEXT:    movq %rdi, %rax
-; SDAG-NEXT:    movq (%rsi), %rdx
-; SDAG-NEXT:    mulq %rdx
+; SDAG-NEXT:    mulq (%rsi)
 ; SDAG-NEXT:    seto %dl
 ; SDAG-NEXT:    movq %rax, (%rcx)
 ; SDAG-NEXT:    movl %edx, %eax