
[SelectionDAG] Combine U{ADD,SUB}O diamonds into {ADD,SUB}CARRY

Summary:
Convert (uaddo (uaddo x, y), carryIn) into (addcarry x, y, carryIn) if and only if the carry flags of the two uaddo nodes are merged via OR or XOR.
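
For illustration only (not part of this patch), a minimal C++ sketch of the kind of two-limb addition that produces this DAG shape; the type and function names are hypothetical:

#include <cstdint>

struct U128 { uint64_t lo, hi; };

// Add two 128-bit values held as two 64-bit limbs. At most one of the two
// partial carries out of the high limb can be set, so merging them with |
// (or ^) is equivalent to a single carry chain -- the diamond that the
// combine below rewrites into ADDCARRY.
U128 add_u128(U128 a, U128 b, bool &carryOut) {
  U128 r;
  r.lo = a.lo + b.lo;
  bool carryIn = r.lo < a.lo;     // carry out of the low limb
  uint64_t sum = a.hi + b.hi;     // (uaddo A, B): PartialSum
  bool partialX = sum < a.hi;     // PartialCarryOutX
  r.hi = sum + carryIn;           // (uaddo PartialSum, CarryIn)
  bool partialY = r.hi < sum;     // PartialCarryOutY
  carryOut = partialX | partialY; // CarryOut = (or *, *)
  return r;
}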

Work remaining: match ADD, etc.
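
Matching ADD as the merge operation would be sound for the same reason OR and XOR are: the two partial carries can never both be set, so all three operators compute the same merged bit. A small hypothetical C++ check, exhaustive over the reachable inputs:

#include <cassert>

int main() {
  for (int c0 = 0; c0 <= 1; ++c0)
    for (int c1 = 0; c1 <= 1; ++c1)
      if (!(c0 && c1)) // the combine proves both carries are never set at once
        assert((c0 | c1) == (c0 ^ c1) && (c0 ^ c1) == (c0 + c1));
  return 0;
}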

Reviewers: craig.topper, RKSimon, spatel, niravd, jonpa, uweigand, deadalnix, nikic, lebedev.ri, dmgreen, chfast

Reviewed By: lebedev.ri

Subscribers: chfast, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D70079
David Zarzycki 2019-11-20 15:52:24 +02:00
parent 52ebfb3921
commit 87c9c2362c
3 changed files with 232 additions and 298 deletions

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

@@ -2802,6 +2802,96 @@ static SDValue combineADDCARRYDiamond(DAGCombiner &Combiner, SelectionDAG &DAG,
  return SDValue();
}

// If we are facing some sort of diamond carry/borrow in/out pattern try to
// match patterns like:
//
//          (uaddo A, B)            CarryIn
//            |  \                     |
//            |   \                    |
//    PartialSum   PartialCarryOutX   /
//            |        |             /
//            |    ____|____________/
//            |   /    |
//     (uaddo *, *)    \________
//       |  \                   \
//       |   \                   |
//       |    PartialCarryOutY   |
//       |         \             |
//       |          \           /
//   AddCarrySum    |     ______/
//                  |    /
//   CarryOut = (or *, *)
//
// And generate ADDCARRY (or SUBCARRY) with two result values:
//
//    {AddCarrySum, CarryOut} = (addcarry A, B, CarryIn)
//
// Our goal is to identify A, B, and CarryIn and produce ADDCARRY/SUBCARRY with
// a single path for carry/borrow out propagation.
static SDValue combineCarryDiamond(DAGCombiner &Combiner, SelectionDAG &DAG,
                                   const TargetLowering &TLI, SDValue Carry0,
                                   SDValue Carry1, SDNode *N) {
  if (Carry0.getResNo() != 1 || Carry1.getResNo() != 1)
    return SDValue();
  unsigned Opcode = Carry0.getOpcode();
  if (Opcode != Carry1.getOpcode())
    return SDValue();
  if (Opcode != ISD::UADDO && Opcode != ISD::USUBO)
    return SDValue();

  // Canonicalize the add/sub of A and B as Carry0 and the add/sub of the
  // carry/borrow in as Carry1. (The top and middle uaddo nodes respectively
  // in the above ASCII art.)
  if (Carry1.getOperand(0) != Carry0.getValue(0) &&
      Carry1.getOperand(1) != Carry0.getValue(0))
    std::swap(Carry0, Carry1);
  if (Carry1.getOperand(0) != Carry0.getValue(0) &&
      Carry1.getOperand(1) != Carry0.getValue(0))
    return SDValue();

  // The carry-in value must be on the right-hand side for subtraction.
  unsigned CarryInOperandNum =
      Carry1.getOperand(0) == Carry0.getValue(0) ? 1 : 0;
  if (Opcode == ISD::USUBO && CarryInOperandNum != 1)
    return SDValue();
  SDValue CarryIn = Carry1.getOperand(CarryInOperandNum);

  unsigned NewOp = Opcode == ISD::UADDO ? ISD::ADDCARRY : ISD::SUBCARRY;
  if (!TLI.isOperationLegalOrCustom(NewOp, Carry0.getValue(0).getValueType()))
    return SDValue();

  // Verify that the carry/borrow in is plausibly a carry/borrow bit.
  // TODO: make getAsCarry() aware of how partial carries are merged.
  if (CarryIn.getOpcode() != ISD::ZERO_EXTEND)
    return SDValue();
  CarryIn = CarryIn.getOperand(0);
  if (CarryIn.getValueType() != MVT::i1)
    return SDValue();

  SDLoc DL(N);
  SDValue Merged =
      DAG.getNode(NewOp, DL, Carry1->getVTList(), Carry0.getOperand(0),
                  Carry0.getOperand(1), CarryIn);

  // Please note that because we have proven that the result of the UADDO/USUBO
  // of A and B feeds into the UADDO/USUBO that does the carry/borrow in, if
  // the first UADDO/USUBO overflows, then the second one cannot. For example,
  // consider 8-bit numbers where 0xFF is the maximum value:
  //
  //   0xFF + 0xFF == 0xFE with carry, but 0xFE + 1 does not carry
  //   0x00 - 0xFF == 0x01 with borrow, but 0x01 - 1 == 0x00 does not borrow
  //
  // This matters because it means that OR and XOR can be used to merge the
  // carry flags, and that AND can be folded to a constant zero.
  //
  // TODO: match other operations that can merge flags (ADD, etc)
  DAG.ReplaceAllUsesOfValueWith(Carry1.getValue(0), Merged.getValue(0));
  if (N->getOpcode() == ISD::AND)
    return DAG.getConstant(0, DL, MVT::i1);
  return Merged.getValue(1);
}

SDValue DAGCombiner::visitADDCARRYLike(SDValue N0, SDValue N1, SDValue CarryIn,
                                       SDNode *N) {
  // fold (addcarry (xor a, -1), b, c) -> (subcarry b, a, !c) and flip carry.
@@ -5093,6 +5183,9 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
  if (SDValue Shuffle = XformToShuffleWithZero(N))
    return Shuffle;
  if (SDValue Combined = combineCarryDiamond(*this, DAG, TLI, N0, N1, N))
    return Combined;

  // fold (and (or x, C), D) -> D if (C & D) == D
  auto MatchSubset = [](ConstantSDNode *LHS, ConstantSDNode *RHS) {
    return RHS->getAPIntValue().isSubsetOf(LHS->getAPIntValue());
@@ -5787,6 +5880,9 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
  if (SDValue Combined = visitORLike(N0, N1, N))
    return Combined;
  if (SDValue Combined = combineCarryDiamond(*this, DAG, TLI, N0, N1, N))
    return Combined;

  // Recognize halfword bswaps as (bswap + rotl 16) or (bswap + shl 16)
  if (SDValue BSwap = MatchBSwapHWord(N, N0, N1))
    return BSwap;
@@ -7049,6 +7145,9 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
  if (SimplifyDemandedBits(SDValue(N, 0)))
    return SDValue(N, 0);
  if (SDValue Combined = combineCarryDiamond(*this, DAG, TLI, N0, N1, N))
    return Combined;

  return SDValue();
}

llvm/test/CodeGen/X86/addcarry.ll

@@ -511,40 +511,13 @@ define i32 @add_U320_without_i128_add(%struct.U320* nocapture dereferenceable(40
define i32 @add_U320_without_i128_or(%struct.U320* nocapture dereferenceable(40) %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5) {
; CHECK-LABEL: add_U320_without_i128_or:
; CHECK: # %bb.0:
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset %rbx, -16
; CHECK-NEXT: addq 8(%rdi), %rdx
; CHECK-NEXT: setb %r10b
; CHECK-NEXT: addq %rsi, (%rdi)
; CHECK-NEXT: adcq $0, %rdx
; CHECK-NEXT: adcq %rdx, 8(%rdi)
; CHECK-NEXT: adcq %rcx, 16(%rdi)
; CHECK-NEXT: adcq %r8, 24(%rdi)
; CHECK-NEXT: adcq %r9, 32(%rdi)
; CHECK-NEXT: setb %al
; CHECK-NEXT: addq 16(%rdi), %rcx
; CHECK-NEXT: setb %r11b
; CHECK-NEXT: orb %r10b, %al
; CHECK-NEXT: movzbl %al, %ebx
; CHECK-NEXT: addq %rcx, %rbx
; CHECK-NEXT: setb %cl
; CHECK-NEXT: addq 24(%rdi), %r8
; CHECK-NEXT: setb %r10b
; CHECK-NEXT: orb %r11b, %cl
; CHECK-NEXT: movzbl %cl, %esi
; CHECK-NEXT: addq %r8, %rsi
; CHECK-NEXT: setb %al
; CHECK-NEXT: addq 32(%rdi), %r9
; CHECK-NEXT: setb %r8b
; CHECK-NEXT: orb %r10b, %al
; CHECK-NEXT: movzbl %al, %eax
; CHECK-NEXT: addq %r9, %rax
; CHECK-NEXT: setb %cl
; CHECK-NEXT: movq %rdx, 8(%rdi)
; CHECK-NEXT: movq %rbx, 16(%rdi)
; CHECK-NEXT: movq %rsi, 24(%rdi)
; CHECK-NEXT: movq %rax, 32(%rdi)
; CHECK-NEXT: orb %r8b, %cl
; CHECK-NEXT: movzbl %cl, %eax
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
%7 = getelementptr inbounds %struct.U320, %struct.U320* %0, i64 0, i32 0, i64 0
%8 = load i64, i64* %7, align 8
@@ -594,40 +567,13 @@ define i32 @add_U320_without_i128_or(%struct.U320* nocapture dereferenceable(40)
define i32 @add_U320_without_i128_xor(%struct.U320* nocapture dereferenceable(40) %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5) {
; CHECK-LABEL: add_U320_without_i128_xor:
; CHECK: # %bb.0:
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset %rbx, -16
; CHECK-NEXT: addq 8(%rdi), %rdx
; CHECK-NEXT: setb %r10b
; CHECK-NEXT: addq %rsi, (%rdi)
; CHECK-NEXT: adcq $0, %rdx
; CHECK-NEXT: adcq %rdx, 8(%rdi)
; CHECK-NEXT: adcq %rcx, 16(%rdi)
; CHECK-NEXT: adcq %r8, 24(%rdi)
; CHECK-NEXT: adcq %r9, 32(%rdi)
; CHECK-NEXT: setb %al
; CHECK-NEXT: addq 16(%rdi), %rcx
; CHECK-NEXT: setb %r11b
; CHECK-NEXT: xorb %r10b, %al
; CHECK-NEXT: movzbl %al, %ebx
; CHECK-NEXT: addq %rcx, %rbx
; CHECK-NEXT: setb %cl
; CHECK-NEXT: addq 24(%rdi), %r8
; CHECK-NEXT: setb %r10b
; CHECK-NEXT: xorb %r11b, %cl
; CHECK-NEXT: movzbl %cl, %esi
; CHECK-NEXT: addq %r8, %rsi
; CHECK-NEXT: setb %al
; CHECK-NEXT: addq 32(%rdi), %r9
; CHECK-NEXT: setb %r8b
; CHECK-NEXT: xorb %r10b, %al
; CHECK-NEXT: movzbl %al, %eax
; CHECK-NEXT: addq %r9, %rax
; CHECK-NEXT: setb %cl
; CHECK-NEXT: movq %rdx, 8(%rdi)
; CHECK-NEXT: movq %rbx, 16(%rdi)
; CHECK-NEXT: movq %rsi, 24(%rdi)
; CHECK-NEXT: movq %rax, 32(%rdi)
; CHECK-NEXT: xorb %r8b, %cl
; CHECK-NEXT: movzbl %cl, %eax
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
%7 = getelementptr inbounds %struct.U320, %struct.U320* %0, i64 0, i32 0, i64 0
%8 = load i64, i64* %7, align 8
@@ -674,34 +620,71 @@ define i32 @add_U320_without_i128_xor(%struct.U320* nocapture dereferenceable(40
ret i32 %43
}
; Either the primary addition can overflow or the addition of the carry, but
; they cannot both overflow.
define i32 @bogus_add_U320_without_i128_and(%struct.U320* nocapture dereferenceable(40) %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5) {
; CHECK-LABEL: bogus_add_U320_without_i128_and:
; CHECK: # %bb.0:
; CHECK-NEXT: addq %rsi, (%rdi)
; CHECK-NEXT: adcq %rdx, 8(%rdi)
; CHECK-NEXT: addq %rcx, 16(%rdi)
; CHECK-NEXT: addq %r8, 24(%rdi)
; CHECK-NEXT: addq %r9, 32(%rdi)
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: retq
%7 = getelementptr inbounds %struct.U320, %struct.U320* %0, i64 0, i32 0, i64 0
%8 = load i64, i64* %7, align 8
%9 = getelementptr inbounds %struct.U320, %struct.U320* %0, i64 0, i32 0, i64 1
%10 = load i64, i64* %9, align 8
%11 = getelementptr inbounds %struct.U320, %struct.U320* %0, i64 0, i32 0, i64 2
%12 = load i64, i64* %11, align 8
%13 = getelementptr inbounds %struct.U320, %struct.U320* %0, i64 0, i32 0, i64 3
%14 = load i64, i64* %13, align 8
%15 = getelementptr inbounds %struct.U320, %struct.U320* %0, i64 0, i32 0, i64 4
%16 = load i64, i64* %15, align 8
%17 = add i64 %8, %1
%18 = add i64 %10, %2
%19 = icmp ult i64 %17, %1
%20 = zext i1 %19 to i64
%21 = add i64 %18, %20
%22 = add i64 %12, %3
%23 = icmp ult i64 %18, %10
%24 = icmp ult i64 %21, %18
%25 = and i1 %23, %24
%26 = zext i1 %25 to i64
%27 = add i64 %22, %26
%28 = add i64 %14, %4
%29 = icmp ult i64 %22, %12
%30 = icmp ult i64 %27, %22
%31 = and i1 %29, %30
%32 = zext i1 %31 to i64
%33 = add i64 %28, %32
%34 = add i64 %16, %5
%35 = icmp ult i64 %28, %14
%36 = icmp ult i64 %33, %28
%37 = and i1 %35, %36
%38 = zext i1 %37 to i64
%39 = add i64 %34, %38
store i64 %17, i64* %7, align 8
store i64 %21, i64* %9, align 8
store i64 %27, i64* %11, align 8
store i64 %33, i64* %13, align 8
store i64 %39, i64* %15, align 8
%40 = icmp ult i64 %34, %16
%41 = icmp ult i64 %39, %34
%42 = and i1 %40, %41
%43 = zext i1 %42 to i32
ret i32 %43
}
define void @add_U320_without_i128_or_no_ret(%struct.U320* nocapture dereferenceable(40) %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5) {
; CHECK-LABEL: add_U320_without_i128_or_no_ret:
; CHECK: # %bb.0:
; CHECK-NEXT: addq 8(%rdi), %rdx
; CHECK-NEXT: setb %r10b
; CHECK-NEXT: addq %rsi, (%rdi)
; CHECK-NEXT: adcq $0, %rdx
; CHECK-NEXT: setb %al
; CHECK-NEXT: addq 16(%rdi), %rcx
; CHECK-NEXT: setb %r11b
; CHECK-NEXT: orb %r10b, %al
; CHECK-NEXT: movzbl %al, %esi
; CHECK-NEXT: addq %rcx, %rsi
; CHECK-NEXT: setb %cl
; CHECK-NEXT: addq 24(%rdi), %r8
; CHECK-NEXT: setb %r10b
; CHECK-NEXT: orb %r11b, %cl
; CHECK-NEXT: movzbl %cl, %ecx
; CHECK-NEXT: addq %r8, %rcx
; CHECK-NEXT: setb %al
; CHECK-NEXT: addq 32(%rdi), %r9
; CHECK-NEXT: orb %r10b, %al
; CHECK-NEXT: movzbl %al, %eax
; CHECK-NEXT: addq %r9, %rax
; CHECK-NEXT: movq %rdx, 8(%rdi)
; CHECK-NEXT: movq %rsi, 16(%rdi)
; CHECK-NEXT: movq %rcx, 24(%rdi)
; CHECK-NEXT: movq %rax, 32(%rdi)
; CHECK-NEXT: adcq %rdx, 8(%rdi)
; CHECK-NEXT: adcq %rcx, 16(%rdi)
; CHECK-NEXT: adcq %r8, 24(%rdi)
; CHECK-NEXT: adcq %r9, 32(%rdi)
; CHECK-NEXT: retq
%7 = getelementptr inbounds %struct.U320, %struct.U320* %0, i64 0, i32 0, i64 0
%8 = load i64, i64* %7, align 8
@@ -747,34 +730,12 @@ define void @add_U320_without_i128_or_no_ret(%struct.U320* nocapture dereference
define i32 @add_U320_uaddo(%struct.U320* nocapture dereferenceable(40) %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5) {
; CHECK-LABEL: add_U320_uaddo:
; CHECK: # %bb.0:
; CHECK-NEXT: addq 8(%rdi), %rdx
; CHECK-NEXT: setb %r10b
; CHECK-NEXT: addq %rsi, (%rdi)
; CHECK-NEXT: adcq $0, %rdx
; CHECK-NEXT: adcq %rdx, 8(%rdi)
; CHECK-NEXT: adcq %rcx, 16(%rdi)
; CHECK-NEXT: adcq %r8, 24(%rdi)
; CHECK-NEXT: adcq %r9, 32(%rdi)
; CHECK-NEXT: setb %al
; CHECK-NEXT: orb %r10b, %al
; CHECK-NEXT: movzbl %al, %esi
; CHECK-NEXT: addq 16(%rdi), %rcx
; CHECK-NEXT: setb %r10b
; CHECK-NEXT: addq %rsi, %rcx
; CHECK-NEXT: setb %al
; CHECK-NEXT: orb %r10b, %al
; CHECK-NEXT: movzbl %al, %esi
; CHECK-NEXT: addq 24(%rdi), %r8
; CHECK-NEXT: setb %r10b
; CHECK-NEXT: addq %rsi, %r8
; CHECK-NEXT: setb %al
; CHECK-NEXT: orb %r10b, %al
; CHECK-NEXT: movzbl %al, %esi
; CHECK-NEXT: addq 32(%rdi), %r9
; CHECK-NEXT: setb %r10b
; CHECK-NEXT: addq %rsi, %r9
; CHECK-NEXT: setb %al
; CHECK-NEXT: orb %r10b, %al
; CHECK-NEXT: movq %rdx, 8(%rdi)
; CHECK-NEXT: movq %rcx, 16(%rdi)
; CHECK-NEXT: movq %r8, 24(%rdi)
; CHECK-NEXT: movq %r9, 32(%rdi)
; CHECK-NEXT: movzbl %al, %eax
; CHECK-NEXT: retq
%7 = getelementptr inbounds %struct.U320, %struct.U320* %0, i64 0, i32 0, i64 0
@@ -838,22 +799,14 @@ define void @PR39464(%struct.U192* noalias nocapture sret %0, %struct.U192* noca
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: movq (%rsi), %rcx
; CHECK-NEXT: movq (%rdx), %r8
; CHECK-NEXT: leaq (%rcx,%r8), %rdi
; CHECK-NEXT: movq %rdi, (%rax)
; CHECK-NEXT: movq 8(%rsi), %rdi
; CHECK-NEXT: addq 8(%rdx), %rdi
; CHECK-NEXT: setb %r9b
; CHECK-NEXT: addq %r8, %rcx
; CHECK-NEXT: adcq $0, %rdi
; CHECK-NEXT: setb %cl
; CHECK-NEXT: orb %r9b, %cl
; CHECK-NEXT: movzbl %cl, %ecx
; CHECK-NEXT: movq %rdi, 8(%rax)
; CHECK-NEXT: movq 16(%rsi), %rsi
; CHECK-NEXT: addq 16(%rdx), %rsi
; CHECK-NEXT: addq %rcx, %rsi
; CHECK-NEXT: movq %rsi, 16(%rax)
; CHECK-NEXT: addq (%rdx), %rcx
; CHECK-NEXT: movq %rcx, (%rdi)
; CHECK-NEXT: movq 8(%rsi), %rcx
; CHECK-NEXT: adcq 8(%rdx), %rcx
; CHECK-NEXT: movq %rcx, 8(%rdi)
; CHECK-NEXT: movq 16(%rsi), %rcx
; CHECK-NEXT: adcq 16(%rdx), %rcx
; CHECK-NEXT: movq %rcx, 16(%rdi)
; CHECK-NEXT: retq
%4 = getelementptr inbounds %struct.U192, %struct.U192* %1, i64 0, i32 0, i64 0
%5 = load i64, i64* %4, align 8
@@ -896,12 +849,9 @@ define void @PR39464(%struct.U192* noalias nocapture sret %0, %struct.U192* noca
define zeroext i1 @uaddo_U128_without_i128_or(i64 %0, i64 %1, i64 %2, i64 %3, %uint128* nocapture %4) nounwind {
; CHECK-LABEL: uaddo_U128_without_i128_or:
; CHECK: # %bb.0:
; CHECK-NEXT: addq %rcx, %rsi
; CHECK-NEXT: setb %cl
; CHECK-NEXT: addq %rdx, %rdi
; CHECK-NEXT: adcq $0, %rsi
; CHECK-NEXT: adcq %rcx, %rsi
; CHECK-NEXT: setb %al
; CHECK-NEXT: orb %cl, %al
; CHECK-NEXT: movq %rsi, (%r8)
; CHECK-NEXT: movq %rdi, 8(%r8)
; CHECK-NEXT: retq
@@ -927,18 +877,12 @@ define void @add_U192_without_i128_or(%uint192* sret %0, i64 %1, i64 %2, i64 %3,
; CHECK-LABEL: add_U192_without_i128_or:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: addq %r9, %rdx
; CHECK-NEXT: setb %dil
; CHECK-NEXT: addq %r8, %rsi
; CHECK-NEXT: adcq $0, %rdx
; CHECK-NEXT: setb %r8b
; CHECK-NEXT: orb %dil, %r8b
; CHECK-NEXT: addq {{[0-9]+}}(%rsp), %rcx
; CHECK-NEXT: movzbl %r8b, %edi
; CHECK-NEXT: addq %rcx, %rdi
; CHECK-NEXT: movq %rdi, (%rax)
; CHECK-NEXT: movq %rdx, 8(%rax)
; CHECK-NEXT: movq %rsi, 16(%rax)
; CHECK-NEXT: adcq %r9, %rdx
; CHECK-NEXT: adcq {{[0-9]+}}(%rsp), %rcx
; CHECK-NEXT: movq %rcx, (%rdi)
; CHECK-NEXT: movq %rdx, 8(%rdi)
; CHECK-NEXT: movq %rsi, 16(%rdi)
; CHECK-NEXT: retq
%8 = add i64 %4, %1
%9 = icmp ult i64 %8, %1
@@ -969,29 +913,18 @@ define void @add_U256_without_i128_or_by_i64_words(%uint256* sret %0, %uint256*
; CHECK-LABEL: add_U256_without_i128_or_by_i64_words:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: movq (%rdx), %r9
; CHECK-NEXT: movq 8(%rdx), %r10
; CHECK-NEXT: addq 8(%rsi), %r10
; CHECK-NEXT: setb %r8b
; CHECK-NEXT: addq (%rsi), %r9
; CHECK-NEXT: adcq $0, %r10
; CHECK-NEXT: setb %cl
; CHECK-NEXT: orb %r8b, %cl
; CHECK-NEXT: movq 16(%rdx), %rdi
; CHECK-NEXT: addq 16(%rsi), %rdi
; CHECK-NEXT: setb %r8b
; CHECK-NEXT: movzbl %cl, %r11d
; CHECK-NEXT: addq %rdi, %r11
; CHECK-NEXT: setb %cl
; CHECK-NEXT: orb %r8b, %cl
; CHECK-NEXT: movq (%rdx), %r8
; CHECK-NEXT: movq 8(%rdx), %rdi
; CHECK-NEXT: addq (%rsi), %r8
; CHECK-NEXT: adcq 8(%rsi), %rdi
; CHECK-NEXT: movq 16(%rdx), %rcx
; CHECK-NEXT: adcq 16(%rsi), %rcx
; CHECK-NEXT: movq 24(%rdx), %rdx
; CHECK-NEXT: addq 24(%rsi), %rdx
; CHECK-NEXT: movzbl %cl, %ecx
; CHECK-NEXT: addq %rdx, %rcx
; CHECK-NEXT: movq %rcx, (%rax)
; CHECK-NEXT: movq %r11, 8(%rax)
; CHECK-NEXT: movq %r10, 16(%rax)
; CHECK-NEXT: movq %r9, 24(%rax)
; CHECK-NEXT: adcq 24(%rsi), %rdx
; CHECK-NEXT: movq %rdx, (%rax)
; CHECK-NEXT: movq %rcx, 8(%rax)
; CHECK-NEXT: movq %rdi, 16(%rax)
; CHECK-NEXT: movq %r8, 24(%rax)
; CHECK-NEXT: retq
%4 = getelementptr inbounds %uint256, %uint256* %1, i64 0, i32 0, i32 0
%5 = load i64, i64* %4, align 8
@@ -1043,24 +976,15 @@ define void @add_U256_without_i128_or_recursive(%uint256* sret %0, %uint256* %1,
; CHECK-LABEL: add_U256_without_i128_or_recursive:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: movq (%rdx), %r9
; CHECK-NEXT: movq (%rdx), %r8
; CHECK-NEXT: movq 8(%rdx), %rdi
; CHECK-NEXT: addq 8(%rsi), %rdi
; CHECK-NEXT: setb %r8b
; CHECK-NEXT: addq (%rsi), %r9
; CHECK-NEXT: adcq $0, %rdi
; CHECK-NEXT: setb %cl
; CHECK-NEXT: orb %r8b, %cl
; CHECK-NEXT: movq 16(%rdx), %r8
; CHECK-NEXT: movq 24(%rdx), %r10
; CHECK-NEXT: xorl %edx, %edx
; CHECK-NEXT: addq 16(%rsi), %r8
; CHECK-NEXT: setb %dl
; CHECK-NEXT: addq 24(%rsi), %r10
; CHECK-NEXT: movzbl %cl, %ecx
; CHECK-NEXT: addq %r8, %rcx
; CHECK-NEXT: adcq %r10, %rdx
; CHECK-NEXT: movq %r9, (%rax)
; CHECK-NEXT: addq (%rsi), %r8
; CHECK-NEXT: adcq 8(%rsi), %rdi
; CHECK-NEXT: movq 16(%rdx), %rcx
; CHECK-NEXT: movq 24(%rdx), %rdx
; CHECK-NEXT: adcq 16(%rsi), %rcx
; CHECK-NEXT: adcq 24(%rsi), %rdx
; CHECK-NEXT: movq %r8, (%rax)
; CHECK-NEXT: movq %rdi, 8(%rax)
; CHECK-NEXT: movq %rcx, 16(%rax)
; CHECK-NEXT: movq %rdx, 24(%rax)

llvm/test/CodeGen/X86/subcarry.ll

@@ -192,51 +192,13 @@ define i64 @sub_from_carry(i64 %x, i64 %y, i64* %valout, i64 %z) {
define i32 @sub_U320_without_i128_or(%struct.U320* nocapture dereferenceable(40) %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5) {
; CHECK-LABEL: sub_U320_without_i128_or:
; CHECK: # %bb.0:
; CHECK-NEXT: pushq %r14
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: .cfi_def_cfa_offset 24
; CHECK-NEXT: .cfi_offset %rbx, -24
; CHECK-NEXT: .cfi_offset %r14, -16
; CHECK-NEXT: movq 8(%rdi), %r14
; CHECK-NEXT: movq 16(%rdi), %r10
; CHECK-NEXT: movq 24(%rdi), %r11
; CHECK-NEXT: movq 32(%rdi), %rbx
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: subq %rsi, (%rdi)
; CHECK-NEXT: sbbq %rdx, 8(%rdi)
; CHECK-NEXT: sbbq %rcx, 16(%rdi)
; CHECK-NEXT: sbbq %r8, 24(%rdi)
; CHECK-NEXT: sbbq %r9, 32(%rdi)
; CHECK-NEXT: setb %al
; CHECK-NEXT: subq %rdx, %r14
; CHECK-NEXT: setb %dl
; CHECK-NEXT: subq %rax, %r14
; CHECK-NEXT: setb %al
; CHECK-NEXT: subq %rcx, %r10
; CHECK-NEXT: setb %cl
; CHECK-NEXT: orb %dl, %al
; CHECK-NEXT: movzbl %al, %eax
; CHECK-NEXT: subq %rax, %r10
; CHECK-NEXT: setb %al
; CHECK-NEXT: subq %r8, %r11
; CHECK-NEXT: setb %dl
; CHECK-NEXT: orb %cl, %al
; CHECK-NEXT: movzbl %al, %eax
; CHECK-NEXT: subq %rax, %r11
; CHECK-NEXT: setb %al
; CHECK-NEXT: subq %r9, %rbx
; CHECK-NEXT: setb %cl
; CHECK-NEXT: orb %dl, %al
; CHECK-NEXT: movzbl %al, %eax
; CHECK-NEXT: subq %rax, %rbx
; CHECK-NEXT: setb %al
; CHECK-NEXT: movq %r14, 8(%rdi)
; CHECK-NEXT: movq %r10, 16(%rdi)
; CHECK-NEXT: movq %r11, 24(%rdi)
; CHECK-NEXT: movq %rbx, 32(%rdi)
; CHECK-NEXT: orb %cl, %al
; CHECK-NEXT: movzbl %al, %eax
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: popq %r14
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
%7 = getelementptr inbounds %struct.U320, %struct.U320* %0, i64 0, i32 0, i64 0
%8 = load i64, i64* %7, align 8
@@ -286,51 +248,13 @@ define i32 @sub_U320_without_i128_or(%struct.U320* nocapture dereferenceable(40)
define i32 @sub_U320_usubo(%struct.U320* nocapture dereferenceable(40) %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5) {
; CHECK-LABEL: sub_U320_usubo:
; CHECK: # %bb.0:
; CHECK-NEXT: pushq %r14
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: .cfi_def_cfa_offset 24
; CHECK-NEXT: .cfi_offset %rbx, -24
; CHECK-NEXT: .cfi_offset %r14, -16
; CHECK-NEXT: movq 8(%rdi), %r14
; CHECK-NEXT: movq 16(%rdi), %r10
; CHECK-NEXT: movq 24(%rdi), %r11
; CHECK-NEXT: movq 32(%rdi), %rbx
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: subq %rsi, (%rdi)
; CHECK-NEXT: sbbq %rdx, 8(%rdi)
; CHECK-NEXT: sbbq %rcx, 16(%rdi)
; CHECK-NEXT: sbbq %r8, 24(%rdi)
; CHECK-NEXT: sbbq %r9, 32(%rdi)
; CHECK-NEXT: setb %al
; CHECK-NEXT: subq %rdx, %r14
; CHECK-NEXT: setb %dl
; CHECK-NEXT: subq %rax, %r14
; CHECK-NEXT: setb %al
; CHECK-NEXT: orb %dl, %al
; CHECK-NEXT: movzbl %al, %eax
; CHECK-NEXT: subq %rcx, %r10
; CHECK-NEXT: setb %cl
; CHECK-NEXT: subq %rax, %r10
; CHECK-NEXT: setb %al
; CHECK-NEXT: orb %cl, %al
; CHECK-NEXT: movzbl %al, %eax
; CHECK-NEXT: subq %r8, %r11
; CHECK-NEXT: setb %cl
; CHECK-NEXT: subq %rax, %r11
; CHECK-NEXT: setb %al
; CHECK-NEXT: orb %cl, %al
; CHECK-NEXT: movzbl %al, %eax
; CHECK-NEXT: subq %r9, %rbx
; CHECK-NEXT: setb %cl
; CHECK-NEXT: subq %rax, %rbx
; CHECK-NEXT: setb %al
; CHECK-NEXT: orb %cl, %al
; CHECK-NEXT: movq %r14, 8(%rdi)
; CHECK-NEXT: movq %r10, 16(%rdi)
; CHECK-NEXT: movq %r11, 24(%rdi)
; CHECK-NEXT: movq %rbx, 32(%rdi)
; CHECK-NEXT: movzbl %al, %eax
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: popq %r14
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
%7 = getelementptr inbounds %struct.U320, %struct.U320* %0, i64 0, i32 0, i64 0
%8 = load i64, i64* %7, align 8
@@ -393,22 +317,14 @@ define void @PR39464(%struct.U192* noalias nocapture sret %0, %struct.U192* noca
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: movq (%rsi), %rcx
; CHECK-NEXT: xorl %r9d, %r9d
; CHECK-NEXT: subq (%rdx), %rcx
; CHECK-NEXT: setb %r9b
; CHECK-NEXT: movq %rcx, (%rdi)
; CHECK-NEXT: movq 8(%rsi), %rdi
; CHECK-NEXT: subq 8(%rdx), %rdi
; CHECK-NEXT: setb %r8b
; CHECK-NEXT: subq %r9, %rdi
; CHECK-NEXT: setb %cl
; CHECK-NEXT: orb %r8b, %cl
; CHECK-NEXT: movzbl %cl, %ecx
; CHECK-NEXT: movq %rdi, 8(%rax)
; CHECK-NEXT: movq 16(%rsi), %rsi
; CHECK-NEXT: subq 16(%rdx), %rsi
; CHECK-NEXT: subq %rcx, %rsi
; CHECK-NEXT: movq %rsi, 16(%rax)
; CHECK-NEXT: movq 8(%rsi), %rcx
; CHECK-NEXT: sbbq 8(%rdx), %rcx
; CHECK-NEXT: movq %rcx, 8(%rdi)
; CHECK-NEXT: movq 16(%rsi), %rcx
; CHECK-NEXT: sbbq 16(%rdx), %rcx
; CHECK-NEXT: movq %rcx, 16(%rdi)
; CHECK-NEXT: retq
%4 = getelementptr inbounds %struct.U192, %struct.U192* %1, i64 0, i32 0, i64 0
%5 = load i64, i64* %4, align 8
@@ -454,28 +370,23 @@ define void @sub_U256_without_i128_or_recursive(%uint256* sret %0, %uint256* %1,
; CHECK-LABEL: sub_U256_without_i128_or_recursive:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: movq (%rsi), %r8
; CHECK-NEXT: movq (%rsi), %r9
; CHECK-NEXT: movq 8(%rsi), %r10
; CHECK-NEXT: xorl %ecx, %ecx
; CHECK-NEXT: subq (%rdx), %r8
; CHECK-NEXT: setb %cl
; CHECK-NEXT: subq 8(%rdx), %r10
; CHECK-NEXT: setb %r9b
; CHECK-NEXT: subq %rcx, %r10
; CHECK-NEXT: setb %cl
; CHECK-NEXT: orb %r9b, %cl
; CHECK-NEXT: movq 16(%rsi), %rdi
; CHECK-NEXT: subq (%rdx), %r9
; CHECK-NEXT: sbbq 8(%rdx), %r10
; CHECK-NEXT: setb %r8b
; CHECK-NEXT: movq 16(%rsi), %rcx
; CHECK-NEXT: movq 24(%rsi), %rsi
; CHECK-NEXT: xorl %r9d, %r9d
; CHECK-NEXT: subq 16(%rdx), %rdi
; CHECK-NEXT: setb %r9b
; CHECK-NEXT: xorl %edi, %edi
; CHECK-NEXT: subq 16(%rdx), %rcx
; CHECK-NEXT: setb %dil
; CHECK-NEXT: subq 24(%rdx), %rsi
; CHECK-NEXT: movzbl %cl, %ecx
; CHECK-NEXT: subq %rcx, %rdi
; CHECK-NEXT: sbbq %r9, %rsi
; CHECK-NEXT: movq %r8, (%rax)
; CHECK-NEXT: movzbl %r8b, %edx
; CHECK-NEXT: subq %rdx, %rcx
; CHECK-NEXT: sbbq %rdi, %rsi
; CHECK-NEXT: movq %r9, (%rax)
; CHECK-NEXT: movq %r10, 8(%rax)
; CHECK-NEXT: movq %rdi, 16(%rax)
; CHECK-NEXT: movq %rcx, 16(%rax)
; CHECK-NEXT: movq %rsi, 24(%rax)
; CHECK-NEXT: retq
%4 = getelementptr inbounds %uint256, %uint256* %1, i64 0, i32 0, i32 0