
[x86] allow 8-bit adds to be promoted by convertToThreeAddress() to form LEA

This extends the code that handles 16-bit add promotion to form LEA so that it also
allows 8-bit adds. That lets us combine add ops with the surrounding register moves
and save some instructions. This is another step towards allowing add truncation in
the generic DAGCombiner (see D54640).

Differential Revision: https://reviews.llvm.org/D55494

llvm-svn: 348946
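To make the effect concrete, here is a minimal before/after taken from the first test diff below (the registers are simply what the allocator picked there, and the "# kill" lines in the checks are liveness annotations, not instructions). Previously an i8 add needed a register move plus an 8-bit add:

    movl %esi, %eax
    addb %dil, %al

With this change the operands are promoted to their 64-bit super-registers, so the move and the add fold into a single three-address LEA:

    leal (%rsi,%rdi), %eax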
Sanjay Patel 2018-12-12 17:58:27 +00:00
parent 6d994471cc
commit cfc854315e
14 changed files with 70 additions and 57 deletions

View File

@@ -913,8 +913,8 @@ multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
let Defs = [EFLAGS] in {
let Constraints = "$src1 = $dst" in {
let isCommutable = CommutableRR in {
def NAME#8rr : BinOpRR_RF<BaseOpc, mnemonic, Xi8 , opnodeflag>;
let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
def NAME#8rr : BinOpRR_RF<BaseOpc, mnemonic, Xi8 , opnodeflag>;
def NAME#16rr : BinOpRR_RF<BaseOpc, mnemonic, Xi16, opnodeflag>;
def NAME#32rr : BinOpRR_RF<BaseOpc, mnemonic, Xi32, opnodeflag>;
def NAME#64rr : BinOpRR_RF<BaseOpc, mnemonic, Xi64, opnodeflag>;
@@ -931,9 +931,9 @@ multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
def NAME#32rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi32, opnodeflag>;
def NAME#64rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi64, opnodeflag>;
let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
def NAME#8ri : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM>;
let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
// NOTE: These are order specific, we want the ri8 forms to be listed
// first so that they are slightly preferred to the ri forms.
def NAME#16ri8 : BinOpRI8_RF<0x82, mnemonic, Xi16, opnodeflag, RegMRM>;

View File

@@ -797,6 +797,13 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(
unsigned MIOpc, MachineFunction::iterator &MFI, MachineInstr &MI,
LiveVariables *LV) const {
// We handle 8-bit adds and various 16-bit opcodes in the switch below.
bool Is16BitOp = !(MIOpc == X86::ADD8rr || MIOpc == X86::ADD8ri);
MachineRegisterInfo &RegInfo = MFI->getParent()->getRegInfo();
assert((!Is16BitOp || RegInfo.getTargetRegisterInfo()->getRegSizeInBits(
*RegInfo.getRegClass(MI.getOperand(0).getReg())) == 16) &&
"Unexpected type for LEA transform");
// TODO: For a 32-bit target, we need to adjust the LEA variables with
// something like this:
// Opcode = X86::LEA32r;
@@ -807,13 +814,12 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(
if (!Subtarget.is64Bit())
return nullptr;
MachineRegisterInfo &RegInfo = MFI->getParent()->getRegInfo();
unsigned Opcode = X86::LEA64_32r;
unsigned InRegLEA = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
unsigned OutRegLEA = RegInfo.createVirtualRegister(&X86::GR32RegClass);
// Build and insert into an implicit UNDEF value. This is OK because
// we will be shifting and then extracting the lower 16-bits.
// we will be shifting and then extracting the lower 8/16-bits.
// This has the potential to cause partial register stall. e.g.
// movw (%rbp,%rcx,2), %dx
// leal -65(%rdx), %esi
@@ -824,11 +830,12 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(
unsigned Src = MI.getOperand(1).getReg();
bool IsDead = MI.getOperand(0).isDead();
bool IsKill = MI.getOperand(1).isKill();
unsigned SubReg = Is16BitOp ? X86::sub_16bit : X86::sub_8bit;
assert(!MI.getOperand(1).isUndef() && "Undef op doesn't need optimization");
BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(X86::IMPLICIT_DEF), InRegLEA);
MachineInstr *InsMI =
BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY))
.addReg(InRegLEA, RegState::Define, X86::sub_16bit)
.addReg(InRegLEA, RegState::Define, SubReg)
.addReg(Src, getKillRegState(IsKill));
MachineInstrBuilder MIB =
@@ -847,12 +854,14 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(
case X86::DEC16r:
addRegOffset(MIB, InRegLEA, true, -1);
break;
case X86::ADD8ri:
case X86::ADD16ri:
case X86::ADD16ri8:
case X86::ADD16ri_DB:
case X86::ADD16ri8_DB:
addRegOffset(MIB, InRegLEA, true, MI.getOperand(2).getImm());
break;
case X86::ADD8rr:
case X86::ADD16rr:
case X86::ADD16rr_DB: {
unsigned Src2 = MI.getOperand(2).getReg();
@@ -861,7 +870,7 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(
unsigned InRegLEA2 = 0;
MachineInstr *InsMI2 = nullptr;
if (Src == Src2) {
// ADD16rr killed %reg1028, %reg1028
// ADD8rr/ADD16rr killed %reg1028, %reg1028
// just a single insert_subreg.
addRegReg(MIB, InRegLEA, true, InRegLEA, false);
} else {
@@ -870,10 +879,10 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(
else
InRegLEA2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
// Build and insert into an implicit UNDEF value. This is OK because
// we will be shifting and then extracting the lower 16-bits.
// we will be shifting and then extracting the lower 8/16-bits.
BuildMI(*MFI, &*MIB, MI.getDebugLoc(), get(X86::IMPLICIT_DEF), InRegLEA2);
InsMI2 = BuildMI(*MFI, &*MIB, MI.getDebugLoc(), get(TargetOpcode::COPY))
.addReg(InRegLEA2, RegState::Define, X86::sub_16bit)
.addReg(InRegLEA2, RegState::Define, SubReg)
.addReg(Src2, getKillRegState(IsKill2));
addRegReg(MIB, InRegLEA, true, InRegLEA2, true);
}
@@ -887,7 +896,7 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(
MachineInstr *ExtMI =
BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY))
.addReg(Dest, RegState::Define | getDeadRegState(IsDead))
.addReg(OutRegLEA, RegState::Kill, X86::sub_16bit);
.addReg(OutRegLEA, RegState::Kill, SubReg);
if (LV) {
// Update live variables.
@@ -1084,6 +1093,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
LV->replaceKillInstruction(SrcReg2, MI, *NewMI);
break;
}
case X86::ADD8rr:
case X86::ADD16rr:
case X86::ADD16rr_DB:
return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV);
@@ -1119,6 +1129,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
NewMI = addOffset(MIB, MI.getOperand(2));
break;
}
case X86::ADD8ri:
case X86::ADD16ri:
case X86::ADD16ri8:
case X86::ADD16ri_DB:

View File

@@ -584,9 +584,9 @@ protected:
const MachineOperand *&Destination) const override;
private:
/// This is a helper for convertToThreeAddress for 16-bit instructions.
/// This is a helper for convertToThreeAddress for 8- and 16-bit instructions.
/// We use 32-bit LEA to form 3-address code by promoting to a 32-bit
/// super-register and then truncating back down to a 16-bit sub-register.
/// super-register and then truncating back down to an 8/16-bit sub-register.
MachineInstr *convertToThreeAddressWithLEA(unsigned MIOpc,
MachineFunction::iterator &MFI,
MachineInstr &MI,
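In MIR terms, the helper now builds the same sequence for 8-bit operands as for 16-bit ones, differing only in the sub-register index (sub_8bit vs. sub_16bit). A rough sketch for the immediate form (ADD8ri), mirroring the BuildMI calls in the X86InstrInfo.cpp hunk above rather than verbatim compiler output, with illustrative virtual register names:

    %InRegLEA:gr64_nosp = IMPLICIT_DEF                             ; OK: only the low bits are used
    %InRegLEA.sub_8bit = COPY %Src                                 ; insert the 8-bit operand
    %OutRegLEA:gr32 = LEA64_32r %InRegLEA, 1, $noreg, Imm, $noreg  ; the add, as a 3-address LEA
    %Dest:gr8 = COPY %OutRegLEA.sub_8bit                           ; truncate back down to 8 bits

The register-register forms (ADD8rr/ADD16rr) insert the second source into its own promoted register the same way and use it as the LEA index operand.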

View File

@@ -57,8 +57,9 @@ define i16 @test_add_i16(i16 %arg1, i16 %arg2) {
define i8 @test_add_i8(i8 %arg1, i8 %arg2) {
; X64-LABEL: test_add_i8:
; X64: # %bb.0:
; X64-NEXT: movl %esi, %eax
; X64-NEXT: addb %dil, %al
; X64-NEXT: # kill: def $edi killed $edi def $rdi
; X64-NEXT: # kill: def $esi killed $esi def $rsi
; X64-NEXT: leal (%rsi,%rdi), %eax
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
;

View File

@@ -11,8 +11,9 @@ define i16 @test_shl_i4(i16 %v, i16 %a, i16 %b) {
; X64-LABEL: test_shl_i4:
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: movl %edx, %ecx
; X64-NEXT: addb %sil, %cl
; X64-NEXT: # kill: def $esi killed $esi def $rsi
; X64-NEXT: # kill: def $edx killed $edx def $rdx
; X64-NEXT: leal (%rdx,%rsi), %ecx
; X64-NEXT: andb $15, %cl
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NEXT: shlb %cl, %al

View File

@@ -147,8 +147,8 @@ define i8 @test_shl_i8_imm(i32 %arg1) {
define i8 @test_shl_i8_imm1(i32 %arg1) {
; X64-LABEL: test_shl_i8_imm1:
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: addb %al, %al
; X64-NEXT: # kill: def $edi killed $edi def $rdi
; X64-NEXT: leal (%rdi,%rdi), %eax
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
%a = trunc i32 %arg1 to i8

View File

@@ -43,9 +43,10 @@ define i16 @test_movw(i16 %a0) {
define i8 @test_movb_hreg(i16 %a0) {
; X64-LABEL: test_movb_hreg:
; X64: # %bb.0:
; X64-NEXT: # kill: def $edi killed $edi def $rdi
; X64-NEXT: movl %edi, %eax
; X64-NEXT: shrl $8, %eax
; X64-NEXT: addb %dil, %al
; X64-NEXT: leal (%rax,%rdi), %eax
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
;

View File

@@ -358,9 +358,9 @@ define i8 @const_shift_i8(i8 %x, i8 %y) nounwind {
;
; X64-LABEL: const_shift_i8:
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: # kill: def $edi killed $edi def $rdi
; X64-NEXT: shrb $7, %sil
; X64-NEXT: addb %al, %al
; X64-NEXT: leal (%rdi,%rdi), %eax
; X64-NEXT: orb %sil, %al
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq

View File

@@ -21,10 +21,10 @@ define i8 @test_i8(i8 %a) nounwind {
;
; X64-LABEL: test_i8:
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: movl %eax, %ecx
; X64-NEXT: # kill: def $edi killed $edi def $rdi
; X64-NEXT: movl %edi, %ecx
; X64-NEXT: sarb $7, %cl
; X64-NEXT: addb %cl, %al
; X64-NEXT: leal (%rdi,%rcx), %eax
; X64-NEXT: xorb %cl, %al
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq

View File

@@ -14,8 +14,8 @@ define i8 @test_mul_by_1(i8 %x) {
define i8 @test_mul_by_2(i8 %x) {
; X64-LABEL: test_mul_by_2:
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: addb %al, %al
; X64-NEXT: # kill: def $edi killed $edi def $rdi
; X64-NEXT: leal (%rdi,%rdi), %eax
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
%m = mul i8 %x, 2

View File

@@ -25,6 +25,7 @@ define i8 @cnt8(i8 %x) nounwind readnone {
;
; X64-LABEL: cnt8:
; X64: # %bb.0:
; X64-NEXT: # kill: def $edi killed $edi def $rdi
; X64-NEXT: movl %edi, %eax
; X64-NEXT: shrb %al
; X64-NEXT: andb $85, %al
@@ -36,8 +37,9 @@ define i8 @cnt8(i8 %x) nounwind readnone {
; X64-NEXT: addb %al, %dil
; X64-NEXT: movl %edi, %eax
; X64-NEXT: shrb $4, %al
; X64-NEXT: addb %dil, %al
; X64-NEXT: leal (%rax,%rdi), %eax
; X64-NEXT: andb $15, %al
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
;
; X32-POPCNT-LABEL: cnt8:

View File

@@ -4,8 +4,8 @@
define i2 @f(i32 %arg) {
; CHECK-LABEL: f:
; CHECK: # %bb.0:
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: addb %al, %al
; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
; CHECK-NEXT: leal (%rdi,%rdi), %eax
; CHECK-NEXT: orb $1, %al
; CHECK-NEXT: # kill: def $al killed $al killed $eax
; CHECK-NEXT: retq

View File

@@ -642,9 +642,9 @@ define i32 @rotate_demanded_bits_3(i32, i32) {
;
; X64-LABEL: rotate_demanded_bits_3:
; X64: # %bb.0:
; X64-NEXT: movl %esi, %ecx
; X64-NEXT: # kill: def $esi killed $esi def $rsi
; X64-NEXT: movl %edi, %eax
; X64-NEXT: addb %cl, %cl
; X64-NEXT: leal (%rsi,%rsi), %ecx
; X64-NEXT: andb $30, %cl
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NEXT: roll %cl, %eax

View File

@@ -19,29 +19,28 @@ define i256 @test1(i256 %a) nounwind {
; ILP-NEXT: incl %esi
; ILP-NEXT: addb %sil, %sil
; ILP-NEXT: orb $1, %sil
; ILP-NEXT: movl $1, %r9d
; ILP-NEXT: movl $1, %r10d
; ILP-NEXT: xorl %r14d, %r14d
; ILP-NEXT: movl %esi, %ecx
; ILP-NEXT: shldq %cl, %r9, %r14
; ILP-NEXT: shldq %cl, %r10, %r14
; ILP-NEXT: movl $1, %edx
; ILP-NEXT: shlq %cl, %rdx
; ILP-NEXT: movl %esi, %r11d
; ILP-NEXT: addb $-128, %r11b
; ILP-NEXT: movb $-128, %r10b
; ILP-NEXT: leal -128(%rsi), %r9d
; ILP-NEXT: movb $-128, %r11b
; ILP-NEXT: xorl %ebx, %ebx
; ILP-NEXT: movl %r11d, %ecx
; ILP-NEXT: shldq %cl, %r9, %rbx
; ILP-NEXT: movl %r9d, %ecx
; ILP-NEXT: shldq %cl, %r10, %rbx
; ILP-NEXT: testb $64, %sil
; ILP-NEXT: cmovneq %rdx, %r14
; ILP-NEXT: cmovneq %r8, %rdx
; ILP-NEXT: movl $1, %edi
; ILP-NEXT: shlq %cl, %rdi
; ILP-NEXT: subb %sil, %r10b
; ILP-NEXT: movl %r10d, %ecx
; ILP-NEXT: shrdq %cl, %r8, %r9
; ILP-NEXT: testb $64, %r10b
; ILP-NEXT: cmovneq %r8, %r9
; ILP-NEXT: subb %sil, %r11b
; ILP-NEXT: movl %r11d, %ecx
; ILP-NEXT: shrdq %cl, %r8, %r10
; ILP-NEXT: testb $64, %r11b
; ILP-NEXT: cmovneq %r8, %r10
; ILP-NEXT: testb $64, %r9b
; ILP-NEXT: cmovneq %rdi, %rbx
; ILP-NEXT: cmovneq %r8, %rdi
; ILP-NEXT: testb %sil, %sil
@@ -52,7 +51,7 @@ define i256 @test1(i256 %a) nounwind {
; ILP-NEXT: cmovnsq %r8, %rbx
; ILP-NEXT: cmoveq %r8, %rbx
; ILP-NEXT: movq %rbx, 24(%rax)
; ILP-NEXT: cmovnsq %r9, %rdi
; ILP-NEXT: cmovnsq %r10, %rdi
; ILP-NEXT: cmoveq %r8, %rdi
; ILP-NEXT: movq %rdi, 16(%rax)
; ILP-NEXT: popq %rbx
@@ -76,7 +75,7 @@ define i256 @test1(i256 %a) nounwind {
; HYBRID-NEXT: xorl %r10d, %r10d
; HYBRID-NEXT: movl %esi, %ecx
; HYBRID-NEXT: shldq %cl, %r11, %r10
; HYBRID-NEXT: addb $-128, %cl
; HYBRID-NEXT: leal -128(%rsi), %ecx
; HYBRID-NEXT: xorl %edi, %edi
; HYBRID-NEXT: shldq %cl, %r11, %rdi
; HYBRID-NEXT: movl $1, %edx
@@ -119,7 +118,7 @@ define i256 @test1(i256 %a) nounwind {
; BURR-NEXT: xorl %r10d, %r10d
; BURR-NEXT: movl %esi, %ecx
; BURR-NEXT: shldq %cl, %r11, %r10
; BURR-NEXT: addb $-128, %cl
; BURR-NEXT: leal -128(%rsi), %ecx
; BURR-NEXT: xorl %edi, %edi
; BURR-NEXT: shldq %cl, %r11, %rdi
; BURR-NEXT: movl $1, %edx
@@ -160,8 +159,7 @@ define i256 @test1(i256 %a) nounwind {
; SRC-NEXT: shrdq %cl, %r8, %r10
; SRC-NEXT: testb $64, %cl
; SRC-NEXT: cmovneq %r8, %r10
; SRC-NEXT: movl %esi, %r9d
; SRC-NEXT: addb $-128, %r9b
; SRC-NEXT: leal -128(%rsi), %r9d
; SRC-NEXT: xorl %edx, %edx
; SRC-NEXT: movl %r9d, %ecx
; SRC-NEXT: shldq %cl, %rdi, %rdx
@@ -215,13 +213,12 @@ define i256 @test1(i256 %a) nounwind {
; LIN-NEXT: cmovneq %rdx, %rdi
; LIN-NEXT: cmovsq %r9, %rdi
; LIN-NEXT: movq %rdi, 8(%rax)
; LIN-NEXT: movl %esi, %edx
; LIN-NEXT: addb $-128, %dl
; LIN-NEXT: movl $1, %r10d
; LIN-NEXT: movl %edx, %ecx
; LIN-NEXT: shlq %cl, %r10
; LIN-NEXT: testb $64, %dl
; LIN-NEXT: movq %r10, %rdi
; LIN-NEXT: leal -128(%rsi), %r10d
; LIN-NEXT: movl $1, %edx
; LIN-NEXT: movl %r10d, %ecx
; LIN-NEXT: shlq %cl, %rdx
; LIN-NEXT: testb $64, %r10b
; LIN-NEXT: movq %rdx, %rdi
; LIN-NEXT: cmovneq %r9, %rdi
; LIN-NEXT: movb $-128, %cl
; LIN-NEXT: subb %sil, %cl
@@ -233,9 +230,9 @@ define i256 @test1(i256 %a) nounwind {
; LIN-NEXT: cmoveq %r9, %rsi
; LIN-NEXT: movq %rsi, 16(%rax)
; LIN-NEXT: xorl %esi, %esi
; LIN-NEXT: movl %edx, %ecx
; LIN-NEXT: movl %r10d, %ecx
; LIN-NEXT: shldq %cl, %r8, %rsi
; LIN-NEXT: cmovneq %r10, %rsi
; LIN-NEXT: cmovneq %rdx, %rsi
; LIN-NEXT: cmovnsq %r9, %rsi
; LIN-NEXT: cmoveq %r9, %rsi
; LIN-NEXT: movq %rsi, 24(%rax)