
Implement x86 h-register extract support.

- Add patterns for h-register extract, which avoid a shift and mask,
  and in some cases a temporary register.
- Add address-mode matching for turning (X>>(8-n))&(255<<n), where
  n is a valid address-mode scale value, into an h-register extract
  and a scaled-offset address (see the sketch after this list).
- Replace X86's MOV32to32_ and related instructions with the new
  target-independent COPY_TO_SUBCLASS instruction.
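
As an aside (not part of this change): the address-mode transform rests on the
identity (X>>(8-n)) & (0xff<<n) == ((X>>8) & 0xff) << n, i.e. the masked value
is just byte 1 of X, and the remaining shift by n folds into an addressing-mode
scale of 2^n. A minimal standalone check in C++ (all names here are for
illustration only):

  #include <cassert>
  #include <cstdint>

  int main() {
    // n = 1, 2, 3 correspond to the legal x86 address-mode scales 2, 4, 8.
    for (unsigned n = 1; n <= 3; ++n)
      for (uint32_t x = 0; x < 0x40000; x += 111) {
        uint32_t shiftAndMask = (x >> (8 - n)) & (0xffu << n); // original form
        uint32_t hRegScaled   = ((x >> 8) & 0xffu) << n;       // byte 1, scaled by 2^n
        assert(shiftAndMask == hRegScaled);
      }
    return 0;
  }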

On x86-64 there are complicated constraints on h registers, and
CodeGen doesn't currently provide a high-level way to express all of them,
so they are handled with a bunch of special code. This code currently only
supports extracts where the result is used by a zero-extend or a store,
though these are fairly common.

These transformations are not always beneficial; since there are only
4 h registers, they sometimes require extra move instructions, and
this sometimes increases register pressure because it can force out
values that would otherwise be in one of those registers. However,
this appears to be relatively uncommon.

llvm-svn: 68962
Dan Gohman 2009-04-13 16:09:41 +00:00
parent 3873cb7a36
commit be7227005f
12 changed files with 540 additions and 100 deletions

View File

@ -997,7 +997,7 @@ bool X86FastISel::X86SelectTrunc(Instruction *I) {
return false;
// First issue a copy to GR16_ or GR32_.
unsigned CopyOpc = (SrcVT == MVT::i16) ? X86::MOV16to16_ : X86::MOV32to32_;
unsigned CopyOpc = (SrcVT == MVT::i16) ? X86::MOV16rr : X86::MOV32rr;
const TargetRegisterClass *CopyRC = (SrcVT == MVT::i16)
? X86::GR16_RegisterClass : X86::GR32_RegisterClass;
unsigned CopyReg = createResultReg(CopyRC);

View File

@ -1019,21 +1019,69 @@ bool X86DAGToDAGISel::MatchAddress(SDValue N, X86ISelAddressMode &AM,
break;
case ISD::AND: {
// Handle "(x << C1) & C2" as "(X & (C2>>C1)) << C1" if safe and if this
// allows us to fold the shift into this addressing mode.
// Perform some heroic transforms on an and of a constant-count shift
// with a constant to enable use of the scaled offset field.
SDValue Shift = N.getOperand(0);
if (Shift.getOpcode() != ISD::SHL) break;
if (Shift.getNumOperands() != 2) break;
// Scale must not be used already.
if (AM.IndexReg.getNode() != 0 || AM.Scale != 1) break;
// Not when RIP is used as the base.
if (AM.isRIPRel) break;
SDValue X = Shift.getOperand(0);
ConstantSDNode *C2 = dyn_cast<ConstantSDNode>(N.getOperand(1));
ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
if (!C1 || !C2) break;
// Handle "(X >> (8-C1)) & C2" as "(X >> 8) & 0xff)" if safe. This
// allows us to convert the shift and and into an h-register extract and
// a scaled index.
if (Shift.getOpcode() == ISD::SRL && Shift.hasOneUse()) {
unsigned ScaleLog = 8 - C1->getZExtValue();
if (ScaleLog > 0 && ScaleLog < 64 &&
C2->getZExtValue() == (UINT64_C(0xff) << ScaleLog)) {
SDValue Eight = CurDAG->getConstant(8, MVT::i8);
SDValue Mask = CurDAG->getConstant(0xff, N.getValueType());
SDValue Srl = CurDAG->getNode(ISD::SRL, dl, N.getValueType(),
X, Eight);
SDValue And = CurDAG->getNode(ISD::AND, dl, N.getValueType(),
Srl, Mask);
// Insert the new nodes into the topological ordering.
if (Eight.getNode()->getNodeId() == -1 ||
Eight.getNode()->getNodeId() > X.getNode()->getNodeId()) {
CurDAG->RepositionNode(X.getNode(), Eight.getNode());
Eight.getNode()->setNodeId(X.getNode()->getNodeId());
}
if (Mask.getNode()->getNodeId() == -1 ||
Mask.getNode()->getNodeId() > X.getNode()->getNodeId()) {
CurDAG->RepositionNode(X.getNode(), Mask.getNode());
Mask.getNode()->setNodeId(X.getNode()->getNodeId());
}
if (Srl.getNode()->getNodeId() == -1 ||
Srl.getNode()->getNodeId() > Shift.getNode()->getNodeId()) {
CurDAG->RepositionNode(Shift.getNode(), Srl.getNode());
Srl.getNode()->setNodeId(Shift.getNode()->getNodeId());
}
if (And.getNode()->getNodeId() == -1 ||
And.getNode()->getNodeId() > N.getNode()->getNodeId()) {
CurDAG->RepositionNode(N.getNode(), And.getNode());
And.getNode()->setNodeId(N.getNode()->getNodeId());
}
CurDAG->ReplaceAllUsesWith(N, And);
AM.IndexReg = And;
AM.Scale = (1 << ScaleLog);
return false;
}
}
// Handle "(X << C1) & C2" as "(X & (C2>>C1)) << C1" if safe and if this
// allows us to fold the shift into this addressing mode.
if (Shift.getOpcode() != ISD::SHL) break;
// Not likely to be profitable if either the AND or SHIFT node has more
// than one use (unless all uses are for address computation). Besides,
// isel mechanism requires their node ids to be reused.
@ -1046,7 +1094,6 @@ bool X86DAGToDAGISel::MatchAddress(SDValue N, X86ISelAddressMode &AM,
break;
// Get the new AND mask, this folds to a constant.
SDValue X = Shift.getOperand(0);
SDValue NewANDMask = CurDAG->getNode(ISD::SRL, dl, N.getValueType(),
SDValue(C2, 0), SDValue(C1, 0));
SDValue NewAND = CurDAG->getNode(ISD::AND, dl, N.getValueType(), X,
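
As an aside (not part of this change): the new SRL handling in the MatchAddress
hunk above boils down to the following qualification check, sketched here as a
standalone C++ helper with a name chosen for illustration; the real code also
requires a free index register and a non-RIP-relative base, and repositions the
new nodes in the DAG's topological ordering.

  #include <cstdint>

  // Given the constants from "(X >> C1) & C2", return the addressing-mode
  // scale if the pair can be rewritten as an index of ((X >> 8) & 0xff),
  // or 0 if the fold does not apply.  Sketch only.
  static uint64_t matchHRegIndexScale(uint64_t C1, uint64_t C2) {
    unsigned ScaleLog = 8 - static_cast<unsigned>(C1); // wraps to a large value if C1 > 8
    if (ScaleLog == 0 || ScaleLog >= 64)               // same bounds check as the hunk above
      return 0;
    if (C2 != (UINT64_C(0xff) << ScaleLog))            // mask must be 0xff shifted by ScaleLog
      return 0;
    return UINT64_C(1) << ScaleLog;                    // value used for AM.Scale
  }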

View File

@ -1522,7 +1522,7 @@ def : Pat<(store (add (loadi64 addr:$dst), 0x00000000800000000), addr:$dst),
// r & (2^32-1) ==> movz
def : Pat<(and GR64:$src, 0x00000000FFFFFFFF),
(MOVZX64rr32 (i32 (EXTRACT_SUBREG GR64:$src, x86_subreg_32bit)))>;
(MOVZX64rr32 (EXTRACT_SUBREG GR64:$src, x86_subreg_32bit))>;
// r & (2^16-1) ==> movz
def : Pat<(and GR64:$src, 0xffff),
(MOVZX64rr16 (i16 (EXTRACT_SUBREG GR64:$src, x86_subreg_16bit)))>;
@ -1531,7 +1531,7 @@ def : Pat<(and GR64:$src, 0xff),
(MOVZX64rr8 (i8 (EXTRACT_SUBREG GR64:$src, x86_subreg_8bit)))>;
// r & (2^8-1) ==> movz
def : Pat<(and GR32:$src1, 0xff),
(MOVZX32rr8 (i8 (EXTRACT_SUBREG GR32:$src1, x86_subreg_8bit)))>,
(MOVZX32rr8 (EXTRACT_SUBREG GR32:$src1, x86_subreg_8bit))>,
Requires<[In64BitMode]>;
// r & (2^8-1) ==> movz
def : Pat<(and GR16:$src1, 0xff),
@ -1540,13 +1540,13 @@ def : Pat<(and GR16:$src1, 0xff),
// sext_inreg patterns
def : Pat<(sext_inreg GR64:$src, i32),
(MOVSX64rr32 (i32 (EXTRACT_SUBREG GR64:$src, x86_subreg_32bit)))>;
(MOVSX64rr32 (EXTRACT_SUBREG GR64:$src, x86_subreg_32bit))>;
def : Pat<(sext_inreg GR64:$src, i16),
(MOVSX64rr16 (i16 (EXTRACT_SUBREG GR64:$src, x86_subreg_16bit)))>;
(MOVSX64rr16 (EXTRACT_SUBREG GR64:$src, x86_subreg_16bit))>;
def : Pat<(sext_inreg GR64:$src, i8),
(MOVSX64rr8 (i8 (EXTRACT_SUBREG GR64:$src, x86_subreg_8bit)))>;
(MOVSX64rr8 (EXTRACT_SUBREG GR64:$src, x86_subreg_8bit))>;
def : Pat<(sext_inreg GR32:$src, i8),
(MOVSX32rr8 (i8 (EXTRACT_SUBREG GR32:$src, x86_subreg_8bit)))>,
(MOVSX32rr8 (EXTRACT_SUBREG GR32:$src, x86_subreg_8bit))>,
Requires<[In64BitMode]>;
def : Pat<(sext_inreg GR16:$src, i8),
(MOVSX16rr8 (i8 (EXTRACT_SUBREG GR16:$src, x86_subreg_8bit)))>,
@ -1554,16 +1554,63 @@ def : Pat<(sext_inreg GR16:$src, i8),
// trunc patterns
def : Pat<(i32 (trunc GR64:$src)),
(i32 (EXTRACT_SUBREG GR64:$src, x86_subreg_32bit))>;
(EXTRACT_SUBREG GR64:$src, x86_subreg_32bit)>;
def : Pat<(i16 (trunc GR64:$src)),
(i16 (EXTRACT_SUBREG GR64:$src, x86_subreg_16bit))>;
(EXTRACT_SUBREG GR64:$src, x86_subreg_16bit)>;
def : Pat<(i8 (trunc GR64:$src)),
(i8 (EXTRACT_SUBREG GR64:$src, x86_subreg_8bit))>;
(EXTRACT_SUBREG GR64:$src, x86_subreg_8bit)>;
def : Pat<(i8 (trunc GR32:$src)),
(i8 (EXTRACT_SUBREG GR32:$src, x86_subreg_8bit))>,
(EXTRACT_SUBREG GR32:$src, x86_subreg_8bit)>,
Requires<[In64BitMode]>;
def : Pat<(i8 (trunc GR16:$src)),
(i8 (EXTRACT_SUBREG GR16:$src, x86_subreg_8bit))>,
(EXTRACT_SUBREG GR16:$src, x86_subreg_8bit)>,
Requires<[In64BitMode]>;
// h-register tricks.
// For now, be conservative and only the extract if the value is immediately
// zero-extended or stored, which are somewhat common cases. This uses a bunch
// of code to prevent a register requiring a REX prefix from being allocated in
// the same instruction as the h register, as there's currently no way to
// describe this requirement to the register allocator.
// h-register extract and zero-extend.
def : Pat<(and (srl_su GR64:$src, (i8 8)), (i64 255)),
(SUBREG_TO_REG
(i64 0),
(MOVZX32_NOREXrr8
(EXTRACT_SUBREG (COPY_TO_SUBCLASS GR64:$src, GR64_),
x86_subreg_8bit_hi)),
x86_subreg_32bit)>;
def : Pat<(and (srl_su GR32:$src, (i8 8)), (i32 255)),
(MOVZX32_NOREXrr8
(EXTRACT_SUBREG (COPY_TO_SUBCLASS GR32:$src, GR32_),
x86_subreg_8bit_hi))>,
Requires<[In64BitMode]>;
def : Pat<(srl_su GR16:$src, (i8 8)),
(EXTRACT_SUBREG
(MOVZX32_NOREXrr8
(EXTRACT_SUBREG (COPY_TO_SUBCLASS GR16:$src, GR16_),
x86_subreg_8bit_hi)),
x86_subreg_16bit)>,
Requires<[In64BitMode]>;
// h-register extract and store.
def : Pat<(store (i8 (trunc_su (srl_su GR64:$src, (i8 8)))), addr:$dst),
(MOV8mr_NOREX
addr:$dst,
(EXTRACT_SUBREG (COPY_TO_SUBCLASS GR64:$src, GR64_),
x86_subreg_8bit_hi))>;
def : Pat<(store (i8 (trunc_su (srl_su GR32:$src, (i8 8)))), addr:$dst),
(MOV8mr_NOREX
addr:$dst,
(EXTRACT_SUBREG (COPY_TO_SUBCLASS GR32:$src, GR32_),
x86_subreg_8bit_hi))>,
Requires<[In64BitMode]>;
def : Pat<(store (i8 (trunc_su (srl_su GR16:$src, (i8 8)))), addr:$dst),
(MOV8mr_NOREX
addr:$dst,
(EXTRACT_SUBREG (COPY_TO_SUBCLASS GR16:$src, GR16_),
x86_subreg_8bit_hi))>,
Requires<[In64BitMode]>;
// (shl x, 1) ==> (add x, x)
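
As an aside (not part of this change): in source terms, the zero-extend and
store patterns above target code like the following hypothetical C++ functions.
When the operand is allocated to RAX/RBX/RCX/RDX, byte 1 already sits in an
h register, which is why the tests later in this commit expect a single movzbl
or movb from %ah/%bh/%ch/%dh instead of a shift and mask.

  #include <cstdint>

  // The kind of code the "h-register extract and zero-extend" pattern targets:
  // (and (srl x, 8), 255) on a 64-bit value.
  uint64_t highByte(uint64_t x) {
    return (x >> 8) & 0xff;
  }

  // The kind of code the "h-register extract and store" pattern targets:
  // a truncating store of (x >> 8), emitted through MOV8mr_NOREX.
  void storeHighByte(uint64_t x, uint8_t *p) {
    *p = static_cast<uint8_t>(x >> 8);
  }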

View File

@ -258,10 +258,8 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::JMP64r, X86::JMP64m, 1 },
{ X86::MOV16ri, X86::MOV16mi, 0 },
{ X86::MOV16rr, X86::MOV16mr, 0 },
{ X86::MOV16to16_, X86::MOV16_mr, 0 },
{ X86::MOV32ri, X86::MOV32mi, 0 },
{ X86::MOV32rr, X86::MOV32mr, 0 },
{ X86::MOV32to32_, X86::MOV32_mr, 0 },
{ X86::MOV64ri32, X86::MOV64mi32, 0 },
{ X86::MOV64rr, X86::MOV64mr, 0 },
{ X86::MOV8ri, X86::MOV8mi, 0 },
@ -372,9 +370,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::Int_UCOMISDrr, X86::Int_UCOMISDrm },
{ X86::Int_UCOMISSrr, X86::Int_UCOMISSrm },
{ X86::MOV16rr, X86::MOV16rm },
{ X86::MOV16to16_, X86::MOV16_rm },
{ X86::MOV32rr, X86::MOV32rm },
{ X86::MOV32to32_, X86::MOV32_rm },
{ X86::MOV64rr, X86::MOV64rm },
{ X86::MOV64toPQIrr, X86::MOVQI2PQIrm },
{ X86::MOV64toSDrr, X86::MOV64toSDrm },
@ -404,6 +400,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::MOVZPQILo2PQIrr, X86::MOVZPQILo2PQIrm },
{ X86::MOVZX16rr8, X86::MOVZX16rm8 },
{ X86::MOVZX32rr16, X86::MOVZX32rm16 },
{ X86::MOVZX32_NOREXrr8, X86::MOVZX32_NOREXrm8 },
{ X86::MOVZX32rr8, X86::MOVZX32rm8 },
{ X86::MOVZX64rr16, X86::MOVZX64rm16 },
{ X86::MOVZX64rr32, X86::MOVZX64rm32 },
@ -672,8 +669,6 @@ bool X86InstrInfo::isMoveInstr(const MachineInstr& MI,
case X86::MOV16rr:
case X86::MOV32rr:
case X86::MOV64rr:
case X86::MOV16to16_:
case X86::MOV32to32_:
case X86::MOVSSrr:
case X86::MOVSDrr:
@ -710,9 +705,7 @@ unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
default: break;
case X86::MOV8rm:
case X86::MOV16rm:
case X86::MOV16_rm:
case X86::MOV32rm:
case X86::MOV32_rm:
case X86::MOV64rm:
case X86::LD_Fp64m:
case X86::MOVSSrm:
@ -741,9 +734,7 @@ unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr *MI,
default: break;
case X86::MOV8mr:
case X86::MOV16mr:
case X86::MOV16_mr:
case X86::MOV32mr:
case X86::MOV32_mr:
case X86::MOV64mr:
case X86::ST_FpP64m:
case X86::MOVSSmr:
@ -795,9 +786,7 @@ X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI) const {
default: break;
case X86::MOV8rm:
case X86::MOV16rm:
case X86::MOV16_rm:
case X86::MOV32rm:
case X86::MOV32_rm:
case X86::MOV64rm:
case X86::LD_Fp64m:
case X86::MOVSSrm:
@ -1670,10 +1659,22 @@ bool X86InstrInfo::copyRegToReg(MachineBasicBlock &MBB,
Opc = X86::MOV16rr;
} else if (DestRC == &X86::GR8RegClass) {
Opc = X86::MOV8rr;
} else if (DestRC == &X86::GR64_RegClass) {
Opc = X86::MOV64rr;
} else if (DestRC == &X86::GR32_RegClass) {
Opc = X86::MOV32_rr;
Opc = X86::MOV32rr;
} else if (DestRC == &X86::GR16_RegClass) {
Opc = X86::MOV16_rr;
Opc = X86::MOV16rr;
} else if (DestRC == &X86::GR8_RegClass) {
Opc = X86::MOV8rr;
} else if (DestRC == &X86::GR64_NOREXRegClass) {
Opc = X86::MOV64rr;
} else if (DestRC == &X86::GR32_NOREXRegClass) {
Opc = X86::MOV32rr;
} else if (DestRC == &X86::GR16_NOREXRegClass) {
Opc = X86::MOV16rr;
} else if (DestRC == &X86::GR8_NOREXRegClass) {
Opc = X86::MOV8rr;
} else if (DestRC == &X86::RFP32RegClass) {
Opc = X86::MOV_Fp3232;
} else if (DestRC == &X86::RFP64RegClass || DestRC == &X86::RSTRegClass) {
@ -1721,7 +1722,7 @@ bool X86InstrInfo::copyRegToReg(MachineBasicBlock &MBB,
return true;
}
}
// Moving from ST(0) turns into FpGET_ST0_32 etc.
if (SrcRC == &X86::RSTRegClass) {
// Copying from ST(0)/ST(1).
@ -1779,10 +1780,22 @@ static unsigned getStoreRegOpcode(const TargetRegisterClass *RC,
Opc = X86::MOV16mr;
} else if (RC == &X86::GR8RegClass) {
Opc = X86::MOV8mr;
} else if (RC == &X86::GR64_RegClass) {
Opc = X86::MOV64mr;
} else if (RC == &X86::GR32_RegClass) {
Opc = X86::MOV32_mr;
Opc = X86::MOV32mr;
} else if (RC == &X86::GR16_RegClass) {
Opc = X86::MOV16_mr;
Opc = X86::MOV16mr;
} else if (RC == &X86::GR8_RegClass) {
Opc = X86::MOV8mr;
} else if (RC == &X86::GR64_NOREXRegClass) {
Opc = X86::MOV64mr;
} else if (RC == &X86::GR32_NOREXRegClass) {
Opc = X86::MOV32mr;
} else if (RC == &X86::GR16_NOREXRegClass) {
Opc = X86::MOV16mr;
} else if (RC == &X86::GR8_NOREXRegClass) {
Opc = X86::MOV8mr;
} else if (RC == &X86::RFP80RegClass) {
Opc = X86::ST_FpP80m; // pops
} else if (RC == &X86::RFP64RegClass) {
@ -1847,10 +1860,22 @@ static unsigned getLoadRegOpcode(const TargetRegisterClass *RC,
Opc = X86::MOV16rm;
} else if (RC == &X86::GR8RegClass) {
Opc = X86::MOV8rm;
} else if (RC == &X86::GR64_RegClass) {
Opc = X86::MOV64rm;
} else if (RC == &X86::GR32_RegClass) {
Opc = X86::MOV32_rm;
Opc = X86::MOV32rm;
} else if (RC == &X86::GR16_RegClass) {
Opc = X86::MOV16_rm;
Opc = X86::MOV16rm;
} else if (RC == &X86::GR8_RegClass) {
Opc = X86::MOV8rm;
} else if (RC == &X86::GR64_NOREXRegClass) {
Opc = X86::MOV64rm;
} else if (RC == &X86::GR32_NOREXRegClass) {
Opc = X86::MOV32rm;
} else if (RC == &X86::GR16_NOREXRegClass) {
Opc = X86::MOV16rm;
} else if (RC == &X86::GR8_NOREXRegClass) {
Opc = X86::MOV8rm;
} else if (RC == &X86::RFP80RegClass) {
Opc = X86::LD_Fp80m;
} else if (RC == &X86::RFP64RegClass) {

View File

@ -181,6 +181,13 @@ def f64mem : X86MemOperand<"printf64mem">;
def f80mem : X86MemOperand<"printf80mem">;
def f128mem : X86MemOperand<"printf128mem">;
// A version of i8mem for use on x86-64 that uses GR64_NOREX instead of
// plain GR64, so that it doesn't potentially require a REX prefix.
def i8mem_NOREX : Operand<i64> {
let PrintMethod = "printi8mem";
let MIOperandInfo = (ops GR64_NOREX, i8imm, GR64_NOREX, i32imm, i8imm);
}
def lea32mem : Operand<i32> {
let PrintMethod = "printlea32mem";
let MIOperandInfo = (ops GR32, i8imm, GR32, i32imm);
@ -398,6 +405,14 @@ def extloadi32i16 : PatFrag<(ops node:$ptr), (i32 (extloadi16 node:$ptr))>;
def and_su : PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, node:$rhs), [{
return N->hasOneUse();
}]>;
// An 'srl' node with a single use.
def srl_su : PatFrag<(ops node:$lhs, node:$rhs), (srl node:$lhs, node:$rhs), [{
return N->hasOneUse();
}]>;
// An 'trunc' node with a single use.
def trunc_su : PatFrag<(ops node:$src), (trunc node:$src), [{
return N->hasOneUse();
}]>;
// 'shld' and 'shrd' instruction patterns. Note that even though these have
// the srl and shl in their patterns, the C++ code must still check for them,
@ -767,7 +782,12 @@ def MOV16mr : I<0x89, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
def MOV32mr : I<0x89, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
"mov{l}\t{$src, $dst|$dst, $src}",
[(store GR32:$src, addr:$dst)]>;
// A version of MOV8mr that uses i8mem_NOREX so that it can be used for
// storing h registers, which can't be encoded when a REX prefix is present.
def MOV8mr_NOREX : I<0x88, MRMDestMem, (outs), (ins i8mem_NOREX:$dst, GR8:$src),
"mov{b}\t{$src, $dst|$dst, $src} # NOREX", []>;
//===----------------------------------------------------------------------===//
// Fixed-Register Multiplication and Division Instructions...
//
@ -2899,6 +2919,18 @@ def MOVZX32rm16: I<0xB7, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
"movz{wl|x}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (zextloadi32i16 addr:$src))]>, TB;
// These are the same as the regular regular MOVZX32rr8 and MOVZX32rm8
// except that they use GR32_NOREX for the output operand register class
// instead of GR32. This allows them to operate on h registers on x86-64.
def MOVZX32_NOREXrr8 : I<0xB6, MRMSrcReg,
(outs GR32_NOREX:$dst), (ins GR8:$src),
"movz{bl|x}\t{$src, $dst|$dst, $src} # NOREX",
[]>, TB;
def MOVZX32_NOREXrm8 : I<0xB6, MRMSrcMem,
(outs GR32_NOREX:$dst), (ins i8mem:$src),
"movz{bl|x}\t{$src, $dst|$dst, $src} # NOREX",
[]>, TB;
let neverHasSideEffects = 1 in {
let Defs = [AX], Uses = [AL] in
def CBW : I<0x98, RawFrm, (outs), (ins),
@ -2935,33 +2967,6 @@ def MOV32r0 : I<0x31, MRMInitReg, (outs GR32:$dst), (ins),
[(set GR32:$dst, 0)]>;
}
// Basic operations on GR16 / GR32 subclasses GR16_ and GR32_ which contains only
// those registers that have GR8 sub-registers (i.e. AX - DX, EAX - EDX).
let neverHasSideEffects = 1, isAsCheapAsAMove = 1 in {
def MOV16to16_ : I<0x89, MRMDestReg, (outs GR16_:$dst), (ins GR16:$src),
"mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize;
def MOV32to32_ : I<0x89, MRMDestReg, (outs GR32_:$dst), (ins GR32:$src),
"mov{l}\t{$src, $dst|$dst, $src}", []>;
def MOV16_rr : I<0x89, MRMDestReg, (outs GR16_:$dst), (ins GR16_:$src),
"mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize;
def MOV32_rr : I<0x89, MRMDestReg, (outs GR32_:$dst), (ins GR32_:$src),
"mov{l}\t{$src, $dst|$dst, $src}", []>;
} // neverHasSideEffects
let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in {
def MOV16_rm : I<0x8B, MRMSrcMem, (outs GR16_:$dst), (ins i16mem:$src),
"mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize;
def MOV32_rm : I<0x8B, MRMSrcMem, (outs GR32_:$dst), (ins i32mem:$src),
"mov{l}\t{$src, $dst|$dst, $src}", []>;
}
let mayStore = 1, neverHasSideEffects = 1 in {
def MOV16_mr : I<0x89, MRMDestMem, (outs), (ins i16mem:$dst, GR16_:$src),
"mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize;
def MOV32_mr : I<0x89, MRMDestMem, (outs), (ins i32mem:$dst, GR32_:$src),
"mov{l}\t{$src, $dst|$dst, $src}", []>;
}
//===----------------------------------------------------------------------===//
// Thread Local Storage Instructions
//
@ -3341,38 +3346,61 @@ def : Pat<(store (add (loadi32 addr:$dst), 128), addr:$dst),
// r & (2^16-1) ==> movz
def : Pat<(and GR32:$src1, 0xffff),
(MOVZX32rr16 (i16 (EXTRACT_SUBREG GR32:$src1, x86_subreg_16bit)))>;
(MOVZX32rr16 (EXTRACT_SUBREG GR32:$src1, x86_subreg_16bit))>;
// r & (2^8-1) ==> movz
def : Pat<(and GR32:$src1, 0xff),
(MOVZX32rr8 (i8 (EXTRACT_SUBREG (MOV32to32_ GR32:$src1),
x86_subreg_8bit)))>,
(MOVZX32rr8 (EXTRACT_SUBREG (COPY_TO_SUBCLASS GR32:$src1, GR32_),
x86_subreg_8bit))>,
Requires<[In32BitMode]>;
// r & (2^8-1) ==> movz
def : Pat<(and GR16:$src1, 0xff),
(MOVZX16rr8 (i8 (EXTRACT_SUBREG (MOV16to16_ GR16:$src1),
x86_subreg_8bit)))>,
(MOVZX16rr8 (EXTRACT_SUBREG (COPY_TO_SUBCLASS GR16:$src1, GR16_),
x86_subreg_8bit))>,
Requires<[In32BitMode]>;
// sext_inreg patterns
def : Pat<(sext_inreg GR32:$src, i16),
(MOVSX32rr16 (i16 (EXTRACT_SUBREG GR32:$src, x86_subreg_16bit)))>;
(MOVSX32rr16 (EXTRACT_SUBREG GR32:$src, x86_subreg_16bit))>;
def : Pat<(sext_inreg GR32:$src, i8),
(MOVSX32rr8 (i8 (EXTRACT_SUBREG (MOV32to32_ GR32:$src),
x86_subreg_8bit)))>,
(MOVSX32rr8 (EXTRACT_SUBREG (COPY_TO_SUBCLASS GR32:$src, GR32_),
x86_subreg_8bit))>,
Requires<[In32BitMode]>;
def : Pat<(sext_inreg GR16:$src, i8),
(MOVSX16rr8 (i8 (EXTRACT_SUBREG (MOV16to16_ GR16:$src),
x86_subreg_8bit)))>,
(MOVSX16rr8 (EXTRACT_SUBREG (COPY_TO_SUBCLASS GR16:$src, GR16_),
x86_subreg_8bit))>,
Requires<[In32BitMode]>;
// trunc patterns
def : Pat<(i16 (trunc GR32:$src)),
(i16 (EXTRACT_SUBREG GR32:$src, x86_subreg_16bit))>;
(EXTRACT_SUBREG GR32:$src, x86_subreg_16bit)>;
def : Pat<(i8 (trunc GR32:$src)),
(i8 (EXTRACT_SUBREG (MOV32to32_ GR32:$src), x86_subreg_8bit))>,
(EXTRACT_SUBREG (COPY_TO_SUBCLASS GR32:$src, GR32_),
x86_subreg_8bit)>,
Requires<[In32BitMode]>;
def : Pat<(i8 (trunc GR16:$src)),
(i8 (EXTRACT_SUBREG (MOV16to16_ GR16:$src), x86_subreg_8bit))>,
(EXTRACT_SUBREG (COPY_TO_SUBCLASS GR16:$src, GR16_),
x86_subreg_8bit)>,
Requires<[In32BitMode]>;
// h-register tricks
def : Pat<(i8 (trunc (srl_su GR16:$src, (i8 8)))),
(EXTRACT_SUBREG (COPY_TO_SUBCLASS GR16:$src, GR16_),
x86_subreg_8bit_hi)>,
Requires<[In32BitMode]>;
def : Pat<(i8 (trunc (srl_su GR32:$src, (i8 8)))),
(EXTRACT_SUBREG (COPY_TO_SUBCLASS GR32:$src, GR32_),
x86_subreg_8bit_hi)>,
Requires<[In32BitMode]>;
def : Pat<(srl_su GR16:$src, (i8 8)),
(EXTRACT_SUBREG
(MOVZX32rr8
(EXTRACT_SUBREG (COPY_TO_SUBCLASS GR16:$src, GR16_),
x86_subreg_8bit_hi)),
x86_subreg_16bit)>,
Requires<[In32BitMode]>;
def : Pat<(and (srl_su GR32:$src, (i8 8)), (i32 255)),
(MOVZX32rr8 (EXTRACT_SUBREG (COPY_TO_SUBCLASS GR32:$src, GR32_),
x86_subreg_8bit_hi))>,
Requires<[In32BitMode]>;
// (shl x, 1) ==> (add x, x)
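
As an aside (not part of this change): the 32-bit-mode h-register patterns
above cover the analogous cases without the NOREX machinery, for example the
following hypothetical functions. The second is the (srl_su GR16:$src, (i8 8))
case, which zero-extends the h register with MOVZX32rr8 and then takes the
16-bit subregister of the result.

  #include <cstdint>

  // Computes byte 1 of a 16-bit value; the (i8 (trunc (srl_su ...))) pattern
  // lets this come straight from the h register.
  uint8_t highByte16(uint16_t x) {
    return static_cast<uint8_t>(x >> 8);
  }

  // A 16-bit right shift by 8; the result is the zero-extended h register.
  uint16_t shifted16(uint16_t x) {
    return static_cast<uint16_t>(x >> 8);
  }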

View File

@ -35,7 +35,7 @@ namespace X86 {
/// these indices must be kept in sync with the class indices in the
/// X86RegisterInfo.td file.
enum SubregIndex {
SUBREG_8BIT = 1, SUBREG_16BIT = 2, SUBREG_32BIT = 3
SUBREG_8BIT = 1, SUBREG_8BIT_HI = 2, SUBREG_16BIT = 3, SUBREG_32BIT = 4
};
}

View File

@ -49,7 +49,8 @@ let Namespace = "X86" in {
def R14B : Register<"r14b">, DwarfRegNum<[14, -2, -2]>;
def R15B : Register<"r15b">, DwarfRegNum<[15, -2, -2]>;
// High registers X86-32 only
// High registers. On x86-64, these cannot be used in any instruction
// with a REX prefix.
def AH : Register<"ah">, DwarfRegNum<[0, 0, 0]>;
def DH : Register<"dh">, DwarfRegNum<[1, 2, 2]>;
def CH : Register<"ch">, DwarfRegNum<[2, 1, 1]>;
@ -185,41 +186,45 @@ let Namespace = "X86" in {
//
def x86_subreg_8bit : PatLeaf<(i32 1)>;
def x86_subreg_16bit : PatLeaf<(i32 2)>;
def x86_subreg_32bit : PatLeaf<(i32 3)>;
def x86_subreg_8bit_hi : PatLeaf<(i32 2)>;
def x86_subreg_16bit : PatLeaf<(i32 3)>;
def x86_subreg_32bit : PatLeaf<(i32 4)>;
def : SubRegSet<1, [AX, CX, DX, BX, SP, BP, SI, DI,
R8W, R9W, R10W, R11W, R12W, R13W, R14W, R15W],
[AL, CL, DL, BL, SPL, BPL, SIL, DIL,
R8B, R9B, R10B, R11B, R12B, R13B, R14B, R15B]>;
// It's unclear if this subreg set is safe, given that not all registers
// in the class have an 'H' subreg.
// def : SubRegSet<2, [AX, CX, DX, BX],
// [AH, CH, DH, BH]>;
def : SubRegSet<2, [AX, CX, DX, BX],
[AH, CH, DH, BH]>;
def : SubRegSet<1, [EAX, ECX, EDX, EBX, ESP, EBP, ESI, EDI,
R8D, R9D, R10D, R11D, R12D, R13D, R14D, R15D],
[AL, CL, DL, BL, SPL, BPL, SIL, DIL,
R8B, R9B, R10B, R11B, R12B, R13B, R14B, R15B]>;
def : SubRegSet<2, [EAX, ECX, EDX, EBX, ESP, EBP, ESI, EDI,
def : SubRegSet<2, [EAX, ECX, EDX, EBX],
[AH, CH, DH, BH]>;
def : SubRegSet<3, [EAX, ECX, EDX, EBX, ESP, EBP, ESI, EDI,
R8D, R9D, R10D, R11D, R12D, R13D, R14D, R15D],
[AX, CX, DX, BX, SP, BP, SI, DI,
R8W, R9W, R10W, R11W, R12W, R13W, R14W, R15W]>;
def : SubRegSet<1, [RAX, RCX, RDX, RBX, RSP, RBP, RSI, RDI,
R8, R9, R10, R11, R12, R13, R14, R15],
[AL, CL, DL, BL, SPL, BPL, SIL, DIL,
R8B, R9B, R10B, R11B, R12B, R13B, R14B, R15B]>;
def : SubRegSet<2, [RAX, RCX, RDX, RBX, RSP, RBP, RSI, RDI,
def : SubRegSet<2, [RAX, RCX, RDX, RBX],
[AH, CH, DH, BH]>;
def : SubRegSet<3, [RAX, RCX, RDX, RBX, RSP, RBP, RSI, RDI,
R8, R9, R10, R11, R12, R13, R14, R15],
[AX, CX, DX, BX, SP, BP, SI, DI,
R8W, R9W, R10W, R11W, R12W, R13W, R14W, R15W]>;
def : SubRegSet<3, [RAX, RCX, RDX, RBX, RSP, RBP, RSI, RDI,
def : SubRegSet<4, [RAX, RCX, RDX, RBX, RSP, RBP, RSI, RDI,
R8, R9, R10, R11, R12, R13, R14, R15],
[EAX, ECX, EDX, EBX, ESP, EBP, ESI, EDI,
R8D, R9D, R10D, R11D, R12D, R13D, R14D, R15D]>;
@ -236,7 +241,11 @@ def : SubRegSet<3, [RAX, RCX, RDX, RBX, RSP, RBP, RSI, RDI,
// R8B, ... R15B.
// Allocate R12 and R13 last, as these require an extra byte when
// encoded in x86_64 instructions.
// FIXME: Allow AH, CH, DH, BH in 64-mode for non-REX instructions,
// FIXME: Allow AH, CH, DH, BH to be used as general-purpose registers in
// 64-bit mode. The main complication is that they cannot be encoded in an
// instruction requiring a REX prefix, while SIL, DIL, BPL, R8D, etc.
// require a REX prefix. For example, "addb %ah, %dil" and "movzbl %ah, %r8d"
// cannot be encoded.
def GR8 : RegisterClass<"X86", [i8], 8,
[AL, CL, DL, BL, AH, CH, DH, BH, SIL, DIL, BPL, SPL,
R8B, R9B, R10B, R11B, R14B, R15B, R12B, R13B]> {
@ -295,7 +304,7 @@ def GR8 : RegisterClass<"X86", [i8], 8,
def GR16 : RegisterClass<"X86", [i16], 16,
[AX, CX, DX, SI, DI, BX, BP, SP,
R8W, R9W, R10W, R11W, R14W, R15W, R12W, R13W]> {
let SubRegClassList = [GR8];
let SubRegClassList = [GR8, GR8];
let MethodProtos = [{
iterator allocation_order_begin(const MachineFunction &MF) const;
iterator allocation_order_end(const MachineFunction &MF) const;
@ -363,7 +372,7 @@ def GR16 : RegisterClass<"X86", [i16], 16,
def GR32 : RegisterClass<"X86", [i32], 32,
[EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP,
R8D, R9D, R10D, R11D, R14D, R15D, R12D, R13D]> {
let SubRegClassList = [GR8, GR16];
let SubRegClassList = [GR8, GR8, GR16];
let MethodProtos = [{
iterator allocation_order_begin(const MachineFunction &MF) const;
iterator allocation_order_end(const MachineFunction &MF) const;
@ -431,7 +440,7 @@ def GR32 : RegisterClass<"X86", [i32], 32,
def GR64 : RegisterClass<"X86", [i64], 64,
[RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
RBX, R14, R15, R12, R13, RBP, RSP]> {
let SubRegClassList = [GR8, GR16, GR32];
let SubRegClassList = [GR8, GR8, GR16, GR32];
let MethodProtos = [{
iterator allocation_order_end(const MachineFunction &MF) const;
}];
@ -452,13 +461,118 @@ def GR64 : RegisterClass<"X86", [i64], 64,
}
// GR16, GR32 subclasses which contain registers that have GR8 sub-registers.
// These should only be used for 32-bit mode.
// GR8_, GR16_, GR32_, GR64_ - Subclasses of GR8, GR16, GR32, and GR64
// which contain just the "a" "b", "c", and "d" registers. On x86-32,
// GR16_ and GR32_ are classes for registers that support 8-bit subreg
// operations. On x86-64, GR16_, GR32_, and GR64_ are classes for registers
// that support 8-bit h-register operations.
def GR8_ : RegisterClass<"X86", [i8], 8, [AL, CL, DL, BL]> {
}
def GR16_ : RegisterClass<"X86", [i16], 16, [AX, CX, DX, BX]> {
let SubRegClassList = [GR8];
let SubRegClassList = [GR8_, GR8_];
}
def GR32_ : RegisterClass<"X86", [i32], 32, [EAX, ECX, EDX, EBX]> {
let SubRegClassList = [GR8, GR16];
let SubRegClassList = [GR8_, GR8_, GR16_];
}
def GR64_ : RegisterClass<"X86", [i64], 64, [RAX, RCX, RDX, RBX]> {
let SubRegClassList = [GR8_, GR8_, GR16_, GR32_];
}
// GR8_NOREX, GR16_NOREX, GR32_NOREX, GR64_NOREX - Subclasses of
// GR8, GR16, GR32, and GR64 which contain only the first 8 GPRs.
// On x86-64, GR64_NOREX, GR32_NOREX and GR16_NOREX are the classes
// of registers which do not by themselves require a REX prefix.
def GR8_NOREX : RegisterClass<"X86", [i8], 8,
[AL, CL, DL, SIL, DIL, BL, BPL, SPL]> {
}
def GR16_NOREX : RegisterClass<"X86", [i16], 16,
[AX, CX, DX, SI, DI, BX, BP, SP]> {
let SubRegClassList = [GR8_NOREX, GR8_NOREX];
}
// GR32_NOREX - GR32 registers which do not require a REX prefix.
def GR32_NOREX : RegisterClass<"X86", [i32], 32,
[EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP]> {
let SubRegClassList = [GR8_NOREX, GR8_NOREX, GR16_NOREX];
let MethodProtos = [{
iterator allocation_order_begin(const MachineFunction &MF) const;
iterator allocation_order_end(const MachineFunction &MF) const;
}];
let MethodBodies = [{
// Does the function dedicate RBP / EBP to being a frame ptr?
// If so, don't allocate ESP or EBP.
static const unsigned X86_GR32_NOREX_AO_fp[] = {
X86::EAX, X86::ECX, X86::EDX, X86::ESI, X86::EDI, X86::EBX
};
// If not, just don't allocate ESP.
static const unsigned X86_GR32_NOREX_AO[] = {
X86::EAX, X86::ECX, X86::EDX, X86::ESI, X86::EDI, X86::EBX, X86::EBP
};
GR32_NOREXClass::iterator
GR32_NOREXClass::allocation_order_begin(const MachineFunction &MF) const {
const TargetMachine &TM = MF.getTarget();
const TargetRegisterInfo *RI = TM.getRegisterInfo();
if (RI->hasFP(MF))
return X86_GR32_NOREX_AO_fp;
else
return X86_GR32_NOREX_AO;
}
GR32_NOREXClass::iterator
GR32_NOREXClass::allocation_order_end(const MachineFunction &MF) const {
const TargetMachine &TM = MF.getTarget();
const TargetRegisterInfo *RI = TM.getRegisterInfo();
if (RI->hasFP(MF))
return X86_GR32_NOREX_AO_fp +
(sizeof(X86_GR32_NOREX_AO_fp) / sizeof(unsigned));
else
return X86_GR32_NOREX_AO +
(sizeof(X86_GR32_NOREX_AO) / sizeof(unsigned));
}
}];
}
// GR64_NOREX - GR64 registers which do not require a REX prefix.
def GR64_NOREX : RegisterClass<"X86", [i64], 64,
[RAX, RCX, RDX, RSI, RDI, RBX, RBP, RSP]> {
let SubRegClassList = [GR8_NOREX, GR8_NOREX, GR16_NOREX, GR32_NOREX];
let MethodProtos = [{
iterator allocation_order_begin(const MachineFunction &MF) const;
iterator allocation_order_end(const MachineFunction &MF) const;
}];
let MethodBodies = [{
// Does the function dedicate RBP / EBP to being a frame ptr?
// If so, don't allocate RSP or RBP.
static const unsigned X86_GR64_NOREX_AO_fp[] = {
X86::RAX, X86::RCX, X86::RDX, X86::RSI, X86::RDI, X86::RBX
};
// If not, just don't allocate RSP.
static const unsigned X86_GR64_NOREX_AO[] = {
X86::RAX, X86::RCX, X86::RDX, X86::RSI, X86::RDI, X86::RBX, X86::RBP
};
GR64_NOREXClass::iterator
GR64_NOREXClass::allocation_order_begin(const MachineFunction &MF) const {
const TargetMachine &TM = MF.getTarget();
const TargetRegisterInfo *RI = TM.getRegisterInfo();
if (RI->hasFP(MF))
return X86_GR64_NOREX_AO_fp;
else
return X86_GR64_NOREX_AO;
}
GR64_NOREXClass::iterator
GR64_NOREXClass::allocation_order_end(const MachineFunction &MF) const {
const TargetMachine &TM = MF.getTarget();
const TargetRegisterInfo *RI = TM.getRegisterInfo();
if (RI->hasFP(MF))
return X86_GR64_NOREX_AO_fp +
(sizeof(X86_GR64_NOREX_AO_fp) / sizeof(unsigned));
else
return X86_GR64_NOREX_AO +
(sizeof(X86_GR64_NOREX_AO) / sizeof(unsigned));
}
}];
}
// A class to support the 'A' assembler constraint: EAX then EDX.

View File

@ -0,0 +1,53 @@
; RUN: llvm-as < %s | llc -march=x86 | grep {movzbl %\[abcd\]h,} | count 7
; Use h-register extract and zero-extend.
define double @foo8(double* nocapture inreg %p, i32 inreg %x) nounwind readonly {
%t0 = lshr i32 %x, 8
%t1 = and i32 %t0, 255
%t2 = getelementptr double* %p, i32 %t1
%t3 = load double* %t2, align 8
ret double %t3
}
define float @foo4(float* nocapture inreg %p, i32 inreg %x) nounwind readonly {
%t0 = lshr i32 %x, 8
%t1 = and i32 %t0, 255
%t2 = getelementptr float* %p, i32 %t1
%t3 = load float* %t2, align 8
ret float %t3
}
define i16 @foo2(i16* nocapture inreg %p, i32 inreg %x) nounwind readonly {
%t0 = lshr i32 %x, 8
%t1 = and i32 %t0, 255
%t2 = getelementptr i16* %p, i32 %t1
%t3 = load i16* %t2, align 8
ret i16 %t3
}
define i8 @foo1(i8* nocapture inreg %p, i32 inreg %x) nounwind readonly {
%t0 = lshr i32 %x, 8
%t1 = and i32 %t0, 255
%t2 = getelementptr i8* %p, i32 %t1
%t3 = load i8* %t2, align 8
ret i8 %t3
}
define i8 @bar8(i8* nocapture inreg %p, i32 inreg %x) nounwind readonly {
%t0 = lshr i32 %x, 5
%t1 = and i32 %t0, 2040
%t2 = getelementptr i8* %p, i32 %t1
%t3 = load i8* %t2, align 8
ret i8 %t3
}
define i8 @bar4(i8* nocapture inreg %p, i32 inreg %x) nounwind readonly {
%t0 = lshr i32 %x, 6
%t1 = and i32 %t0, 1020
%t2 = getelementptr i8* %p, i32 %t1
%t3 = load i8* %t2, align 8
ret i8 %t3
}
define i8 @bar2(i8* nocapture inreg %p, i32 inreg %x) nounwind readonly {
%t0 = lshr i32 %x, 7
%t1 = and i32 %t0, 510
%t2 = getelementptr i8* %p, i32 %t1
%t3 = load i8* %t2, align 8
ret i8 %t3
}

View File

@ -0,0 +1,53 @@
; RUN: llvm-as < %s | llc -march=x86-64 | grep {movzbl %\[abcd\]h,} | count 7
; Use h-register extract and zero-extend.
define double @foo8(double* nocapture inreg %p, i64 inreg %x) nounwind readonly {
%t0 = lshr i64 %x, 8
%t1 = and i64 %t0, 255
%t2 = getelementptr double* %p, i64 %t1
%t3 = load double* %t2, align 8
ret double %t3
}
define float @foo4(float* nocapture inreg %p, i64 inreg %x) nounwind readonly {
%t0 = lshr i64 %x, 8
%t1 = and i64 %t0, 255
%t2 = getelementptr float* %p, i64 %t1
%t3 = load float* %t2, align 8
ret float %t3
}
define i16 @foo2(i16* nocapture inreg %p, i64 inreg %x) nounwind readonly {
%t0 = lshr i64 %x, 8
%t1 = and i64 %t0, 255
%t2 = getelementptr i16* %p, i64 %t1
%t3 = load i16* %t2, align 8
ret i16 %t3
}
define i8 @foo1(i8* nocapture inreg %p, i64 inreg %x) nounwind readonly {
%t0 = lshr i64 %x, 8
%t1 = and i64 %t0, 255
%t2 = getelementptr i8* %p, i64 %t1
%t3 = load i8* %t2, align 8
ret i8 %t3
}
define i8 @bar8(i8* nocapture inreg %p, i64 inreg %x) nounwind readonly {
%t0 = lshr i64 %x, 5
%t1 = and i64 %t0, 2040
%t2 = getelementptr i8* %p, i64 %t1
%t3 = load i8* %t2, align 8
ret i8 %t3
}
define i8 @bar4(i8* nocapture inreg %p, i64 inreg %x) nounwind readonly {
%t0 = lshr i64 %x, 6
%t1 = and i64 %t0, 1020
%t2 = getelementptr i8* %p, i64 %t1
%t3 = load i8* %t2, align 8
ret i8 %t3
}
define i8 @bar2(i8* nocapture inreg %p, i64 inreg %x) nounwind readonly {
%t0 = lshr i64 %x, 7
%t1 = and i64 %t0, 510
%t2 = getelementptr i8* %p, i64 %t1
%t3 = load i8* %t2, align 8
ret i8 %t3
}

View File

@ -0,0 +1,27 @@
; RUN: llvm-as < %s | llc -march=x86-64 > %t
; RUN: grep mov %t | count 6
; RUN: grep {movb %ah, (%rsi)} %t | count 3
; RUN: llvm-as < %s | llc -march=x86 > %t
; RUN: grep mov %t | count 3
; RUN: grep {movb %ah, (%e} %t | count 3
; Use h-register extract and store.
define void @foo16(i16 inreg %p, i8* inreg %z) nounwind {
%q = lshr i16 %p, 8
%t = trunc i16 %q to i8
store i8 %t, i8* %z
ret void
}
define void @foo32(i32 inreg %p, i8* inreg %z) nounwind {
%q = lshr i32 %p, 8
%t = trunc i32 %q to i8
store i8 %t, i8* %z
ret void
}
define void @foo64(i64 inreg %p, i8* inreg %z) nounwind {
%q = lshr i64 %p, 8
%t = trunc i64 %q to i8
store i8 %t, i8* %z
ret void
}

View File

@ -0,0 +1,48 @@
; RUN: llvm-as < %s | llc -march=x86-64 | grep {movzbl %\[abcd\]h,} | count 4
; RUN: llvm-as < %s | llc -march=x86 > %t
; RUN: grep {incb %ah} %t | count 3
; RUN: grep {movzbl %ah,} %t | count 3
; Use h registers. On x86-64, codegen doesn't support general allocation
; of h registers yet, due to x86 encoding complications.
define void @bar64(i64 inreg %x, i8* inreg %p) nounwind {
%t0 = lshr i64 %x, 8
%t1 = trunc i64 %t0 to i8
%t2 = add i8 %t1, 1
store i8 %t2, i8* %p
ret void
}
define void @bar32(i32 inreg %x, i8* inreg %p) nounwind {
%t0 = lshr i32 %x, 8
%t1 = trunc i32 %t0 to i8
%t2 = add i8 %t1, 1
store i8 %t2, i8* %p
ret void
}
define void @bar16(i16 inreg %x, i8* inreg %p) nounwind {
%t0 = lshr i16 %x, 8
%t1 = trunc i16 %t0 to i8
%t2 = add i8 %t1, 1
store i8 %t2, i8* %p
ret void
}
define i64 @qux64(i64 inreg %x) nounwind {
%t0 = lshr i64 %x, 8
%t1 = and i64 %t0, 255
ret i64 %t1
}
define i32 @qux32(i32 inreg %x) nounwind {
%t0 = lshr i32 %x, 8
%t1 = and i32 %t0, 255
ret i32 %t1
}
define i16 @qux16(i16 inreg %x) nounwind {
%t0 = lshr i16 %x, 8
ret i16 %t0
}

View File

@ -1,6 +1,4 @@
; RUN: llvm-as < %s | llc -mtriple=i386-unknown-linux-gnu
; XFAIL: *
; Expected to run out of registers during allocation.
; PR3391
@pci_indirect = external global { } ; <{ }*> [#uses=1]