[AMDGPU] gfx1010 core wave32 changes

Differential Revision: https://reviews.llvm.org/D63204
llvm-svn: 363934
2025-01-31 20:51:52 +01:00 · 2019-06-20 15:08:34 +00:00 · 2019-06-20 15:08:34 +00:00 · 2653a95667
commit 2653a95667
parent 222088eed6
32 changed files with 1933 additions and 60 deletions
--- a/lib/Target/AMDGPU/AMDGPU.td
+++ b/lib/Target/AMDGPU/AMDGPU.td
@ -777,7 +777,7 @@ def FeatureISAVersion10_1_0 : FeatureSet<
     FeatureLDSBankCount32,
     FeatureDLInsts,
     FeatureNSAEncoding,
-     FeatureWavefrontSize64,
+     FeatureWavefrontSize32,
     FeatureScalarStores,
     FeatureScalarAtomics,
     FeatureScalarFlatScratchInsts,
@ -795,7 +795,7 @@ def FeatureISAVersion10_1_1 : FeatureSet<
     FeatureDot5Insts,
     FeatureDot6Insts,
     FeatureNSAEncoding,
-     FeatureWavefrontSize64,
+     FeatureWavefrontSize32,
     FeatureScalarStores,
     FeatureScalarAtomics,
     FeatureScalarFlatScratchInsts,
@ -812,7 +812,7 @@ def FeatureISAVersion10_1_2 : FeatureSet<
     FeatureDot5Insts,
     FeatureDot6Insts,
     FeatureNSAEncoding,
-     FeatureWavefrontSize64,
+     FeatureWavefrontSize32,
     FeatureScalarStores,
     FeatureScalarAtomics,
     FeatureScalarFlatScratchInsts,
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@ -50,19 +50,19 @@ def AMDGPUFmasOp : SDTypeProfile<1, 4,
 def AMDGPUKillSDT : SDTypeProfile<0, 1, [SDTCisInt<0>]>;

 def AMDGPUIfOp : SDTypeProfile<1, 2,
-  [SDTCisVT<0, i64>, SDTCisVT<1, i1>, SDTCisVT<2, OtherVT>]
+  [SDTCisVT<0, i1>, SDTCisVT<1, i1>, SDTCisVT<2, OtherVT>]
 >;

 def AMDGPUElseOp : SDTypeProfile<1, 2,
-  [SDTCisVT<0, i64>, SDTCisVT<1, i64>, SDTCisVT<2, OtherVT>]
+  [SDTCisVT<0, i1>, SDTCisVT<1, i1>, SDTCisVT<2, OtherVT>]
 >;

 def AMDGPULoopOp : SDTypeProfile<0, 2,
-  [SDTCisVT<0, i64>, SDTCisVT<1, OtherVT>]
+  [SDTCisVT<0, i1>, SDTCisVT<1, OtherVT>]
 >;

 def AMDGPUIfBreakOp : SDTypeProfile<1, 2,
-  [SDTCisVT<0, i64>, SDTCisVT<1, i1>, SDTCisVT<2, i64>]
+  [SDTCisVT<0, i1>, SDTCisVT<1, i1>, SDTCisVT<2, i1>]
 >;

 //===----------------------------------------------------------------------===//
--- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@ -101,6 +101,12 @@ static DecodeStatus decodeSoppBrTarget(MCInst &Inst, unsigned Imm,
  return addOperand(Inst, MCOperand::createImm(Imm));
 }

+static DecodeStatus decodeBoolReg(MCInst &Inst, unsigned Val,
+                                  uint64_t Addr, const void *Decoder) {
+  auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
+  return addOperand(Inst, DAsm->decodeBoolReg(Val));
+}
+
 #define DECODE_OPERAND(StaticDecoderName, DecoderName) \
 static DecodeStatus StaticDecoderName(MCInst &Inst, \
                                       unsigned Imm, \
--- a/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/lib/Target/AMDGPU/SIInstrInfo.h
@ -946,6 +946,15 @@ public:
  /// not exist. If Opcode is not a pseudo instruction, this is identity.
  int pseudoToMCOpcode(int Opcode) const;

+  const TargetRegisterClass *getRegClass(const MCInstrDesc &TID, unsigned OpNum,
+                                         const TargetRegisterInfo *TRI,
+                                         const MachineFunction &MF)
+    const override {
+    if (OpNum >= TID.getNumOperands())
+      return nullptr;
+    return RI.getRegClass(TID.OpInfo[OpNum].RegClass);
+  }
+
  void fixImplicitOperands(MachineInstr &MI) const;
 };

--- a/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/lib/Target/AMDGPU/SIInstrInfo.td
@ -766,6 +766,15 @@ def VOPDstS64orS32 : BoolRC {
  let PrintMethod = "printVOPDst";
 }

+// SCSrc_i1 is the operand for pseudo instructions only.
+// Boolean immediates shall not be exposed to codegen instructions.
+def SCSrc_i1 : RegisterOperand<SReg_1_XEXEC> {
+  let OperandNamespace = "AMDGPU";
+  let OperandType = "OPERAND_REG_IMM_INT32";
+  let ParserMatchClass = BoolReg;
+  let DecoderMethod = "decodeBoolReg";
+}
+
 // ===----------------------------------------------------------------------===//
 // ExpSrc* Special cases for exp src operands which are printed as
 // "off" depending on en operand.
@ -804,11 +813,12 @@ def SDWASrc_i16 : SDWASrc<i16>;
 def SDWASrc_f32 : SDWASrc<f32>;
 def SDWASrc_f16 : SDWASrc<f16>;

-def SDWAVopcDst : VOPDstOperand<SReg_64> {
+def SDWAVopcDst : BoolRC {
  let OperandNamespace = "AMDGPU";
  let OperandType = "OPERAND_SDWA_VOPC_DST";
  let EncoderMethod = "getSDWAVopcDstEncoding";
  let DecoderMethod = "decodeSDWAVopcDst";
+  let PrintMethod = "printVOPDst";
 }

 class NamedMatchClass<string CName, bit Optional = 1> : AsmOperandClass {
@ -940,11 +950,6 @@ def f32kimm : kimmOperand<i32>;
 def KImmFP16MatchClass : KImmMatchClass<16>;
 def f16kimm : kimmOperand<i16>;

-
-def VOPDstS64 : VOPDstOperand <SReg_64> {
-  let PrintMethod = "printVOPDst";
-}
-
 class FPInputModsMatchClass <int opSize> : AsmOperandClass {
  let Name = "RegOrImmWithFP"#opSize#"InputMods";
  let ParserMethod = "parseRegOrImmWithFPInputMods";
@ -1237,7 +1242,7 @@ class getVALUDstForVT<ValueType VT> {
                          !if(!eq(VT.Size, 128), VOPDstOperand<VReg_128>,
                            !if(!eq(VT.Size, 64), VOPDstOperand<VReg_64>,
                              !if(!eq(VT.Size, 16), VOPDstOperand<VGPR_32>,
-                              VOPDstOperand<SReg_64>)))); // else VT == i1
+                              VOPDstS64orS32)))); // else VT == i1
 }

 // Returns the register class to use for the destination of VOP[12C]
@ -1313,7 +1318,7 @@ class getVOP3SrcForVT<ValueType VT> {
           VSrc_f64,
           VSrc_b64),
        !if(!eq(VT.Value, i1.Value),
-           SCSrc_i1,
+           SSrc_i1,
           !if(isFP,
              !if(!eq(VT.Value, f16.Value),
                 VSrc_f16,
--- a/lib/Target/AMDGPU/SIInstructions.td
+++ b/lib/Target/AMDGPU/SIInstructions.td
@ -121,14 +121,14 @@ def WWM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;

 } // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]

-def ENTER_WWM : SPseudoInstSI <(outs SReg_64:$sdst), (ins i64imm:$src0)> {
+def ENTER_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins i64imm:$src0)> {
  let Defs = [EXEC];
  let hasSideEffects = 0;
  let mayLoad = 0;
  let mayStore = 0;
 }

-def EXIT_WWM : SPseudoInstSI <(outs SReg_64:$sdst), (ins SReg_64:$src0)> {
+def EXIT_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins SReg_1:$src0)> {
  let hasSideEffects = 0;
  let mayLoad = 0;
  let mayStore = 0;
@ -161,11 +161,11 @@ def S_SUB_U64_PSEUDO : SPseudoInstSI <
 >;

 def S_ADD_U64_CO_PSEUDO : SPseudoInstSI <
-  (outs SReg_64:$vdst, VOPDstS64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1)
+  (outs SReg_64:$vdst, VOPDstS64orS32:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1)
 >;

 def S_SUB_U64_CO_PSEUDO : SPseudoInstSI <
-  (outs SReg_64:$vdst, VOPDstS64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1)
+  (outs SReg_64:$vdst, VOPDstS64orS32:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1)
 >;
 } // End usesCustomInserter = 1, Defs = [SCC]

@ -233,30 +233,30 @@ let isTerminator = 1 in {
 let OtherPredicates = [EnableLateCFGStructurize] in {
 def SI_NON_UNIFORM_BRCOND_PSEUDO : CFPseudoInstSI <
  (outs),
-  (ins SReg_64:$vcc, brtarget:$target),
+  (ins SReg_1:$vcc, brtarget:$target),
  [(brcond i1:$vcc, bb:$target)]> {
    let Size = 12;
 }
 }

 def SI_IF: CFPseudoInstSI <
-  (outs SReg_64:$dst), (ins SReg_64:$vcc, brtarget:$target),
-  [(set i64:$dst, (AMDGPUif i1:$vcc, bb:$target))], 1, 1> {
+  (outs SReg_1:$dst), (ins SReg_1:$vcc, brtarget:$target),
+  [(set i1:$dst, (AMDGPUif i1:$vcc, bb:$target))], 1, 1> {
  let Constraints = "";
  let Size = 12;
  let hasSideEffects = 1;
 }

 def SI_ELSE : CFPseudoInstSI <
-  (outs SReg_64:$dst),
-  (ins SReg_64:$src, brtarget:$target, i1imm:$execfix), [], 1, 1> {
+  (outs SReg_1:$dst),
+  (ins SReg_1:$src, brtarget:$target, i1imm:$execfix), [], 1, 1> {
  let Size = 12;
  let hasSideEffects = 1;
 }

 def SI_LOOP : CFPseudoInstSI <
-  (outs), (ins SReg_64:$saved, brtarget:$target),
-  [(AMDGPUloop i64:$saved, bb:$target)], 1, 1> {
+  (outs), (ins SReg_1:$saved, brtarget:$target),
+  [(AMDGPUloop i1:$saved, bb:$target)], 1, 1> {
  let Size = 8;
  let isBranch = 1;
  let hasSideEffects = 1;
@ -265,8 +265,7 @@ def SI_LOOP : CFPseudoInstSI <
 } // End isTerminator = 1

 def SI_END_CF : CFPseudoInstSI <
-  (outs), (ins SReg_64:$saved),
-  [(int_amdgcn_end_cf i64:$saved)], 1, 1> {
+  (outs), (ins SReg_1:$saved), [], 1, 1> {
  let Size = 4;
  let isAsCheapAsAMove = 1;
  let isReMaterializable = 1;
@ -276,8 +275,7 @@ def SI_END_CF : CFPseudoInstSI <
 }

 def SI_IF_BREAK : CFPseudoInstSI <
-  (outs SReg_64:$dst), (ins SReg_64:$vcc, SReg_64:$src),
-  [(set i64:$dst, (int_amdgcn_if_break i1:$vcc, i64:$src))]> {
+  (outs SReg_1:$dst), (ins SReg_1:$vcc, SReg_1:$src), []> {
  let Size = 4;
  let isAsCheapAsAMove = 1;
  let isReMaterializable = 1;
@ -303,7 +301,7 @@ multiclass PseudoInstKill <dag ins> {
  }
 }

-defm SI_KILL_I1 : PseudoInstKill <(ins SSrc_b64:$src, i1imm:$killvalue)>;
+defm SI_KILL_I1 : PseudoInstKill <(ins SCSrc_i1:$src, i1imm:$killvalue)>;
 defm SI_KILL_F32_COND_IMM : PseudoInstKill <(ins VSrc_b32:$src0, i32imm:$src1, i32imm:$cond)>;

 let Defs = [EXEC,VCC] in
@ -322,7 +320,7 @@ def SI_BR_UNDEF : SPseudoInstSI <(outs), (ins sopp_brtarget:$simm16)> {
 }

 def SI_PS_LIVE : PseudoInstSI <
-  (outs SReg_64:$dst), (ins),
+  (outs SReg_1:$dst), (ins),
  [(set i1:$dst, (int_amdgcn_ps_live))]> {
  let SALU = 1;
 }
@ -584,7 +582,7 @@ def : GCNPat<
 >;

 def : GCNPat<
-  (AMDGPUelse i64:$src, bb:$target),
+  (AMDGPUelse i1:$src, bb:$target),
  (SI_ELSE $src, $target, 0)
 >;

--- a/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/lib/Target/AMDGPU/SIRegisterInfo.td
@ -733,8 +733,6 @@ def SSrcOrLds_b32 : RegisterOperand<SRegOrLds_32> {

 defm SCSrc : RegInlineOperand<"SReg", "SCSrc"> ;

-def SCSrc_i1 : RegisterOperand<SReg_64_XEXEC>;
-
 //===----------------------------------------------------------------------===//
 //  VSrc_* Operands with an SGPR, VGPR or a 32-bit immediate
 //===----------------------------------------------------------------------===//
--- a/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/lib/Target/AMDGPU/VOP2Instructions.td
@ -344,7 +344,7 @@ def VOP2b_I32_I1_I32_I32 : VOPProfile<[i32, i32, i32, untyped], 0, /*EnableClamp
  let AsmDPP8 = "$vdst, vcc, $src0, $src1 $dpp8$fi";
  let AsmDPP16 = AsmDPP#"$fi";
  let Outs32 = (outs DstRC:$vdst);
-  let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst);
+  let Outs64 = (outs DstRC:$vdst, VOPDstS64orS32:$sdst);
 }

 // Write out to vcc or arbitrary SGPR and read in from vcc or
@ -358,7 +358,7 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1], 0, /*EnableClamp=*
  let AsmDPP8 = "$vdst, vcc, $src0, $src1, vcc $dpp8$fi";
  let AsmDPP16 = AsmDPP#"$fi";
  let Outs32 = (outs DstRC:$vdst);
-  let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst);
+  let Outs64 = (outs DstRC:$vdst, VOPDstS64orS32:$sdst);

  // Suppress src2 implied by type since the 32-bit encoding uses an
  // implicit VCC use.
--- a/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/lib/Target/AMDGPU/VOP3Instructions.td
@ -183,7 +183,7 @@ class VOP3b_Profile<ValueType vt> : VOPProfile<[vt, vt, vt, vt]> {
  let HasModifiers = 0;
  let HasClamp = 0;
  let HasOMod = 0;
-  let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst);
+  let Outs64 = (outs DstRC:$vdst, VOPDstS64orS32:$sdst);
  let Asm64 = " $vdst, $sdst, $src0, $src1, $src2";
 }

@ -203,7 +203,7 @@ def VOP3b_I64_I1_I32_I32_I64 : VOPProfile<[i64, i32, i32, i64]> {
  // FIXME: Hack to stop printing _e64
  let DstRC = RegisterOperand<VReg_64>;

-  let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst);
+  let Outs64 = (outs DstRC:$vdst, VOPDstS64orS32:$sdst);
  let Asm64 = " $vdst, $sdst, $src0, $src1, $src2$clamp";
 }

--- a/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/lib/Target/AMDGPU/VOPCInstructions.td
@ -56,7 +56,7 @@ class VOPC_Profile<list<SchedReadWrite> sched, ValueType vt0, ValueType vt1 = vt
  let Asm32 = "$src0, $src1";
  // The destination for 32-bit encoding is implicit.
  let HasDst32 = 0;
-  let Outs64 = (outs VOPDstS64:$sdst);
+  let Outs64 = (outs VOPDstS64orS32:$sdst);
  list<SchedReadWrite> Schedule = sched;
 }

--- a/test/CodeGen/AMDGPU/add3.ll
+++ b/test/CodeGen/AMDGPU/add3.ll
@ -22,6 +22,7 @@ define amdgpu_ps float @add3(i32 %a, i32 %b, i32 %c) {
 ; GFX10-LABEL: add3:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_add3_u32 v0, v0, v1, v2
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    ; return to shader part epilog
  %x = add i32 %a, %b
  %result = add i32 %x, %c
@ -46,6 +47,7 @@ define amdgpu_ps float @mad_no_add3(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) {
 ; GFX10-LABEL: mad_no_add3:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_mad_u32_u24 v0, v0, v1, v4
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    v_mad_u32_u24 v0, v2, v3, v0
 ; GFX10-NEXT:    ; return to shader part epilog
  %a0 = shl i32 %a, 8
@ -85,6 +87,7 @@ define amdgpu_ps float @add3_vgpr_b(i32 inreg %a, i32 %b, i32 inreg %c) {
 ; GFX10-LABEL: add3_vgpr_b:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_add3_u32 v0, s3, s2, v0
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    ; return to shader part epilog
  %x = add i32 %a, %b
  %result = add i32 %x, %c
@ -107,6 +110,7 @@ define amdgpu_ps float @add3_vgpr_all2(i32 %a, i32 %b, i32 %c) {
 ; GFX10-LABEL: add3_vgpr_all2:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_add3_u32 v0, v1, v2, v0
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    ; return to shader part epilog
  %x = add i32 %b, %c
  %result = add i32 %a, %x
@ -129,6 +133,7 @@ define amdgpu_ps float @add3_vgpr_bc(i32 inreg %a, i32 %b, i32 %c) {
 ; GFX10-LABEL: add3_vgpr_bc:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_add3_u32 v0, s2, v0, v1
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    ; return to shader part epilog
  %x = add i32 %a, %b
  %result = add i32 %x, %c
@ -151,6 +156,7 @@ define amdgpu_ps float @add3_vgpr_const(i32 %a, i32 %b) {
 ; GFX10-LABEL: add3_vgpr_const:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_add3_u32 v0, v0, v1, 16
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    ; return to shader part epilog
  %x = add i32 %a, %b
  %result = add i32 %x, 16
@ -175,6 +181,7 @@ define amdgpu_ps <2 x float> @add3_multiuse_outer(i32 %a, i32 %b, i32 %c, i32 %x
 ; GFX10-LABEL: add3_multiuse_outer:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_add3_u32 v0, v0, v1, v2
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    v_mul_lo_u32 v1, v0, v3
 ; GFX10-NEXT:    ; return to shader part epilog
  %inner = add i32 %a, %b
@ -202,6 +209,7 @@ define amdgpu_ps <2 x float> @add3_multiuse_inner(i32 %a, i32 %b, i32 %c) {
 ; GFX10-LABEL: add3_multiuse_inner:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v1
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, v0, v2
 ; GFX10-NEXT:    ; return to shader part epilog
  %inner = add i32 %a, %b
@ -240,6 +248,7 @@ define amdgpu_ps float @add3_uniform_vgpr(float inreg %a, float inreg %b, float
 ; GFX10-NEXT:    v_add_f32_e64 v1, s3, 2.0
 ; GFX10-NEXT:    v_add_f32_e64 v2, s2, 1.0
 ; GFX10-NEXT:    v_add_f32_e64 v0, 0x40400000, s4
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, v2, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, v1, v0
 ; GFX10-NEXT:    ; return to shader part epilog
--- a/test/CodeGen/AMDGPU/add_i1.ll
+++ b/test/CodeGen/AMDGPU/add_i1.ll
@ -1,8 +1,10 @@
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s


 ; GCN-LABEL: {{^}}add_var_var_i1:
-; GCN: s_xor_b64
+; GFX9:  s_xor_b64
+; GFX10: s_xor_b32
 define amdgpu_kernel void @add_var_var_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in0, i1 addrspace(1)* %in1) {
  %a = load volatile i1, i1 addrspace(1)* %in0
  %b = load volatile i1, i1 addrspace(1)* %in1
@ -12,7 +14,8 @@ define amdgpu_kernel void @add_var_var_i1(i1 addrspace(1)* %out, i1 addrspace(1)
 }

 ; GCN-LABEL: {{^}}add_var_imm_i1:
-; GCN: s_not_b64
+; GFX9:  s_not_b64
+; GFX10: s_not_b32
 define amdgpu_kernel void @add_var_imm_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) {
  %a = load volatile i1, i1 addrspace(1)* %in
  %add = add i1 %a, 1
@ -22,7 +25,8 @@ define amdgpu_kernel void @add_var_imm_i1(i1 addrspace(1)* %out, i1 addrspace(1)

 ; GCN-LABEL: {{^}}add_i1_cf:
 ; GCN: ; %endif
-; GCN: s_not_b64
+; GFX9: s_not_b64
+; GFX10: s_not_b32
 define amdgpu_kernel void @add_i1_cf(i1 addrspace(1)* %out, i1 addrspace(1)* %a, i1 addrspace(1)* %b) {
 entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
--- a/test/CodeGen/AMDGPU/add_shl.ll
+++ b/test/CodeGen/AMDGPU/add_shl.ll
@ -22,6 +22,7 @@ define amdgpu_ps float @add_shl(i32 %a, i32 %b, i32 %c) {
 ; GFX10-LABEL: add_shl:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_add_lshl_u32 v0, v0, v1, v2
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    ; return to shader part epilog
  %x = add i32 %a, %b
  %result = shl i32 %x, %c
@ -45,6 +46,7 @@ define amdgpu_ps float @add_shl_vgpr_c(i32 inreg %a, i32 inreg %b, i32 %c) {
 ; GFX10-LABEL: add_shl_vgpr_c:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_add_lshl_u32 v0, s2, s3, v0
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    ; return to shader part epilog
  %x = add i32 %a, %b
  %result = shl i32 %x, %c
@ -67,6 +69,7 @@ define amdgpu_ps float @add_shl_vgpr_ac(i32 %a, i32 inreg %b, i32 %c) {
 ; GFX10-LABEL: add_shl_vgpr_ac:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_add_lshl_u32 v0, v0, s2, v1
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    ; return to shader part epilog
  %x = add i32 %a, %b
  %result = shl i32 %x, %c
@ -89,6 +92,7 @@ define amdgpu_ps float @add_shl_vgpr_const(i32 %a, i32 %b) {
 ; GFX10-LABEL: add_shl_vgpr_const:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_add_lshl_u32 v0, v0, v1, 9
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    ; return to shader part epilog
  %x = add i32 %a, %b
  %result = shl i32 %x, 9
@ -112,6 +116,7 @@ define amdgpu_ps float @add_shl_vgpr_const_inline_const(i32 %a) {
 ; GFX10-LABEL: add_shl_vgpr_const_inline_const:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 9, 0x7e800
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    ; return to shader part epilog
  %x = add i32 %a, 1012
  %result = shl i32 %x, 9
@ -138,6 +143,7 @@ define amdgpu_ps float @add_shl_vgpr_inline_const_x2(i32 %a) {
 ; GFX10-LABEL: add_shl_vgpr_inline_const_x2:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 9, 0x600
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    ; return to shader part epilog
  %x = add i32 %a, 3
  %result = shl i32 %x, 9
--- a/test/CodeGen/AMDGPU/and_or.ll
+++ b/test/CodeGen/AMDGPU/and_or.ll
@ -22,6 +22,7 @@ define amdgpu_ps float @and_or(i32 %a, i32 %b, i32 %c) {
 ; GFX10-LABEL: and_or:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_and_or_b32 v0, v0, v1, v2
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    ; return to shader part epilog
  %x = and i32 %a, %b
  %result = or i32 %x, %c
@ -46,6 +47,7 @@ define amdgpu_ps float @and_or_vgpr_b(i32 inreg %a, i32 %b, i32 inreg %c) {
 ; GFX10-LABEL: and_or_vgpr_b:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_and_or_b32 v0, s2, v0, s3
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    ; return to shader part epilog
  %x = and i32 %a, %b
  %result = or i32 %x, %c
@ -68,6 +70,7 @@ define amdgpu_ps float @and_or_vgpr_ab(i32 %a, i32 %b, i32 inreg %c) {
 ; GFX10-LABEL: and_or_vgpr_ab:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_and_or_b32 v0, v0, v1, s2
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    ; return to shader part epilog
  %x = and i32 %a, %b
  %result = or i32 %x, %c
@ -90,6 +93,7 @@ define amdgpu_ps float @and_or_vgpr_const(i32 %a, i32 %b) {
 ; GFX10-LABEL: and_or_vgpr_const:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_and_or_b32 v0, v0, 4, v1
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    ; return to shader part epilog
  %x = and i32 4, %a
  %result = or i32 %x, %b
@ -113,6 +117,7 @@ define amdgpu_ps float @and_or_vgpr_const_inline_const(i32 %a) {
 ; GFX10-LABEL: and_or_vgpr_const_inline_const:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_and_or_b32 v0, v0, 20, 0x808
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    ; return to shader part epilog
  %x = and i32 20, %a
  %result = or i32 %x, 2056
@ -135,6 +140,7 @@ define amdgpu_ps float @and_or_vgpr_inline_const_x2(i32 %a) {
 ; GFX10-LABEL: and_or_vgpr_inline_const_x2:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_and_or_b32 v0, v0, 4, 1
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    ; return to shader part epilog
  %x = and i32 4, %a
  %result = or i32 %x, 1
--- a/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll
+++ b/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll
@ -1,7 +1,7 @@
 ; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=WAVE64 --check-prefix=NOTES %s
 ; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX803 --check-prefix=WAVE64 --check-prefix=NOTES %s
 ; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=WAVE64 --check-prefix=NOTES %s
-; run: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX1010 --check-prefix=WAVE32 --check-prefix=NOTES %s
+; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX1010 --check-prefix=WAVE32 --check-prefix=NOTES %s

@var = addrspace(1) global float 0.0

--- a/test/CodeGen/AMDGPU/huge-private-buffer.ll
+++ b/test/CodeGen/AMDGPU/huge-private-buffer.ll
@ -1,9 +1,23 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,WAVE64 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,WAVE32 %s
+
+; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo14:
+; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4
+; GCN: v_and_b32_e32 [[MASKED:v[0-9]+]], 0x3ffc, [[FI]]
+; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]]
+define amdgpu_kernel void @scratch_buffer_known_high_masklo14() #0 {
+  %alloca = alloca i32, align 4, addrspace(5)
+  store volatile i32 0, i32 addrspace(5)* %alloca
+  %toint = ptrtoint i32 addrspace(5)* %alloca to i32
+  %masked = and i32 %toint, 16383
+  store volatile i32 %masked, i32 addrspace(1)* undef
+  ret void
+}

 ; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo16:
 ; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4
 ; GCN: v_and_b32_e32 [[MASKED:v[0-9]+]], 0xfffc, [[FI]]
-; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]]
+; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]]
 define amdgpu_kernel void @scratch_buffer_known_high_masklo16() #0 {
  %alloca = alloca i32, align 4, addrspace(5)
  store volatile i32 0, i32 addrspace(5)* %alloca
@ -15,8 +29,11 @@ define amdgpu_kernel void @scratch_buffer_known_high_masklo16() #0 {

 ; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo17:
 ; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4
-; GCN-NOT: [[FI]]
-; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FI]]
+; WAVE64-NOT: [[FI]]
+; WAVE64: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FI]]
+
+; WAVE32: v_and_b32_e32 [[MASKED:v[0-9]+]], 0x1fffc, [[FI]]
+; WAVE32: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]]
 define amdgpu_kernel void @scratch_buffer_known_high_masklo17() #0 {
  %alloca = alloca i32, align 4, addrspace(5)
  store volatile i32 0, i32 addrspace(5)* %alloca
@ -29,7 +46,7 @@ define amdgpu_kernel void @scratch_buffer_known_high_masklo17() #0 {
 ; GCN-LABEL: {{^}}scratch_buffer_known_high_mask18:
 ; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4
 ; GCN-NOT: [[FI]]
-; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FI]]
+; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FI]]
 define amdgpu_kernel void @scratch_buffer_known_high_mask18() #0 {
  %alloca = alloca i32, align 4, addrspace(5)
  store volatile i32 0, i32 addrspace(5)* %alloca
--- a/test/CodeGen/AMDGPU/insert-skip-from-vcc.mir
+++ b/test/CodeGen/AMDGPU/insert-skip-from-vcc.mir
@ -1,4 +1,5 @@
 # RUN: llc -march=amdgcn -mcpu=fiji -run-pass si-insert-skips -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass si-insert-skips -verify-machineinstrs -o - %s | FileCheck -check-prefix=W32 %s

 ---
 # GCN-LABEL: name: and_execz_mov_vccz
@ -318,3 +319,22 @@ body:             |
    S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
    S_ENDPGM 0, implicit $scc
 ...
+---
+# W32-LABEL: name: and_execz_mov_vccz_w32
+# W32-NOT: S_MOV_
+# W32-NOT: S_AND_
+# W32: S_CBRANCH_EXECZ %bb.1, implicit $exec
+name:            and_execz_mov_vccz_w32
+body:             |
+  bb.0:
+    S_NOP 0
+
+  bb.1:
+    S_NOP 0
+
+  bb.2:
+    $sgpr0 = S_MOV_B32 -1
+    $vcc_lo = S_AND_B32 $exec_lo, killed $sgpr0, implicit-def dead $scc
+    S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
+    S_ENDPGM 0
+...
--- a/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll
+++ b/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll
@ -1,5 +1,6 @@
-; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck --check-prefix=SI --check-prefix=ALL %s
-; RUN: opt -S -mcpu=tonga -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck --check-prefix=CI --check-prefix=ALL %s
+; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck --check-prefixes=SI,SICI,ALL %s
+; RUN: opt -S -mcpu=tonga -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck --check-prefixes=CI,SICI,ALL %s
+; RUN: opt -S -mcpu=gfx1010 -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck --check-prefixes=GFX10,ALL %s

 ; SI-NOT: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] undef, align 4
 ; CI: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] undef, align 4
@ -46,7 +47,8 @@ entry:
  ret void
 }

-; ALL: @promote_alloca_size_1600.stack = internal unnamed_addr addrspace(3) global [1600 x [5 x i32]] undef, align 4
+; SICI: @promote_alloca_size_1600.stack = internal unnamed_addr addrspace(3) global [1600 x [5 x i32]] undef, align 4
+; GFX10: alloca [5 x i32]

 define amdgpu_kernel void @promote_alloca_size_1600(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #2 {
 entry:
@ -141,7 +143,9 @@ entry:
 }

 ; ALL-LABEL: @occupancy_6_over(
-; ALL: alloca [43 x i8]
+; SICI: alloca [43 x i8]
+; GFX10-NOT: alloca
+
 define amdgpu_kernel void @occupancy_6_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 {
 entry:
  %stack = alloca [43 x i8], align 4
@ -191,7 +195,9 @@ entry:
 }

 ; ALL-LABEL: @occupancy_8_over(
-; ALL: alloca [33 x i8]
+; SICI: alloca [33 x i8]
+; GFX10-NOT: alloca
+
 define amdgpu_kernel void @occupancy_8_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 {
 entry:
  %stack = alloca [33 x i8], align 4
@ -241,7 +247,9 @@ entry:
 }

 ; ALL-LABEL: @occupancy_9_over(
-; ALL: alloca [29 x i8]
+; SICI: alloca [29 x i8]
+; GFX10-NOT: alloca
+
 define amdgpu_kernel void @occupancy_9_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 {
 entry:
  %stack = alloca [29 x i8], align 4
--- a/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir
+++ b/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir
@ -1,6 +1,7 @@
 # RUN: llc -march=amdgcn -mcpu=gfx700 -verify-machineinstrs -verify-machine-dom-info --run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s --check-prefixes=W64,ADDR64
 # RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -verify-machine-dom-info --run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s --check-prefixes=W64,W64-NO-ADDR64
 # RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info --run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s --check-prefixes=W64,W64-NO-ADDR64
+# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info --run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s --check-prefixes=W32

 # Test that we correctly legalize VGPR Rsrc operands in MUBUF instructions.
 #
--- a/test/CodeGen/AMDGPU/or3.ll
+++ b/test/CodeGen/AMDGPU/or3.ll
@ -22,6 +22,7 @@ define amdgpu_ps float @or3(i32 %a, i32 %b, i32 %c) {
 ; GFX10-LABEL: or3:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_or3_b32 v0, v0, v1, v2
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    ; return to shader part epilog
  %x = or i32 %a, %b
  %result = or i32 %x, %c
@ -47,6 +48,7 @@ define amdgpu_ps float @or3_vgpr_a(i32 %a, i32 inreg %b, i32 inreg %c) {
 ; GFX10-LABEL: or3_vgpr_a:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_or3_b32 v0, v0, s2, s3
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    ; return to shader part epilog
  %x = or i32 %a, %b
  %result = or i32 %x, %c
@ -69,6 +71,7 @@ define amdgpu_ps float @or3_vgpr_all2(i32 %a, i32 %b, i32 %c) {
 ; GFX10-LABEL: or3_vgpr_all2:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_or3_b32 v0, v1, v2, v0
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    ; return to shader part epilog
  %x = or i32 %b, %c
  %result = or i32 %a, %x
@ -91,6 +94,7 @@ define amdgpu_ps float @or3_vgpr_bc(i32 inreg %a, i32 %b, i32 %c) {
 ; GFX10-LABEL: or3_vgpr_bc:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_or3_b32 v0, s2, v0, v1
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    ; return to shader part epilog
  %x = or i32 %a, %b
  %result = or i32 %x, %c
@ -113,6 +117,7 @@ define amdgpu_ps float @or3_vgpr_const(i32 %a, i32 %b) {
 ; GFX10-LABEL: or3_vgpr_const:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_or3_b32 v0, v1, v0, 64
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    ; return to shader part epilog
  %x = or i32 64, %b
  %result = or i32 %x, %a
--- a/test/CodeGen/AMDGPU/regbank-reassign.mir
+++ b/test/CodeGen/AMDGPU/regbank-reassign.mir
@ -49,6 +49,24 @@ body: |
    S_ENDPGM 0
 ...

+# GCN-LABEL: s11_vs_vcc{{$}}
+# GCN: $vgpr0, $vcc_lo = V_ADDC_U32_e64 killed $sgpr14, killed $vgpr0, killed $vcc_lo, 0
+---
+name:            s11_vs_vcc
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: sgpr_32, preferred-register: '$sgpr11' }
+  - { id: 1, class: vgpr_32 }
+  - { id: 2, class: vgpr_32 }
+body: |
+  bb.0:
+    %0 = IMPLICIT_DEF
+    %1 = IMPLICIT_DEF
+    $vcc_lo = IMPLICIT_DEF
+    %2, $vcc_lo = V_ADDC_U32_e64 killed %0, killed %1, killed $vcc_lo, 0, implicit $exec
+    S_ENDPGM 0
+...
+
 # GCN-LABEL: s0_vs_s16{{$}}
 # GCN: S_AND_B32 killed renamable $sgpr14, $sgpr0,
 ---
--- a/test/CodeGen/AMDGPU/shl_add.ll
+++ b/test/CodeGen/AMDGPU/shl_add.ll
@ -22,6 +22,7 @@ define amdgpu_ps float @shl_add(i32 %a, i32 %b, i32 %c) {
 ; GFX10-LABEL: shl_add:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_lshl_add_u32 v0, v0, v1, v2
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    ; return to shader part epilog
  %x = shl i32 %a, %b
  %result = add i32 %x, %c
@ -46,6 +47,7 @@ define amdgpu_ps float @shl_add_vgpr_a(i32 %a, i32 inreg %b, i32 inreg %c) {
 ; GFX10-LABEL: shl_add_vgpr_a:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_lshl_add_u32 v0, v0, s2, s3
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    ; return to shader part epilog
  %x = shl i32 %a, %b
  %result = add i32 %x, %c
@ -68,6 +70,7 @@ define amdgpu_ps float @shl_add_vgpr_all(i32 %a, i32 %b, i32 %c) {
 ; GFX10-LABEL: shl_add_vgpr_all:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_lshl_add_u32 v0, v0, v1, v2
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    ; return to shader part epilog
  %x = shl i32 %a, %b
  %result = add i32 %x, %c
@ -90,6 +93,7 @@ define amdgpu_ps float @shl_add_vgpr_ab(i32 %a, i32 %b, i32 inreg %c) {
 ; GFX10-LABEL: shl_add_vgpr_ab:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_lshl_add_u32 v0, v0, v1, s2
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    ; return to shader part epilog
  %x = shl i32 %a, %b
  %result = add i32 %x, %c
@ -112,6 +116,7 @@ define amdgpu_ps float @shl_add_vgpr_const(i32 %a, i32 %b) {
 ; GFX10-LABEL: shl_add_vgpr_const:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 3, v1
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    ; return to shader part epilog
  %x = shl i32 %a, 3
  %result = add i32 %x, %b
--- a/test/CodeGen/AMDGPU/shl_or.ll
+++ b/test/CodeGen/AMDGPU/shl_or.ll
@ -22,6 +22,7 @@ define amdgpu_ps float @shl_or(i32 %a, i32 %b, i32 %c) {
 ; GFX10-LABEL: shl_or:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v0, v1, v2
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    ; return to shader part epilog
  %x = shl i32 %a, %b
  %result = or i32 %x, %c
@ -45,6 +46,7 @@ define amdgpu_ps float @shl_or_vgpr_c(i32 inreg %a, i32 inreg %b, i32 %c) {
 ; GFX10-LABEL: shl_or_vgpr_c:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_lshl_or_b32 v0, s2, s3, v0
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    ; return to shader part epilog
  %x = shl i32 %a, %b
  %result = or i32 %x, %c
@ -67,6 +69,7 @@ define amdgpu_ps float @shl_or_vgpr_all2(i32 %a, i32 %b, i32 %c) {
 ; GFX10-LABEL: shl_or_vgpr_all2:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v0, v1, v2
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    ; return to shader part epilog
  %x = shl i32 %a, %b
  %result = or i32 %c, %x
@ -89,6 +92,7 @@ define amdgpu_ps float @shl_or_vgpr_ac(i32 %a, i32 inreg %b, i32 %c) {
 ; GFX10-LABEL: shl_or_vgpr_ac:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v0, s2, v1
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    ; return to shader part epilog
  %x = shl i32 %a, %b
  %result = or i32 %x, %c
@ -111,6 +115,7 @@ define amdgpu_ps float @shl_or_vgpr_const(i32 %a, i32 %b) {
 ; GFX10-LABEL: shl_or_vgpr_const:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v0, v1, 6
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    ; return to shader part epilog
  %x = shl i32 %a, %b
  %result = or i32 %x, 6
@ -133,6 +138,7 @@ define amdgpu_ps float @shl_or_vgpr_const2(i32 %a, i32 %b) {
 ; GFX10-LABEL: shl_or_vgpr_const2:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v0, 6, v1
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    ; return to shader part epilog
  %x = shl i32 %a, 6
  %result = or i32 %x, %b
@ -155,6 +161,7 @@ define amdgpu_ps float @shl_or_vgpr_const_scalar1(i32 inreg %a, i32 %b) {
 ; GFX10-LABEL: shl_or_vgpr_const_scalar1:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_lshl_or_b32 v0, s2, 6, v0
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    ; return to shader part epilog
  %x = shl i32 %a, 6
  %result = or i32 %x, %b
@ -177,6 +184,7 @@ define amdgpu_ps float @shl_or_vgpr_const_scalar2(i32 %a, i32 inreg %b) {
 ; GFX10-LABEL: shl_or_vgpr_const_scalar2:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v0, 6, s2
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    ; return to shader part epilog
  %x = shl i32 %a, 6
  %result = or i32 %x, %b
--- a/test/CodeGen/AMDGPU/smrd.ll
+++ b/test/CodeGen/AMDGPU/smrd.ll
@ -92,6 +92,7 @@ entry:
 ; GCN-DAG: s_mov_b32 s1, 1
 ; GCN-DAG: s_mov_b32 s0, 0
 ; SI-NEXT: nop 3
+; GFX10-NEXT: ; implicit-def: $vcc_hi
 ; GCN-NEXT: s_buffer_load_dword s0, s[0:3], 0x0
 define amdgpu_ps float @smrd_hazard(<4 x i32> inreg %desc) #0 {
 main_body:
--- a/test/CodeGen/AMDGPU/sub_i1.ll
+++ b/test/CodeGen/AMDGPU/sub_i1.ll
@ -1,8 +1,10 @@
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,WAVE64 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,WAVE32 %s


 ; GCN-LABEL: {{^}}sub_var_var_i1:
-; GCN: s_xor_b64
+; WAVE32: s_xor_b32
+; WAVE64: s_xor_b64
 define amdgpu_kernel void @sub_var_var_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in0, i1 addrspace(1)* %in1) {
  %a = load volatile i1, i1 addrspace(1)* %in0
  %b = load volatile i1, i1 addrspace(1)* %in1
@ -12,7 +14,8 @@ define amdgpu_kernel void @sub_var_var_i1(i1 addrspace(1)* %out, i1 addrspace(1)
 }

 ; GCN-LABEL: {{^}}sub_var_imm_i1:
-; GCN: s_not_b64
+; WAVE32: s_not_b32
+; WAVE64: s_not_b64
 define amdgpu_kernel void @sub_var_imm_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) {
  %a = load volatile i1, i1 addrspace(1)* %in
  %sub = sub i1 %a, 1
@ -22,7 +25,8 @@ define amdgpu_kernel void @sub_var_imm_i1(i1 addrspace(1)* %out, i1 addrspace(1)

 ; GCN-LABEL: {{^}}sub_i1_cf:
 ; GCN: ; %endif
-; GCN: s_not_b64
+; WAVE32: s_not_b32
+; WAVE64: s_not_b64
 define amdgpu_kernel void @sub_i1_cf(i1 addrspace(1)* %out, i1 addrspace(1)* %a, i1 addrspace(1)* %b) {
 entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
--- a/test/CodeGen/AMDGPU/wave32.ll
+++ b/test/CodeGen/AMDGPU/wave32.ll
--- a/test/CodeGen/AMDGPU/xor3.ll
+++ b/test/CodeGen/AMDGPU/xor3.ll
@ -16,6 +16,7 @@ define amdgpu_ps float @xor3(i32 %a, i32 %b, i32 %c) {
 ; GFX10-LABEL: xor3:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_xor3_b32 v0, v0, v1, v2
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    ; return to shader part epilog
  %x = xor i32 %a, %b
  %result = xor i32 %x, %c
@ -33,6 +34,7 @@ define amdgpu_ps float @xor3_vgpr_b(i32 inreg %a, i32 %b, i32 inreg %c) {
 ; GFX10-LABEL: xor3_vgpr_b:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_xor3_b32 v0, s2, v0, s3
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    ; return to shader part epilog
  %x = xor i32 %a, %b
  %result = xor i32 %x, %c
@ -50,6 +52,7 @@ define amdgpu_ps float @xor3_vgpr_all2(i32 %a, i32 %b, i32 %c) {
 ; GFX10-LABEL: xor3_vgpr_all2:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_xor3_b32 v0, v1, v2, v0
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    ; return to shader part epilog
  %x = xor i32 %b, %c
  %result = xor i32 %a, %x
@ -67,6 +70,7 @@ define amdgpu_ps float @xor3_vgpr_bc(i32 inreg %a, i32 %b, i32 %c) {
 ; GFX10-LABEL: xor3_vgpr_bc:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_xor3_b32 v0, s2, v0, v1
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    ; return to shader part epilog
  %x = xor i32 %a, %b
  %result = xor i32 %x, %c
@ -84,6 +88,7 @@ define amdgpu_ps float @xor3_vgpr_const(i32 %a, i32 %b) {
 ; GFX10-LABEL: xor3_vgpr_const:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_xor3_b32 v0, v0, v1, 16
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    ; return to shader part epilog
  %x = xor i32 %a, %b
  %result = xor i32 %x, 16
@ -102,6 +107,7 @@ define amdgpu_ps <2 x float> @xor3_multiuse_outer(i32 %a, i32 %b, i32 %c, i32 %x
 ; GFX10-LABEL: xor3_multiuse_outer:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_xor3_b32 v0, v0, v1, v2
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    v_mul_lo_u32 v1, v0, v3
 ; GFX10-NEXT:    ; return to shader part epilog
  %inner = xor i32 %a, %b
@ -123,6 +129,7 @@ define amdgpu_ps <2 x float> @xor3_multiuse_inner(i32 %a, i32 %b, i32 %c) {
 ; GFX10-LABEL: xor3_multiuse_inner:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    v_xor_b32_e32 v1, v0, v2
 ; GFX10-NEXT:    ; return to shader part epilog
  %inner = xor i32 %a, %b
@ -151,6 +158,7 @@ define amdgpu_ps float @xor3_uniform_vgpr(float inreg %a, float inreg %b, float
 ; GFX10-NEXT:    v_add_f32_e64 v1, s3, 2.0
 ; GFX10-NEXT:    v_add_f32_e64 v2, s2, 1.0
 ; GFX10-NEXT:    v_add_f32_e64 v0, 0x40400000, s4
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    v_xor_b32_e32 v1, v2, v1
 ; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX10-NEXT:    ; return to shader part epilog
--- a/test/CodeGen/AMDGPU/xor_add.ll
+++ b/test/CodeGen/AMDGPU/xor_add.ll
@ -22,6 +22,7 @@ define amdgpu_ps float @xor_add(i32 %a, i32 %b, i32 %c) {
 ; GFX10-LABEL: xor_add:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_xad_u32 v0, v0, v1, v2
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    ; return to shader part epilog
  %x = xor i32 %a, %b
  %result = add i32 %x, %c
@ -46,6 +47,7 @@ define amdgpu_ps float @xor_add_vgpr_a(i32 %a, i32 inreg %b, i32 inreg %c) {
 ; GFX10-LABEL: xor_add_vgpr_a:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_xad_u32 v0, v0, s2, s3
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    ; return to shader part epilog
  %x = xor i32 %a, %b
  %result = add i32 %x, %c
@ -68,6 +70,7 @@ define amdgpu_ps float @xor_add_vgpr_all(i32 %a, i32 %b, i32 %c) {
 ; GFX10-LABEL: xor_add_vgpr_all:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_xad_u32 v0, v0, v1, v2
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    ; return to shader part epilog
  %x = xor i32 %a, %b
  %result = add i32 %x, %c
@ -90,6 +93,7 @@ define amdgpu_ps float @xor_add_vgpr_ab(i32 %a, i32 %b, i32 inreg %c) {
 ; GFX10-LABEL: xor_add_vgpr_ab:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_xad_u32 v0, v0, v1, s2
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    ; return to shader part epilog
  %x = xor i32 %a, %b
  %result = add i32 %x, %c
@ -112,6 +116,7 @@ define amdgpu_ps float @xor_add_vgpr_const(i32 %a, i32 %b) {
 ; GFX10-LABEL: xor_add_vgpr_const:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_xad_u32 v0, v0, 3, v1
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    ; return to shader part epilog
  %x = xor i32 %a, 3
  %result = add i32 %x, %b
--- a/test/MC/AMDGPU/gfx10-constant-bus.s
+++ b/test/MC/AMDGPU/gfx10-constant-bus.s
@ -33,3 +33,13 @@ v_div_fmas_f64 v[5:6], v[1:2], s[2:3], v[3:4]

 v_div_fmas_f64 v[5:6], v[1:2], s[2:3], 0x123456
 // GFX10-ERR: error: invalid operand (violates constant bus restrictions)
+
+//-----------------------------------------------------------------------------------------
+// v_mad_u64_u32 has operands of different sizes.
+// When these operands are literals, they are counted as 2 scalar values even if the literals are identical.
+
+v_mad_u64_u32 v[5:6], s12, v1, 0x12345678, 0x12345678
+// GFX10: v_mad_u64_u32 v[5:6], s12, v1, 0x12345678, 0x12345678 ; encoding: [0x05,0x0c,0x76,0xd5,0x01,0xff,0xfd,0x03,0x78,0x56,0x34,0x12]
+
+v_mad_u64_u32 v[5:6], s12, s1, 0x12345678, 0x12345678
+// GFX10-ERR: error: invalid operand (violates constant bus restrictions)
--- a/test/MC/AMDGPU/wave32.s
+++ b/test/MC/AMDGPU/wave32.s
@ -0,0 +1,412 @@
+// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck -check-prefix=GFX1032 %s
+// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck -check-prefix=GFX1064 %s
+// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s 2>&1 | FileCheck -check-prefix=GFX1032-ERR %s
+// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s 2>&1 | FileCheck -check-prefix=GFX1064-ERR %s
+
+v_cmp_ge_i32_e32 s0, v0
+// GFX1032: v_cmp_ge_i32_e32 vcc_lo, s0, v0 ; encoding: [0x00,0x00,0x0c,0x7d]
+// GFX1064: v_cmp_ge_i32_e32 vcc, s0, v0 ; encoding: [0x00,0x00,0x0c,0x7d]
+
+v_cmp_ge_i32_e32 vcc_lo, s0, v1
+// GFX1032: v_cmp_ge_i32_e32 vcc_lo, s0, v1 ; encoding: [0x00,0x02,0x0c,0x7d]
+// GFX1064-ERR: error: instruction not supported on this GPU
+
+v_cmp_ge_i32_e32 vcc, s0, v2
+// GFX1032-ERR: error: instruction not supported on this GPU
+// GFX1064: v_cmp_ge_i32_e32 vcc, s0, v2 ; encoding: [0x00,0x04,0x0c,0x7d]
+
+v_cmp_le_f16_sdwa s0, v3, v4 src0_sel:WORD_1 src1_sel:DWORD
+// GFX1032: v_cmp_le_f16_sdwa s0, v3, v4 src0_sel:WORD_1 src1_sel:DWORD ; encoding: [0xf9,0x08,0x96,0x7d,0x03,0x80,0x05,0x06]
+// GFX1064-ERR: error: invalid operand for instruction
+
+v_cmp_le_f16_sdwa s[0:1], v3, v4 src0_sel:WORD_1 src1_sel:DWORD
+// GFX1032-ERR: error: invalid operand for instruction
+// GFX1064: v_cmp_le_f16_sdwa s[0:1], v3, v4 src0_sel:WORD_1 src1_sel:DWORD ; encoding: [0xf9,0x08,0x96,0x7d,0x03,0x80,0x05,0x06]
+
+v_cmp_class_f32_e32 vcc_lo, s0, v0
+// GFX1032: v_cmp_class_f32_e32 vcc_lo, s0, v0 ; encoding: [0x00,0x00,0x10,0x7d]
+// GFX1064-ERR: error: instruction not supported on this GPU
+
+v_cmp_class_f32_e32 vcc, s0, v0
+// GFX1032-ERR: error: instruction not supported on this GPU
+// GFX1064: v_cmp_class_f32_e32 vcc, s0, v0 ; encoding: [0x00,0x00,0x10,0x7d]
+
+// TODO-GFX10: The following encoding does not match SP3's encoding, which is:
+//  [0xf9,0x04,0x1e,0x7d,0x01,0x06,0x06,0x06]
+v_cmp_class_f16_sdwa vcc_lo, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// GFX1032: v_cmp_class_f16_sdwa vcc_lo, v1, v2 src0_sel:DWORD src1_sel:DWORD ; encoding: [0xf9,0x04,0x1e,0x7d,0x01,0x00,0x06,0x06]
+// GFX1064-ERR: error: invalid operand for instruction
+
+// TODO-GFX10: The following encoding does not match SP3's encoding, which is:
+//  [0xf9,0x04,0x1e,0x7d,0x01,0x06,0x06,0x06]
+v_cmp_class_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// GFX1032-ERR: error: instruction not supported on this GPU
+// GFX1064: v_cmp_class_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD ; encoding: [0xf9,0x04,0x1e,0x7d,0x01,0x00,0x06,0x06]
+
+v_cmp_class_f16_sdwa s0, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// GFX1032: v_cmp_class_f16_sdwa s0, v1, v2 src0_sel:DWORD src1_sel:DWORD ; encoding: [0xf9,0x04,0x1e,0x7d,0x01,0x80,0x06,0x06]
+// GFX1064-ERR: error: invalid operand for instruction
+
+v_cmp_class_f16_sdwa s[0:1], v1, v2 src0_sel:DWORD src1_sel:DWORD
+// GFX1032-ERR: error: invalid operand for instruction
+// GFX1064: v_cmp_class_f16_sdwa s[0:1], v1, v2 src0_sel:DWORD src1_sel:DWORD ; encoding: [0xf9,0x04,0x1e,0x7d,0x01,0x80,0x06,0x06]
+
+v_cndmask_b32_e32 v1, v2, v3,
+// GFX1032: v_cndmask_b32_e32 v1, v2, v3, vcc_lo ; encoding: [0x02,0x07,0x02,0x02]
+// GFX1064: v_cndmask_b32_e32 v1, v2, v3, vcc ; encoding: [0x02,0x07,0x02,0x02]
+
+v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+// GFX1032: v_cndmask_b32_e32 v1, v2, v3, vcc_lo ; encoding: [0x02,0x07,0x02,0x02]
+// GFX1064-ERR: error: instruction not supported on this GPU
+
+v_cndmask_b32_e32 v1, v2, v3, vcc
+// GFX1032-ERR: error: instruction not supported on this GPU
+// GFX1064: v_cndmask_b32_e32 v1, v2, v3, vcc ; encoding: [0x02,0x07,0x02,0x02]
+
+v_add_co_u32_e32 v2, vcc_lo, s0, v2
+// GFX1032-ERR: error: instruction not supported on this GPU
+// GFX1064-ERR: error: instruction not supported on this GPU
+
+v_add_co_u32_e32 v2, vcc, s0, v2
+// GFX1032-ERR: error: instruction not supported on this GPU
+// GFX1064-ERR: error: instruction not supported on this GPU
+
+v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo
+// GFX1032: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo ; encoding: [0x03,0x09,0x06,0x50]
+// GFX1064-ERR: error: instruction not supported on this GPU
+
+v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc
+// GFX1032-ERR: error: instruction not supported on this GPU
+// GFX1064: v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc ; encoding: [0x03,0x09,0x06,0x50]
+
+v_add_co_ci_u32_e32 v3, v3, v4
+// GFX1032: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo ; encoding: [0x03,0x09,0x06,0x50]
+// GFX1064: v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc ; encoding: [0x03,0x09,0x06,0x50]
+
+v_sub_co_u32_e32 v2, vcc_lo, s0, v2
+// GFX1032-ERR: error: instruction not supported on this GPU
+// GFX1064-ERR: error: instruction not supported on this GPU
+
+v_sub_co_u32_e32 v2, vcc, s0, v2
+// GFX1032-ERR: error: instruction not supported on this GPU
+// GFX1064-ERR: error: instruction not supported on this GPU
+
+v_subrev_co_u32_e32 v2, vcc_lo, s0, v2
+// GFX1032-ERR: error: instruction not supported on this GPU
+// GFX1064-ERR: error: instruction not supported on this GPU
+
+v_subrev_co_u32_e32 v2, vcc, s0, v2
+// GFX1032-ERR: error: instruction not supported on this GPU
+// GFX1064-ERR: error: instruction not supported on this GPU
+
+v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo
+// GFX1032: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo ; encoding: [0x03,0x09,0x06,0x52]
+// GFX1064-ERR: error: instruction not supported on this GPU
+
+v_sub_co_ci_u32_e32 v3, vcc, v3, v4, vcc
+// GFX1032-ERR: error: instruction not supported on this GPU
+// GFX1064: v_sub_co_ci_u32_e32 v3, vcc, v3, v4, vcc ; encoding: [0x03,0x09,0x06,0x52]
+
+v_sub_co_ci_u32_e32 v3, v3, v4
+// GFX1032: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo ; encoding: [0x03,0x09,0x06,0x52]
+// GFX1064: v_sub_co_ci_u32_e32 v3, vcc, v3, v4, vcc ; encoding: [0x03,0x09,0x06,0x52]
+
+v_subrev_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+// GFX1032: v_subrev_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; encoding: [0x80,0x02,0x02,0x54]
+// GFX1064-ERR: error: instruction not supported on this GPU
+
+v_subrev_co_ci_u32_e32 v1, vcc, 0, v1, vcc
+// GFX1032-ERR: error: instruction not supported on this GPU
+// GFX1064: v_subrev_co_ci_u32_e32 v1, vcc, 0, v1, vcc ; encoding: [0x80,0x02,0x02,0x54]
+
+v_subrev_co_ci_u32_e32 v1, 0, v1
+// GFX1032: v_subrev_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; encoding: [0x80,0x02,0x02,0x54]
+// GFX1064: v_subrev_co_ci_u32_e32 v1, vcc, 0, v1, vcc ; encoding: [0x80,0x02,0x02,0x54]
+
+v_add_co_u32_sdwa v0, vcc_lo, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+// GFX1032-ERR: error: invalid operand
+// GFX1064-ERR: error: invalid operand
+
+v_add_co_u32_sdwa v0, vcc, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+// GFX1032-ERR: error: instruction not supported
+// GFX1064-ERR: error: instruction not supported
+
+v_add_co_u32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+// GFX1032-ERR: error: not a valid operand
+// GFX1064-ERR: error: not a valid operand
+
+v_add_co_ci_u32_sdwa v1, vcc_lo, v1, v4, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+// GFX1032: v_add_co_ci_u32_sdwa v1, vcc_lo, v1, v4, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; encoding: [0xf9,0x08,0x02,0x50,0x01,0x06,0x00,0x06]
+// GFX1064-ERR: error: instruction not supported on this GPU
+
+v_add_co_ci_u32_sdwa v1, vcc, v1, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+// GFX1032-ERR: error: instruction not supported on this GPU
+// GFX1064: v_add_co_ci_u32_sdwa v1, vcc, v1, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; encoding: [0xf9,0x08,0x02,0x50,0x01,0x06,0x00,0x06]
+
+v_add_co_ci_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+// GFX1032: v_add_co_ci_u32_sdwa v1, vcc_lo, v1, v4, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; encoding: [0xf9,0x08,0x02,0x50,0x01,0x06,0x00,0x06]
+// GFX1064: v_add_co_ci_u32_sdwa v1, vcc, v1, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; encoding: [0xf9,0x08,0x02,0x50,0x01,0x06,0x00,0x06]
+
+v_sub_co_u32_sdwa v0, vcc_lo, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+// GFX1032-ERR: error: invalid operand
+// GFX1064-ERR: error: invalid operand
+
+v_sub_co_u32_sdwa v0, vcc, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+// GFX1032-ERR: error: instruction not supported
+// GFX1064-ERR: error: instruction not supported
+
+v_sub_co_u32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+// GFX1032-ERR: error: not a valid operand
+// GFX1064-ERR: error: not a valid operand
+
+v_subrev_co_u32_sdwa v0, vcc_lo, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+// GFX1032-ERR: error: invalid operand
+// GFX1064-ERR: error: invalid operand
+
+v_subrev_co_u32_sdwa v0, vcc, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+// GFX1032-ERR: error: instruction not supported
+// GFX1064-ERR: error: instruction not supported
+
+v_subrev_co_u32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+// GFX1032-ERR: error: not a valid operand
+// GFX1064-ERR: error: not a valid operand
+
+v_sub_co_ci_u32_sdwa v1, vcc_lo, v1, v4, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+// GFX1032: v_sub_co_ci_u32_sdwa v1, vcc_lo, v1, v4, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; encoding: [0xf9,0x08,0x02,0x52,0x01,0x06,0x00,0x06]
+// GFX1064-ERR: error: instruction not supported on this GPU
+
+v_sub_co_ci_u32_sdwa v1, vcc, v1, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+// GFX1032-ERR: error: instruction not supported on this GPU
+// GFX1064: v_sub_co_ci_u32_sdwa v1, vcc, v1, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; encoding: [0xf9,0x08,0x02,0x52,0x01,0x06,0x00,0x06]
+
+v_sub_co_ci_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+// GFX1032: v_sub_co_ci_u32_sdwa v1, vcc_lo, v1, v4, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; encoding: [0xf9,0x08,0x02,0x52,0x01,0x06,0x00,0x06]
+// GFX1064: v_sub_co_ci_u32_sdwa v1, vcc, v1, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; encoding: [0xf9,0x08,0x02,0x52,0x01,0x06,0x00,0x06]
+
+v_subrev_co_ci_u32_sdwa v1, vcc_lo, v1, v4, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+// GFX1032: v_subrev_co_ci_u32_sdwa v1, vcc_lo, v1, v4, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; encoding: [0xf9,0x08,0x02,0x54,0x01,0x06,0x00,0x06]
+// GFX1064-ERR: error: instruction not supported on this GPU
+
+v_subrev_co_ci_u32_sdwa v1, vcc, v1, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+// GFX1032-ERR: error: instruction not supported on this GPU
+// GFX1064: v_subrev_co_ci_u32_sdwa v1, vcc, v1, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; encoding: [0xf9,0x08,0x02,0x54,0x01,0x06,0x00,0x06]
+
+v_subrev_co_ci_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+// GFX1032: v_subrev_co_ci_u32_sdwa v1, vcc_lo, v1, v4, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; encoding: [0xf9,0x08,0x02,0x54,0x01,0x06,0x00,0x06]
+// GFX1064: v_subrev_co_ci_u32_sdwa v1, vcc, v1, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; encoding: [0xf9,0x08,0x02,0x54,0x01,0x06,0x00,0x06]
+
+v_add_co_ci_u32 v1, sext(v1), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+// GFX1032: v_add_co_ci_u32_sdwa v1, vcc_lo, sext(v1), sext(v4), vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; encoding: [0xf9,0x08,0x02,0x50,0x01,0x06,0x08,0x0e]
+// GFX1064: v_add_co_ci_u32_sdwa v1, vcc, sext(v1), sext(v4), vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; encoding: [0xf9,0x08,0x02,0x50,0x01,0x06,0x08,0x0e]
+
+v_add_co_ci_u32_sdwa v1, vcc_lo, sext(v1), sext(v4), vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+// GFX1032: v_add_co_ci_u32_sdwa v1, vcc_lo, sext(v1), sext(v4), vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; encoding: [0xf9,0x08,0x02,0x50,0x01,0x06,0x08,0x0e]
+// GFX1064-ERR: error: instruction not supported on this GPU
+
+v_add_co_ci_u32_sdwa v1, vcc, sext(v1), sext(v4), vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+// GFX1032-ERR: error: instruction not supported on this GPU
+// GFX1064: v_add_co_ci_u32_sdwa v1, vcc, sext(v1), sext(v4), vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; encoding: [0xf9,0x08,0x02,0x50,0x01,0x06,0x08,0x0e]
+
+v_add_co_u32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// GFX1032-ERR: error: not a valid operand
+// GFX1064-ERR: error: not a valid operand
+
+v_add_co_u32_dpp v5, vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// GFX1032-ERR: error: not a valid operand
+// GFX1064-ERR: error: not a valid operand
+
+v_add_co_u32_dpp v5, vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// GFX1032-ERR: error: not a valid operand
+// GFX1064-ERR: error: not a valid operand
+
+v_add_co_ci_u32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// GFX1032: v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 ; encoding: [0xfa,0x04,0x0a,0x50,0x01,0xe4,0x00,0x00]
+// GFX1064: v_add_co_ci_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 ; encoding: [0xfa,0x04,0x0a,0x50,0x01,0xe4,0x00,0x00]
+
+v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// GFX1032: v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 ; encoding: [0xfa,0x04,0x0a,0x50,0x01,0xe4,0x00,0x00]
+// GFX1064-ERR: error: instruction not supported on this GPU
+
+v_add_co_ci_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// GFX1032-ERR: error: instruction not supported on this GPU
+// GFX1064: v_add_co_ci_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 ; encoding: [0xfa,0x04,0x0a,0x50,0x01,0xe4,0x00,0x00]
+
+v_sub_co_u32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// GFX1032-ERR: error: not a valid operand
+// GFX1064-ERR: error: not a valid operand
+
+v_sub_co_u32_dpp v5, vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// GFX1032-ERR: error: not a valid operand
+// GFX1064-ERR: error: not a valid operand
+
+v_sub_co_u32_dpp v5, vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// GFX1032-ERR: error: not a valid operand
+// GFX1064-ERR: error: not a valid operand
+
+v_sub_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// GFX1032: v_sub_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 ; encoding: [0xfa,0x04,0x0a,0x52,0x01,0xe4,0x00,0x00]
+// GFX1064-ERR: error: instruction not supported on this GPU
+
+v_sub_co_ci_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// GFX1032-ERR: error: instruction not supported on this GPU
+// GFX1064: v_sub_co_ci_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 ; encoding: [0xfa,0x04,0x0a,0x52,0x01,0xe4,0x00,0x00]
+
+v_subrev_co_u32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// GFX1032-ERR: error: not a valid operand
+// GFX1064-ERR: error: not a valid operand
+
+v_subrev_co_u32_dpp v5, vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// GFX1032-ERR: error: not a valid operand
+// GFX1064-ERR: error: not a valid operand
+
+v_subrev_co_u32_dpp v5, vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// GFX1032-ERR: error: not a valid operand
+// GFX1064-ERR: error: not a valid operand
+
+v_subrev_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// GFX1032: v_subrev_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 ; encoding: [0xfa,0x04,0x0a,0x54,0x01,0xe4,0x00,0x00]
+// GFX1064-ERR: error: instruction not supported on this GPU
+
+v_subrev_co_ci_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// GFX1032-ERR: error: instruction not supported on this GPU
+// GFX1064: v_subrev_co_ci_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 ; encoding: [0xfa,0x04,0x0a,0x54,0x01,0xe4,0x00,0x00]
+
+v_add_co_u32 v0, s0, v0, v2
+// GFX1032: v_add_co_u32_e64 v0, s0, v0, v2 ; encoding: [0x00,0x00,0x0f,0xd7,0x00,0x05,0x02,0x00]
+// GFX1064-ERR: error: invalid operand for instruction
+
+v_add_co_u32_e64 v0, s0, v0, v2
+// GFX1032: v_add_co_u32_e64 v0, s0, v0, v2 ; encoding: [0x00,0x00,0x0f,0xd7,0x00,0x05,0x02,0x00]
+// GFX1064-ERR: error: invalid operand for instruction
+
+v_add_co_ci_u32_e64 v4, s0, v1, v5, s2
+// GFX1032: v_add_co_ci_u32_e64 v4, s0, v1, v5, s2 ; encoding: [0x04,0x00,0x28,0xd5,0x01,0x0b,0x0a,0x00]
+// GFX1064-ERR: error: invalid operand for instruction
+
+v_sub_co_u32 v0, s0, v0, v2
+// GFX1032: v_sub_co_u32_e64 v0, s0, v0, v2 ; encoding: [0x00,0x00,0x10,0xd7,0x00,0x05,0x02,0x00]
+// GFX1064-ERR: error: invalid operand for instruction
+
+v_sub_co_u32_e64 v0, s0, v0, v2
+// GFX1032: v_sub_co_u32_e64 v0, s0, v0, v2 ; encoding: [0x00,0x00,0x10,0xd7,0x00,0x05,0x02,0x00]
+// GFX1064-ERR: error: invalid operand for instruction
+
+v_sub_co_ci_u32_e64 v4, s0, v1, v5, s2
+// GFX1032: v_sub_co_ci_u32_e64 v4, s0, v1, v5, s2 ; encoding: [0x04,0x00,0x29,0xd5,0x01,0x0b,0x0a,0x00]
+// GFX1064-ERR: error: invalid operand for instruction
+
+v_subrev_co_u32 v0, s0, v0, v2
+// GFX1032: v_subrev_co_u32_e64 v0, s0, v0, v2 ; encoding: [0x00,0x00,0x19,0xd7,0x00,0x05,0x02,0x00]
+// GFX1064-ERR: error: invalid operand for instruction
+
+v_subrev_co_u32_e64 v0, s0, v0, v2
+// GFX1032: v_subrev_co_u32_e64 v0, s0, v0, v2 ; encoding: [0x00,0x00,0x19,0xd7,0x00,0x05,0x02,0x00]
+// GFX1064-ERR: error: invalid operand for instruction
+
+v_subrev_co_ci_u32_e64 v4, s0, v1, v5, s2
+// GFX1032: v_subrev_co_ci_u32_e64 v4, s0, v1, v5, s2 ; encoding: [0x04,0x00,0x2a,0xd5,0x01,0x0b,0x0a,0x00]
+// GFX1064-ERR: error: invalid operand for instruction
+
+v_add_co_u32 v0, s[0:1], v0, v2
+// GFX1032-ERR: error: invalid operand for instruction
+// GFX1064: v_add_co_u32_e64 v0, s[0:1], v0, v2 ; encoding: [0x00,0x00,0x0f,0xd7,0x00,0x05,0x02,0x00]
+
+v_add_co_u32_e64 v0, s[0:1], v0, v2
+// GFX1032-ERR: error: invalid operand for instruction
+// GFX1064: v_add_co_u32_e64 v0, s[0:1], v0, v2 ; encoding: [0x00,0x00,0x0f,0xd7,0x00,0x05,0x02,0x00]
+
+v_add_co_ci_u32_e64 v4, s[0:1], v1, v5, s[2:3]
+// GFX1032-ERR: error: invalid operand for instruction
+// GFX1064: v_add_co_ci_u32_e64 v4, s[0:1], v1, v5, s[2:3] ; encoding: [0x04,0x00,0x28,0xd5,0x01,0x0b,0x0a,0x00]
+
+v_sub_co_u32 v0, s[0:1], v0, v2
+// GFX1032-ERR: error: invalid operand for instruction
+// GFX1064: v_sub_co_u32_e64 v0, s[0:1], v0, v2 ; encoding: [0x00,0x00,0x10,0xd7,0x00,0x05,0x02,0x00]
+
+v_sub_co_u32_e64 v0, s[0:1], v0, v2
+// GFX1032-ERR: error: invalid operand for instruction
+// GFX1064: v_sub_co_u32_e64 v0, s[0:1], v0, v2 ; encoding: [0x00,0x00,0x10,0xd7,0x00,0x05,0x02,0x00]
+
+v_sub_co_ci_u32_e64 v4, s[0:1], v1, v5, s[2:3]
+// GFX1032-ERR: error: invalid operand for instruction
+// GFX1064: v_sub_co_ci_u32_e64 v4, s[0:1], v1, v5, s[2:3] ; encoding: [0x04,0x00,0x29,0xd5,0x01,0x0b,0x0a,0x00]
+
+v_subrev_co_u32 v0, s[0:1], v0, v2
+// GFX1032-ERR: error: invalid operand for instruction
+// GFX1064: v_subrev_co_u32_e64 v0, s[0:1], v0, v2 ; encoding: [0x00,0x00,0x19,0xd7,0x00,0x05,0x02,0x00]
+
+v_subrev_co_u32_e64 v0, s[0:1], v0, v2
+// GFX1032-ERR: error: invalid operand for instruction
+// GFX1064: v_subrev_co_u32_e64 v0, s[0:1], v0, v2 ; encoding: [0x00,0x00,0x19,0xd7,0x00,0x05,0x02,0x00]
+
+v_subrev_co_ci_u32_e64 v4, s[0:1], v1, v5, s[2:3]
+// GFX1032-ERR: error: invalid operand for instruction
+// GFX1064: v_subrev_co_ci_u32_e64 v4, s[0:1], v1, v5, s[2:3] ; encoding: [0x04,0x00,0x2a,0xd5,0x01,0x0b,0x0a,0x00]
+
+v_add_co_ci_u32_e64 v4, vcc_lo, v1, v5, s2
+// GFX1032: v_add_co_ci_u32_e64 v4, vcc_lo, v1, v5, s2 ; encoding: [0x04,0x6a,0x28,0xd5,0x01,0x0b,0x0a,0x00]
+// GFX1064-ERR: error: invalid operand for instruction
+
+v_add_co_ci_u32_e64 v4, vcc, v1, v5, s[2:3]
+// GFX1032-ERR: error: invalid operand for instruction
+// GFX1064: v_add_co_ci_u32_e64 v4, vcc, v1, v5, s[2:3] ; encoding: [0x04,0x6a,0x28,0xd5,0x01,0x0b,0x0a,0x00]
+
+v_add_co_ci_u32_e64 v4, s0, v1, v5, vcc_lo
+// GFX1032: v_add_co_ci_u32_e64 v4, s0, v1, v5, vcc_lo ; encoding: [0x04,0x00,0x28,0xd5,0x01,0x0b,0xaa,0x01]
+// GFX1064-ERR: error: invalid operand for instruction
+
+v_add_co_ci_u32_e64 v4, s[0:1], v1, v5, vcc
+// GFX1032-ERR: error: invalid operand for instruction
+// GFX1064: v_add_co_ci_u32_e64 v4, s[0:1], v1, v5, vcc ; encoding: [0x04,0x00,0x28,0xd5,0x01,0x0b,0xaa,0x01]
+
+v_div_scale_f32 v2, s2, v0, v0, v2
+// GFX1032: v_div_scale_f32 v2, s2, v0, v0, v2 ; encoding: [0x02,0x02,0x6d,0xd5,0x00,0x01,0x0a,0x04]
+// GFX1064-ERR: error: invalid operand for instruction
+
+v_div_scale_f32 v2, s[2:3], v0, v0, v2
+// GFX1032-ERR: error: invalid operand for instruction
+// GFX1064: v_div_scale_f32 v2, s[2:3], v0, v0, v2 ; encoding: [0x02,0x02,0x6d,0xd5,0x00,0x01,0x0a,0x04]
+
+v_div_scale_f64 v[2:3], s2, v[0:1], v[0:1], v[2:3]
+// GFX1032: v_div_scale_f64 v[2:3], s2, v[0:1], v[0:1], v[2:3] ; encoding: [0x02,0x02,0x6e,0xd5,0x00,0x01,0x0a,0x04]
+// GFX1064-ERR: error: invalid operand for instruction
+
+v_div_scale_f64 v[2:3], s[2:3], v[0:1], v[0:1], v[2:3]
+// GFX1032-ERR: error: invalid operand for instruction
+// GFX1064: v_div_scale_f64 v[2:3], s[2:3], v[0:1], v[0:1], v[2:3] ; encoding: [0x02,0x02,0x6e,0xd5,0x00,0x01,0x0a,0x04]
+
+v_mad_i64_i32 v[0:1], s6, v0, v1, v[2:3]
+// GFX1032: v_mad_i64_i32 v[0:1], s6, v0, v1, v[2:3] ; encoding: [0x00,0x06,0x77,0xd5,0x00,0x03,0x0a,0x04]
+// GFX1064-ERR: error: invalid operand for instruction
+
+v_mad_i64_i32 v[0:1], s[6:7], v0, v1, v[2:3]
+// GFX1032-ERR: error: invalid operand for instruction
+// GFX1064: v_mad_i64_i32 v[0:1], s[6:7], v0, v1, v[2:3] ; encoding: [0x00,0x06,0x77,0xd5,0x00,0x03,0x0a,0x04]
+
+v_mad_u64_u32 v[0:1], s6, v0, v1, v[2:3]
+// GFX1032: v_mad_u64_u32 v[0:1], s6, v0, v1, v[2:3] ; encoding: [0x00,0x06,0x76,0xd5,0x00,0x03,0x0a,0x04]
+// GFX1064-ERR: error: invalid operand for instruction
+
+v_mad_u64_u32 v[0:1], s[6:7], v0, v1, v[2:3]
+// GFX1032-ERR: error: invalid operand for instruction
+// GFX1064: v_mad_u64_u32 v[0:1], s[6:7], v0, v1, v[2:3] ; encoding: [0x00,0x06,0x76,0xd5,0x00,0x03,0x0a,0x04]
+
+v_cmpx_neq_f32_e32 v0, v1
+// GFX1032: v_cmpx_neq_f32_e32 v0, v1 ; encoding: [0x00,0x03,0x3a,0x7c]
+// GFX1064: v_cmpx_neq_f32_e32 v0, v1 ; encoding: [0x00,0x03,0x3a,0x7c]
+
+v_cmpx_neq_f32_sdwa v0, v1 src0_sel:WORD_1 src1_sel:DWORD
+// GFX1032: v_cmpx_neq_f32_sdwa v0, v1 src0_sel:WORD_1 src1_sel:DWORD ; encoding: [0xf9,0x02,0x3a,0x7c,0x00,0x00,0x05,0x06]
+// GFX1064: v_cmpx_neq_f32_sdwa v0, v1 src0_sel:WORD_1 src1_sel:DWORD ; encoding: [0xf9,0x02,0x3a,0x7c,0x00,0x00,0x05,0x06]
+
+v_cmpx_eq_u32_sdwa v0, 1 src0_sel:WORD_1 src1_sel:DWORD
+// GFX1032: v_cmpx_eq_u32_sdwa v0, 1 src0_sel:WORD_1 src1_sel:DWORD ; encoding: [0xf9,0x02,0xa5,0x7d,0x00,0x00,0x05,0x86]
+// GFX1064: v_cmpx_eq_u32_sdwa v0, 1 src0_sel:WORD_1 src1_sel:DWORD ; encoding: [0xf9,0x02,0xa5,0x7d,0x00,0x00,0x05,0x86]
+
+v_cmpx_class_f32_e64 v0, 1
+// GFX1032: v_cmpx_class_f32_e64 v0, 1 ; encoding: [0x00,0x00,0x98,0xd4,0x00,0x03,0x01,0x00]
+// GFX1064: v_cmpx_class_f32_e64 v0, 1 ; encoding: [0x00,0x00,0x98,0xd4,0x00,0x03,0x01,0x00]
+
+v_cmpx_class_f32_sdwa v0, 1 src0_sel:WORD_1 src1_sel:DWORD
+// GFX1032: v_cmpx_class_f32_sdwa v0, 1 src0_sel:WORD_1 src1_sel:DWORD ; encoding: [0xf9,0x02,0x31,0x7d,0x00,0x00,0x05,0x86]
+// GFX1064: v_cmpx_class_f32_sdwa v0, 1 src0_sel:WORD_1 src1_sel:DWORD ; encoding: [0xf9,0x02,0x31,0x7d,0x00,0x00,0x05,0x86]
--- a/test/MC/Disassembler/AMDGPU/gfx10-sgpr-max.txt
+++ b/test/MC/Disassembler/AMDGPU/gfx10-sgpr-max.txt
@ -8,3 +8,9 @@

 # GFX10: s_mov_b32 s105, s104 ; encoding: [0x68,0x03,0xe9,0xbe]
 0x68,0x03,0xe9,0xbe
+
+# GFX10: v_cmp_eq_f32_e64 s105, v0, s105
+0x69,0x00,0x02,0xd4,0x00,0xd3,0x00,0x00
+
+# GFX10: v_cmp_eq_f32_sdwa s105, v0, s105 src0_sel:DWORD src1_sel:DWORD
+0xf9,0xd2,0x04,0x7c,0x00,0xe9,0x06,0x86
--- a/test/MC/Disassembler/AMDGPU/wave32.txt
+++ b/test/MC/Disassembler/AMDGPU/wave32.txt
@ -0,0 +1,164 @@
+# RUN: llvm-mc -arch=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX1032 %s
+# RUN: llvm-mc -arch=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64,-wavefrontsize32 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX1064 %s
+
+# GFX1032:   v_cmp_lt_f32_e32 vcc_lo, s2, v4
+# GFX1064:   v_cmp_lt_f32_e32 vcc, s2, v4
+0x02,0x08,0x02,0x7c
+
+# GFX1032:   v_cmp_ge_i32_e64 s2, s0, v2
+# GFX1064:   v_cmp_ge_i32_e64 s[2:3], s0, v2
+0x02,0x00,0x86,0xd4,0x00,0x04,0x02,0x00
+
+# GFX1032: v_cmp_ge_i32_sdwa vcc_lo, v0, v2 src0_sel:WORD_1 src1_sel:DWORD
+# GFX1064: v_cmp_ge_i32_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:DWORD
+0xf9,0x04,0x0c,0x7d,0x00,0x00,0x05,0x06
+
+# GFX1032: v_cmp_le_f16_sdwa s0, v3, v4 src0_sel:WORD_1 src1_sel:DWORD
+# GFX1064: v_cmp_le_f16_sdwa s[0:1], v3, v4 src0_sel:WORD_1 src1_sel:DWORD
+0xf9,0x08,0x96,0x7d,0x03,0x80,0x05,0x06
+
+# GFX1032: v_cmp_class_f32_e32 vcc_lo, s0, v0
+# GFX1064: v_cmp_class_f32_e32 vcc, s0, v0
+0x00,0x00,0x10,0x7d
+
+# GFX1032: v_cmp_class_f16_sdwa vcc_lo, v1, v2 src0_sel:DWORD src1_sel:DWORD
+# GFX1064: v_cmp_class_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+0xf9,0x04,0x1e,0x7d,0x01,0x00,0x06,0x06
+
+# GFX1032: v_cmp_class_f16_sdwa s0, v1, v2 src0_sel:DWORD src1_sel:DWORD
+# GFX1064: v_cmp_class_f16_sdwa s[0:1], v1, v2 src0_sel:DWORD src1_sel:DWORD
+0xf9,0x04,0x1e,0x7d,0x01,0x80,0x06,0x06
+
+# GFX1032: v_cndmask_b32_e32 v5, 0, v2, vcc_lo
+# GFX1064: v_cndmask_b32_e32 v5, 0, v2, vcc ;
+0x80,0x04,0x0a,0x02
+
+# GFX1032: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+# GFX1064: v_cndmask_b32_e32 v1, v2, v3, vcc ;
+0x02,0x07,0x02,0x02
+
+# GFX1032: v_add_co_u32_e64 v2, vcc_lo, s0, v2
+# GFX1064: v_add_co_u32_e64 v2, vcc, s0, v2
+0x02,0x6a,0x0f,0xd7,0x00,0x04,0x02,0x00
+
+# GFX1032: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo
+# GFX1064: v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc ;
+0x03,0x09,0x06,0x50
+
+# GFX1032: v_sub_co_u32_e64 v2, vcc_lo, s0, v2
+# GFX1064: v_sub_co_u32_e64 v2, vcc, s0, v2
+0x02,0x6a,0x10,0xd7,0x00,0x04,0x02,0x00
+
+# GFX1032: v_subrev_co_u32_e64 v2, vcc_lo, s0, v2
+# GFX1064: v_subrev_co_u32_e64 v2, vcc, s0, v2
+0x02,0x6a,0x19,0xd7,0x00,0x04,0x02,0x00
+
+# GFX1032: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo
+# GFX1064: v_sub_co_ci_u32_e32 v3, vcc, v3, v4, vcc ;
+0x03,0x09,0x06,0x52
+
+# GFX1032: v_subrev_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+# GFX1064: v_subrev_co_ci_u32_e32 v1, vcc, 0, v1, vcc ;
+0x80,0x02,0x02,0x54
+
+# GFX1032: v_add_co_ci_u32_sdwa v1, vcc_lo, v1, v4, vcc_lo  dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+# GFX1064: v_add_co_ci_u32_sdwa v1, vcc, v1, v4, vcc  dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+0xf9,0x08,0x02,0x50,0x01,0x06,0x00,0x06
+
+# GFX1032: v_sub_co_ci_u32_sdwa v1, vcc_lo, v1, v4, vcc_lo  dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+# GFX1064: v_sub_co_ci_u32_sdwa v1, vcc, v1, v4, vcc  dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+0xf9,0x08,0x02,0x52,0x01,0x06,0x00,0x06
+
+# GFX1032: v_subrev_co_ci_u32_sdwa v1, vcc_lo, v1, v4, vcc_lo  dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+# GFX1064: v_subrev_co_ci_u32_sdwa v1, vcc, v1, v4, vcc  dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+0xf9,0x08,0x02,0x54,0x01,0x06,0x00,0x06
+
+# GFX1032: v_add_co_ci_u32_sdwa v1, vcc_lo, sext(v1), sext(v4), vcc_lo  dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+# GFX1064: v_add_co_ci_u32_sdwa v1, vcc, sext(v1), sext(v4), vcc  dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+0xf9,0x08,0x02,0x50,0x01,0x06,0x08,0x0e
+
+# GFX1032: v_add_nc_u32_dpp v5, v1, v2  quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+# GFX1064: v_add_nc_u32_dpp v5, v1, v2  quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+0xfa,0x04,0x0a,0x4a,0x01,0xe4,0x00,0x00
+
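+# FIXME: Disassembles as the invalid v_subrev_u16_dpp, which apparently has the same encoding but does not exist in GFX10, instead of v_add_co_ci_u32_dpp; the checks below are disabled (lowercase prefixes, commented-out bytes).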
+
+# gfx1032: v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo  quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+# gfx1064: v_add_co_ci_u32_dpp v5, vcc, v1, v2, vcc  quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+# 0xfa,0x04,0x0a,0x50,0x01,0xe4,0x00,0x00
+
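+# FIXME: Disassembles as v_mul_lo_u16_dpp instead of v_sub_co_ci_u32_dpp; the checks below are disabled (lowercase prefixes, commented-out bytes).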
+
+# gfx1032: v_sub_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo  quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+# gfx1064: v_sub_co_ci_u32_dpp v5, vcc, v1, v2, vcc  quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+# 0xfa,0x04,0x0a,0x52,0x01,0xe4,0x00,0x00
+
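+# FIXME: Disassembles as v_lshlrev_b16_dpp instead of v_subrev_co_ci_u32_dpp; the checks below are disabled (lowercase prefixes, commented-out bytes).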
+
+# gfx1032: v_subrev_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo  quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+# gfx1064: v_subrev_co_ci_u32_dpp v5, vcc, v1, v2, vcc  quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+# 0xfa,0x04,0x0a,0x54,0x01,0xe4,0x00,0x00
+
+# GFX1032: v_add_co_u32_e64 v0, s0, v0, v2
+# GFX1064: v_add_co_u32_e64 v0, s[0:1], v0, v2
+0x00,0x00,0x0f,0xd7,0x00,0x05,0x02,0x00
+
+# GFX1032: v_add_co_ci_u32_e64 v4, s0, v1, v5, s2
+# GFX1064: v_add_co_ci_u32_e64 v4, s[0:1], v1, v5, s[2:3]
+0x04,0x00,0x28,0xd5,0x01,0x0b,0x0a,0x00
+
+# GFX1032: v_sub_co_u32_e64 v0, s0, v0, v2
+# GFX1064: v_sub_co_u32_e64 v0, s[0:1], v0, v2
+0x00,0x00,0x10,0xd7,0x00,0x05,0x02,0x00
+
+# GFX1032: v_sub_co_ci_u32_e64 v4, s0, v1, v5, s2
+# GFX1064: v_sub_co_ci_u32_e64 v4, s[0:1], v1, v5, s[2:3]
+0x04,0x00,0x29,0xd5,0x01,0x0b,0x0a,0x00
+
+# GFX1032: v_subrev_co_u32_e64 v0, s0, v0, v2
+# GFX1064: v_subrev_co_u32_e64 v0, s[0:1], v0, v2
+0x00,0x00,0x19,0xd7,0x00,0x05,0x02,0x00
+
+# GFX1032: v_subrev_co_ci_u32_e64 v4, s0, v1, v5, s2
+# GFX1064: v_subrev_co_ci_u32_e64 v4, s[0:1], v1, v5, s[2:3]
+0x04,0x00,0x2a,0xd5,0x01,0x0b,0x0a,0x00
+
+# GFX1032: v_add_co_ci_u32_e64 v4, vcc_lo, v1, v5, s2
+# GFX1064: v_add_co_ci_u32_e64 v4, vcc, v1, v5, s[2:3]
+0x04,0x6a,0x28,0xd5,0x01,0x0b,0x0a,0x00
+
+# GFX1032: v_add_co_ci_u32_e64 v4, s0, v1, v5, vcc_lo
+# GFX1064: v_add_co_ci_u32_e64 v4, s[0:1], v1, v5, vcc ;
+0x04,0x00,0x28,0xd5,0x01,0x0b,0xaa,0x01
+
+# GFX1032: v_div_scale_f32 v2, s2, v0, v0, v2
+# GFX1064: v_div_scale_f32 v2, s[2:3], v0, v0, v2
+0x02,0x02,0x6d,0xd5,0x00,0x01,0x0a,0x04
+
+# GFX1032: v_div_scale_f64 v[2:3], s2, v[0:1], v[0:1], v[2:3]
+# GFX1064: v_div_scale_f64 v[2:3], s[2:3], v[0:1], v[0:1], v[2:3]
+0x02,0x02,0x6e,0xd5,0x00,0x01,0x0a,0x04
+
+# GFX1032: v_mad_i64_i32 v[0:1], s6, v0, v1, v[2:3]
+# GFX1064: v_mad_i64_i32 v[0:1], s[6:7], v0, v1, v[2:3]
+0x00,0x06,0x77,0xd5,0x00,0x03,0x0a,0x04
+
+# GFX1032: v_mad_u64_u32 v[0:1], s6, v0, v1, v[2:3]
+# GFX1064: v_mad_u64_u32 v[0:1], s[6:7], v0, v1, v[2:3]
+0x00,0x06,0x76,0xd5,0x00,0x03,0x0a,0x04
+
+# GFX1032: v_cmpx_neq_f32_e32 v0, v1
+# GFX1064: v_cmpx_neq_f32_e32 v0, v1
+0x00,0x03,0x3a,0x7c
+
+# GFX1032: v_cmpx_neq_f32_sdwa v0, v1 src0_sel:WORD_1 src1_sel:DWORD
+# GFX1064: v_cmpx_neq_f32_sdwa v0, v1 src0_sel:WORD_1 src1_sel:DWORD
+0xf9,0x02,0x3a,0x7c,0x00,0x00,0x05,0x06
+
+# GFX1032: v_cmpx_class_f32_e64 v0, 1
+# GFX1064: v_cmpx_class_f32_e64 v0, 1
+0x00,0x00,0x98,0xd4,0x00,0x03,0x01,0x00
+
+# GFX1032: v_cmpx_class_f32_sdwa v0, 1 src0_sel:WORD_1 src1_sel:DWORD
+# GFX1064: v_cmpx_class_f32_sdwa v0, 1 src0_sel:WORD_1 src1_sel:DWORD
+0xf9,0x02,0x31,0x7d,0x00,0x00,0x05,0x86