From 9eac763e7b8ce079a58ac5972d924af87f2cf915 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Wed, 28 Jun 2017 21:38:50 +0000 Subject: [PATCH] AMDGPU: Remove SITypeRewriter This was an old workaround for using v16i8 in some old intrinsics for resource descriptors. llvm-svn: 306603 --- lib/Target/AMDGPU/AMDGPU.h | 1 - lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 1 - lib/Target/AMDGPU/CMakeLists.txt | 1 - lib/Target/AMDGPU/SITypeRewriter.cpp | 156 ------- test/CodeGen/AMDGPU/bug-vopc-commute.ll | 6 +- test/CodeGen/AMDGPU/llvm.SI.load.dword.ll | 22 +- test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll | 20 +- test/CodeGen/AMDGPU/mubuf.ll | 22 +- test/CodeGen/AMDGPU/ret_jump.ll | 7 +- .../AMDGPU/scheduler-subrange-crash.ll | 12 +- test/CodeGen/AMDGPU/sgpr-copy.ll | 94 ++--- test/CodeGen/AMDGPU/si-lod-bias.ll | 17 +- test/CodeGen/AMDGPU/si-sgpr-spill.ll | 398 +++++++++--------- test/CodeGen/AMDGPU/si-spill-cf.ll | 136 +++--- test/CodeGen/AMDGPU/smrd.ll | 48 +-- test/CodeGen/AMDGPU/split-smrd.ll | 4 +- .../AMDGPU/vgpr-spill-emergency-stack-slot.ll | 20 +- 17 files changed, 401 insertions(+), 564 deletions(-) delete mode 100644 lib/Target/AMDGPU/SITypeRewriter.cpp diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h index 55d18c3f364..5a799b2d88d 100644 --- a/lib/Target/AMDGPU/AMDGPU.h +++ b/lib/Target/AMDGPU/AMDGPU.h @@ -36,7 +36,6 @@ FunctionPass *createR600ControlFlowFinalizer(); FunctionPass *createAMDGPUCFGStructurizerPass(); // SI Passes -FunctionPass *createSITypeRewriter(); FunctionPass *createSIAnnotateControlFlowPass(); FunctionPass *createSIFoldOperandsPass(); FunctionPass *createSIPeepholeSDWAPass(); diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 04fe9f68980..425fd35d47d 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -720,7 +720,6 @@ bool GCNPassConfig::addPreISel() { addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions } addPass(createSinkingPass()); - addPass(createSITypeRewriter()); addPass(createAMDGPUAnnotateUniformValues()); if (!LateCFGStructurize) { addPass(createSIAnnotateControlFlowPass()); diff --git a/lib/Target/AMDGPU/CMakeLists.txt b/lib/Target/AMDGPU/CMakeLists.txt index e30844f082c..917d9cfa690 100644 --- a/lib/Target/AMDGPU/CMakeLists.txt +++ b/lib/Target/AMDGPU/CMakeLists.txt @@ -96,7 +96,6 @@ add_llvm_target(AMDGPUCodeGen SIPeepholeSDWA.cpp SIRegisterInfo.cpp SIShrinkInstructions.cpp - SITypeRewriter.cpp SIWholeQuadMode.cpp GCNIterativeScheduler.cpp GCNMinRegStrategy.cpp diff --git a/lib/Target/AMDGPU/SITypeRewriter.cpp b/lib/Target/AMDGPU/SITypeRewriter.cpp deleted file mode 100644 index aad68537f77..00000000000 --- a/lib/Target/AMDGPU/SITypeRewriter.cpp +++ /dev/null @@ -1,156 +0,0 @@ -//===-- SITypeRewriter.cpp - Remove unwanted types ------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// This pass removes performs the following type substitution on all -/// non-compute shaders: -/// -/// v16i8 => i128 -/// - v16i8 is used for constant memory resource descriptors. This type is -/// legal for some compute APIs, and we don't want to declare it as legal -/// in the backend, because we want the legalizer to expand all v16i8 -/// operations. -/// v1* => * -/// - Having v1* types complicates the legalizer and we can easily replace -/// - them with the element type. -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "Utils/AMDGPUBaseInfo.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/InstVisitor.h" - -using namespace llvm; - -namespace { - -class SITypeRewriter : public FunctionPass, - public InstVisitor { - - static char ID; - Module *Mod; - Type *v16i8; - Type *v4i32; - -public: - SITypeRewriter() : FunctionPass(ID) { } - bool doInitialization(Module &M) override; - bool runOnFunction(Function &F) override; - StringRef getPassName() const override { return "SI Type Rewriter"; } - void visitLoadInst(LoadInst &I); - void visitCallInst(CallInst &I); - void visitBitCast(BitCastInst &I); -}; - -} // End anonymous namespace - -char SITypeRewriter::ID = 0; - -bool SITypeRewriter::doInitialization(Module &M) { - Mod = &M; - v16i8 = VectorType::get(Type::getInt8Ty(M.getContext()), 16); - v4i32 = VectorType::get(Type::getInt32Ty(M.getContext()), 4); - return false; -} - -bool SITypeRewriter::runOnFunction(Function &F) { - if (!AMDGPU::isShader(F.getCallingConv())) - return false; - - visit(F); - visit(F); - - return false; -} - -void SITypeRewriter::visitLoadInst(LoadInst &I) { - Value *Ptr = I.getPointerOperand(); - Type *PtrTy = Ptr->getType(); - Type *ElemTy = PtrTy->getPointerElementType(); - IRBuilder<> Builder(&I); - if (ElemTy == v16i8) { - Value *BitCast = Builder.CreateBitCast(Ptr, - PointerType::get(v4i32,PtrTy->getPointerAddressSpace())); - LoadInst *Load = Builder.CreateLoad(BitCast); - SmallVector, 8> MD; - I.getAllMetadataOtherThanDebugLoc(MD); - for (unsigned i = 0, e = MD.size(); i != e; ++i) { - Load->setMetadata(MD[i].first, MD[i].second); - } - Value *BitCastLoad = Builder.CreateBitCast(Load, I.getType()); - I.replaceAllUsesWith(BitCastLoad); - I.eraseFromParent(); - } -} - -void SITypeRewriter::visitCallInst(CallInst &I) { - IRBuilder<> Builder(&I); - - SmallVector Args; - SmallVector Types; - bool NeedToReplace = false; - Function *F = I.getCalledFunction(); - if (!F) - return; - - std::string Name = F->getName(); - for (unsigned i = 0, e = I.getNumArgOperands(); i != e; ++i) { - Value *Arg = I.getArgOperand(i); - if (Arg->getType() == v16i8) { - Args.push_back(Builder.CreateBitCast(Arg, v4i32)); - Types.push_back(v4i32); - NeedToReplace = true; - Name = Name + ".v4i32"; - } else if (Arg->getType()->isVectorTy() && - Arg->getType()->getVectorNumElements() == 1 && - Arg->getType()->getVectorElementType() == - Type::getInt32Ty(I.getContext())){ - Type *ElementTy = Arg->getType()->getVectorElementType(); - std::string TypeName = "i32"; - InsertElementInst *Def = cast(Arg); - Args.push_back(Def->getOperand(1)); - Types.push_back(ElementTy); - std::string VecTypeName = "v1" + TypeName; - Name = Name.replace(Name.find(VecTypeName), VecTypeName.length(), TypeName); - NeedToReplace = true; - } else { - Args.push_back(Arg); - Types.push_back(Arg->getType()); - } - } - - if (!NeedToReplace) { - return; - } - Function *NewF = Mod->getFunction(Name); - if (!NewF) { - NewF = Function::Create(FunctionType::get(F->getReturnType(), Types, false), GlobalValue::ExternalLinkage, Name, Mod); - NewF->setAttributes(F->getAttributes()); - } - I.replaceAllUsesWith(Builder.CreateCall(NewF, Args)); - I.eraseFromParent(); -} - -void SITypeRewriter::visitBitCast(BitCastInst &I) { - IRBuilder<> Builder(&I); - if (I.getDestTy() != v4i32) { - return; - } - - if (BitCastInst *Op = dyn_cast(I.getOperand(0))) { - if (Op->getSrcTy() == v4i32) { - I.replaceAllUsesWith(Op->getOperand(0)); - I.eraseFromParent(); - } - } -} - -FunctionPass *llvm::createSITypeRewriter() { - return new SITypeRewriter(); -} diff --git a/test/CodeGen/AMDGPU/bug-vopc-commute.ll b/test/CodeGen/AMDGPU/bug-vopc-commute.ll index 7c02d838546..e951b5e0892 100644 --- a/test/CodeGen/AMDGPU/bug-vopc-commute.ll +++ b/test/CodeGen/AMDGPU/bug-vopc-commute.ll @@ -8,8 +8,8 @@ ; of which were in SGPRs. define amdgpu_vs float @main(i32 %v) { main_body: - %d1 = call float @llvm.SI.load.const(<16 x i8> undef, i32 960) - %d2 = call float @llvm.SI.load.const(<16 x i8> undef, i32 976) + %d1 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 960) + %d2 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 976) br i1 undef, label %ENDIF56, label %IF57 IF57: ; preds = %ENDIF @@ -41,7 +41,7 @@ ENDIF62: ; preds = %ENDIF59 } ; Function Attrs: nounwind readnone -declare float @llvm.SI.load.const(<16 x i8>, i32) #0 +declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #0 attributes #0 = { nounwind readnone } attributes #1 = { readnone } diff --git a/test/CodeGen/AMDGPU/llvm.SI.load.dword.ll b/test/CodeGen/AMDGPU/llvm.SI.load.dword.ll index 51f564d9690..564d2b32964 100644 --- a/test/CodeGen/AMDGPU/llvm.SI.load.dword.ll +++ b/test/CodeGen/AMDGPU/llvm.SI.load.dword.ll @@ -14,24 +14,24 @@ ; CHECK: s_movk_i32 [[K:s[0-9]+]], 0x4d2 ; encoding ; CHECK: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, [[K]] idxen offen offset:65535 glc slc -define amdgpu_vs void @main([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <32 x i8>] addrspace(2)* byval %arg2, [2 x <16 x i8>] addrspace(2)* byval %arg3, [17 x <16 x i8>] addrspace(2)* inreg %arg4, [17 x <16 x i8>] addrspace(2)* inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9) { +define amdgpu_vs void @main([17 x <4 x i32>] addrspace(2)* byval %arg, [32 x <4 x i32>] addrspace(2)* byval %arg1, [16 x <32 x i8>] addrspace(2)* byval %arg2, [2 x <4 x i32>] addrspace(2)* byval %arg3, [17 x <4 x i32>] addrspace(2)* inreg %arg4, [17 x <4 x i32>] addrspace(2)* inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9) { main_body: - %tmp = getelementptr [2 x <16 x i8>], [2 x <16 x i8>] addrspace(2)* %arg3, i64 0, i32 1 - %tmp10 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 + %tmp = getelementptr [2 x <4 x i32>], [2 x <4 x i32>] addrspace(2)* %arg3, i64 0, i32 1 + %tmp10 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp, !tbaa !0 %tmp11 = shl i32 %arg6, 2 - %tmp12 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 0) + %tmp12 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32> %tmp10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 0) %tmp13 = bitcast i32 %tmp12 to float - %tmp14 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp10, i32 %tmp11, i32 0, i32 0, i32 1, i32 0, i32 1, i32 1, i32 0) + %tmp14 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32> %tmp10, i32 %tmp11, i32 0, i32 0, i32 1, i32 0, i32 1, i32 1, i32 0) %tmp15 = bitcast i32 %tmp14 to float - %tmp16 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp10, i32 %tmp11, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 0) + %tmp16 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32> %tmp10, i32 %tmp11, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 0) %tmp17 = bitcast i32 %tmp16 to float - %tmp18 = call i32 @llvm.SI.buffer.load.dword.i32.v2i32(<16 x i8> %tmp10, <2 x i32> zeroinitializer, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 0) + %tmp18 = call i32 @llvm.SI.buffer.load.dword.i32.v2i32(<4 x i32> %tmp10, <2 x i32> zeroinitializer, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 0) %tmp19 = bitcast i32 %tmp18 to float - %tmp20 = call i32 @llvm.SI.buffer.load.dword.i32.v2i32(<16 x i8> %tmp10, <2 x i32> zeroinitializer, i32 0, i32 123, i32 1, i32 1, i32 1, i32 1, i32 0) + %tmp20 = call i32 @llvm.SI.buffer.load.dword.i32.v2i32(<4 x i32> %tmp10, <2 x i32> zeroinitializer, i32 0, i32 123, i32 1, i32 1, i32 1, i32 1, i32 0) %tmp21 = bitcast i32 %tmp20 to float - %tmp22 = call i32 @llvm.SI.buffer.load.dword.i32.v2i32(<16 x i8> %tmp10, <2 x i32> zeroinitializer, i32 1234, i32 65535, i32 1, i32 1, i32 1, i32 1, i32 0) + %tmp22 = call i32 @llvm.SI.buffer.load.dword.i32.v2i32(<4 x i32> %tmp10, <2 x i32> zeroinitializer, i32 1234, i32 65535, i32 1, i32 1, i32 1, i32 1, i32 0) %tmp23 = bitcast i32 %tmp22 to float call void @llvm.amdgcn.exp.f32(i32 15, i32 12, float %tmp13, float %tmp15, float %tmp17, float %tmp19, i1 false, i1 false) @@ -40,10 +40,10 @@ main_body: } ; Function Attrs: nounwind readonly -declare i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 ; Function Attrs: nounwind readonly -declare i32 @llvm.SI.buffer.load.dword.i32.v2i32(<16 x i8>, <2 x i32>, i32, i32, i32, i32, i32, i32, i32) #0 +declare i32 @llvm.SI.buffer.load.dword.i32.v2i32(<4 x i32>, <2 x i32>, i32, i32, i32, i32, i32, i32, i32) #0 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1 diff --git a/test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll b/test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll index cd9c082ed94..01b76422c03 100644 --- a/test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll +++ b/test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll @@ -5,7 +5,7 @@ ;CHECK: tbuffer_store_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offen offset:32 glc slc define amdgpu_vs void @test1(i32 %a1, i32 %vaddr) { %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0 - call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata, + call void @llvm.SI.tbuffer.store.v4i32(<4 x i32> undef, <4 x i32> %vdata, i32 4, i32 %vaddr, i32 0, i32 32, i32 14, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) ret void @@ -15,7 +15,7 @@ define amdgpu_vs void @test1(i32 %a1, i32 %vaddr) { ;CHECK: tbuffer_store_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen offset:32 glc slc define amdgpu_vs void @test1_idx(i32 %a1, i32 %vaddr) { %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0 - call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata, + call void @llvm.SI.tbuffer.store.v4i32(<4 x i32> undef, <4 x i32> %vdata, i32 4, i32 %vaddr, i32 0, i32 32, i32 14, i32 4, i32 0, i32 1, i32 1, i32 1, i32 0) ret void @@ -25,7 +25,7 @@ define amdgpu_vs void @test1_idx(i32 %a1, i32 %vaddr) { ;CHECK: tbuffer_store_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, {{s[0-9]+}} idxen offset:32 glc slc define amdgpu_vs void @test1_scalar_offset(i32 %a1, i32 %vaddr, i32 inreg %soffset) { %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0 - call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata, + call void @llvm.SI.tbuffer.store.v4i32(<4 x i32> undef, <4 x i32> %vdata, i32 4, i32 %vaddr, i32 %soffset, i32 32, i32 14, i32 4, i32 0, i32 1, i32 1, i32 1, i32 0) ret void @@ -35,7 +35,7 @@ define amdgpu_vs void @test1_scalar_offset(i32 %a1, i32 %vaddr, i32 inreg %soffs ;CHECK: tbuffer_store_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offen offset:32 define amdgpu_vs void @test1_no_glc_slc(i32 %a1, i32 %vaddr) { %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0 - call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata, + call void @llvm.SI.tbuffer.store.v4i32(<4 x i32> undef, <4 x i32> %vdata, i32 4, i32 %vaddr, i32 0, i32 32, i32 14, i32 4, i32 1, i32 0, i32 0, i32 0, i32 0) ret void @@ -45,7 +45,7 @@ define amdgpu_vs void @test1_no_glc_slc(i32 %a1, i32 %vaddr) { ;CHECK: tbuffer_store_format_xyz {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, 0 offen offset:24 glc slc define amdgpu_vs void @test2(i32 %a1, i32 %vaddr) { %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0 - call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata, + call void @llvm.SI.tbuffer.store.v4i32(<4 x i32> undef, <4 x i32> %vdata, i32 3, i32 %vaddr, i32 0, i32 24, i32 13, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) ret void @@ -55,7 +55,7 @@ define amdgpu_vs void @test2(i32 %a1, i32 %vaddr) { ;CHECK: tbuffer_store_format_xy {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:11, nfmt:4, 0 offen offset:16 glc slc define amdgpu_vs void @test3(i32 %a1, i32 %vaddr) { %vdata = insertelement <2 x i32> undef, i32 %a1, i32 0 - call void @llvm.SI.tbuffer.store.v2i32(<16 x i8> undef, <2 x i32> %vdata, + call void @llvm.SI.tbuffer.store.v2i32(<4 x i32> undef, <2 x i32> %vdata, i32 2, i32 %vaddr, i32 0, i32 16, i32 11, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) ret void @@ -64,12 +64,12 @@ define amdgpu_vs void @test3(i32 %a1, i32 %vaddr) { ;CHECK-LABEL: {{^}}test4: ;CHECK: tbuffer_store_format_x {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:4, nfmt:4, 0 offen offset:8 glc slc define amdgpu_vs void @test4(i32 %vdata, i32 %vaddr) { - call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %vdata, + call void @llvm.SI.tbuffer.store.i32(<4 x i32> undef, i32 %vdata, i32 1, i32 %vaddr, i32 0, i32 8, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) ret void } -declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -declare void @llvm.SI.tbuffer.store.v2i32(<16 x i8>, <2 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -declare void @llvm.SI.tbuffer.store.v4i32(<16 x i8>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) +declare void @llvm.SI.tbuffer.store.i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) +declare void @llvm.SI.tbuffer.store.v2i32(<4 x i32>, <2 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) +declare void @llvm.SI.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) diff --git a/test/CodeGen/AMDGPU/mubuf.ll b/test/CodeGen/AMDGPU/mubuf.ll index d883b87ec40..b23b21118aa 100644 --- a/test/CodeGen/AMDGPU/mubuf.ll +++ b/test/CodeGen/AMDGPU/mubuf.ll @@ -55,14 +55,14 @@ entry: ; CHECK-LABEL: {{^}}soffset_max_imm: ; CHECK: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 64 offen glc -define amdgpu_gs void @soffset_max_imm([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [16 x <4 x i32>] addrspace(2)* byval, [32 x <8 x i32>] addrspace(2)* byval, i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32, i32, i32) { +define amdgpu_gs void @soffset_max_imm([6 x <4 x i32>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [16 x <4 x i32>] addrspace(2)* byval, [32 x <8 x i32>] addrspace(2)* byval, i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32, i32, i32) { main_body: - %tmp0 = getelementptr [6 x <16 x i8>], [6 x <16 x i8>] addrspace(2)* %0, i32 0, i32 0 - %tmp1 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp0 + %tmp0 = getelementptr [6 x <4 x i32>], [6 x <4 x i32>] addrspace(2)* %0, i32 0, i32 0 + %tmp1 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp0 %tmp2 = shl i32 %6, 2 - %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp1, i32 %tmp2, i32 64, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0) + %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32> %tmp1, i32 %tmp2, i32 64, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0) %tmp4 = add i32 %6, 16 - %tmp1.4xi32 = bitcast <16 x i8> %tmp1 to <4 x i32> + %tmp1.4xi32 = bitcast <4 x i32> %tmp1 to <4 x i32> call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp3, <4 x i32> %tmp1.4xi32, i32 0, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i1 1, i1 1) ret void } @@ -74,14 +74,14 @@ main_body: ; CHECK-LABEL: {{^}}soffset_no_fold: ; CHECK: s_movk_i32 [[SOFFSET:s[0-9]+]], 0x41 ; CHECK: buffer_load_dword v{{[0-9+]}}, v{{[0-9+]}}, s[{{[0-9]+}}:{{[0-9]+}}], [[SOFFSET]] offen glc -define amdgpu_gs void @soffset_no_fold([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [16 x <4 x i32>] addrspace(2)* byval, [32 x <8 x i32>] addrspace(2)* byval, i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32, i32, i32) { +define amdgpu_gs void @soffset_no_fold([6 x <4 x i32>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [16 x <4 x i32>] addrspace(2)* byval, [32 x <8 x i32>] addrspace(2)* byval, i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32, i32, i32) { main_body: - %tmp0 = getelementptr [6 x <16 x i8>], [6 x <16 x i8>] addrspace(2)* %0, i32 0, i32 0 - %tmp1 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp0 + %tmp0 = getelementptr [6 x <4 x i32>], [6 x <4 x i32>] addrspace(2)* %0, i32 0, i32 0 + %tmp1 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp0 %tmp2 = shl i32 %6, 2 - %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp1, i32 %tmp2, i32 65, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0) + %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32> %tmp1, i32 %tmp2, i32 65, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0) %tmp4 = add i32 %6, 16 - %tmp1.4xi32 = bitcast <16 x i8> %tmp1 to <4 x i32> + %tmp1.4xi32 = bitcast <4 x i32> %tmp1 to <4 x i32> call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp3, <4 x i32> %tmp1.4xi32, i32 0, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i1 1, i1 1) ret void } @@ -176,7 +176,7 @@ define amdgpu_kernel void @store_vgpr_ptr(i32 addrspace(1)* %out) #0 { ret void } -declare i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) attributes #0 = { nounwind readonly } diff --git a/test/CodeGen/AMDGPU/ret_jump.ll b/test/CodeGen/AMDGPU/ret_jump.ll index e7a05d94cdc..1acae60f305 100644 --- a/test/CodeGen/AMDGPU/ret_jump.ll +++ b/test/CodeGen/AMDGPU/ret_jump.ll @@ -23,7 +23,7 @@ ; GCN-NEXT: [[RET_BB]]: ; GCN-NEXT: ; return ; GCN-NEXT: .Lfunc_end0 -define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @uniform_br_trivial_ret_divergent_br_trivial_unreachable([9 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <8 x i32>] addrspace(2)* byval %arg2, i32 addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, i32 inreg %arg17, i32 %arg18, i32 %arg19, float %arg20, i32 %arg21) #0 { +define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @uniform_br_trivial_ret_divergent_br_trivial_unreachable([9 x <4 x i32>] addrspace(2)* byval %arg, [17 x <4 x i32>] addrspace(2)* byval %arg1, [17 x <8 x i32>] addrspace(2)* byval %arg2, i32 addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, i32 inreg %arg17, i32 %arg18, i32 %arg19, float %arg20, i32 %arg21) #0 { entry: %i.i = extractelement <2 x i32> %arg7, i32 0 %j.i = extractelement <2 x i32> %arg7, i32 1 @@ -75,7 +75,7 @@ ret.bb: ; preds = %else, %main_body ; GCN-NEXT: s_waitcnt ; GCN-NEXT: ; return ; GCN-NEXT: .Lfunc_end -define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @uniform_br_nontrivial_ret_divergent_br_nontrivial_unreachable([9 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <8 x i32>] addrspace(2)* byval %arg2, i32 addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, i32 inreg %arg18, i32 %arg19, float %arg20, i32 %arg21) #0 { +define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @uniform_br_nontrivial_ret_divergent_br_nontrivial_unreachable([9 x <4 x i32>] addrspace(2)* byval %arg, [17 x <4 x i32>] addrspace(2)* byval %arg1, [17 x <8 x i32>] addrspace(2)* byval %arg2, i32 addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, i32 inreg %arg18, i32 %arg19, float %arg20, i32 %arg21) #0 { main_body: %i.i = extractelement <2 x i32> %arg7, i32 0 %j.i = extractelement <2 x i32> %arg7, i32 1 @@ -118,9 +118,6 @@ declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #1 ; Function Attrs: nounwind readnone declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #1 -; Function Attrs: nounwind readnone -declare float @llvm.SI.load.const(<16 x i8>, i32) #1 - ; Function Attrs: nounwind readnone declare float @llvm.fabs.f32(float) #1 diff --git a/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll b/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll index 47e32724d9c..5edc2c5c9b7 100644 --- a/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll +++ b/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll @@ -15,16 +15,16 @@ target triple = "amdgcn--" define amdgpu_gs void @main(i32 inreg %arg) #0 { main_body: - %tmp = call float @llvm.SI.load.const(<16 x i8> undef, i32 20) - %tmp1 = call float @llvm.SI.load.const(<16 x i8> undef, i32 24) - %tmp2 = call float @llvm.SI.load.const(<16 x i8> undef, i32 48) + %tmp = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 20) + %tmp1 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 24) + %tmp2 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 48) %array_vector3 = insertelement <4 x float> zeroinitializer, float %tmp2, i32 3 %array_vector5 = insertelement <4 x float> , float %tmp, i32 1 %array_vector6 = insertelement <4 x float> %array_vector5, float undef, i32 2 %array_vector9 = insertelement <4 x float> , float %tmp1, i32 1 %array_vector10 = insertelement <4 x float> %array_vector9, float 0.000000e+00, i32 2 %array_vector11 = insertelement <4 x float> %array_vector10, float undef, i32 3 - %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> undef, i32 undef, i32 4864, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0) + %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32> undef, i32 undef, i32 4864, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0) call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp3, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 36, i32 4, i32 4, i1 1, i1 1) %bc = bitcast <4 x float> %array_vector3 to <4 x i32> %tmp4 = extractelement <4 x i32> %bc, i32 undef @@ -45,8 +45,8 @@ main_body: ret void } -declare float @llvm.SI.load.const(<16 x i8>, i32) #1 -declare i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #2 +declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1 +declare i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #2 declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #3 attributes #0 = { nounwind "target-cpu"="tonga" } diff --git a/test/CodeGen/AMDGPU/sgpr-copy.ll b/test/CodeGen/AMDGPU/sgpr-copy.ll index 5c20e9a8d58..931051102cd 100644 --- a/test/CodeGen/AMDGPU/sgpr-copy.ll +++ b/test/CodeGen/AMDGPU/sgpr-copy.ll @@ -4,13 +4,13 @@ ; CHECK-LABEL: {{^}}phi1: ; CHECK: s_buffer_load_dword [[DST:s[0-9]]], {{s\[[0-9]+:[0-9]+\]}}, 0x0 ; CHECK: v_mov_b32_e32 v{{[0-9]}}, [[DST]] -define amdgpu_ps void @phi1(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { +define amdgpu_ps void @phi1(<4 x i32> addrspace(2)* inreg %arg, <4 x i32> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { main_body: - %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0 - %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 - %tmp21 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 0) - %tmp22 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 16) - %tmp23 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 32) + %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %arg, i32 0 + %tmp20 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp, !tbaa !0 + %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 0) + %tmp22 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 16) + %tmp23 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 32) %tmp24 = fptosi float %tmp22 to i32 %tmp25 = icmp ne i32 %tmp24, 0 br i1 %tmp25, label %ENDIF, label %ELSE @@ -28,29 +28,29 @@ ENDIF: ; preds = %ELSE, %main_body ; Make sure this program doesn't crash ; CHECK-LABEL: {{^}}phi2: -define amdgpu_ps void @phi2(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #1 { +define amdgpu_ps void @phi2(<4 x i32> addrspace(2)* inreg %arg, <4 x i32> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #1 { main_body: - %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0 - %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 - %tmp21 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 16) - %tmp22 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 32) - %tmp23 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 36) - %tmp24 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 40) - %tmp25 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 48) - %tmp26 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 52) - %tmp27 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 56) - %tmp28 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 64) - %tmp29 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 68) - %tmp30 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 72) - %tmp31 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 76) - %tmp32 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 80) - %tmp33 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 84) - %tmp34 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 88) - %tmp35 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 92) + %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %arg, i32 0 + %tmp20 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp, !tbaa !0 + %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 16) + %tmp22 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 32) + %tmp23 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 36) + %tmp24 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 40) + %tmp25 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 48) + %tmp26 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 52) + %tmp27 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 56) + %tmp28 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 64) + %tmp29 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 68) + %tmp30 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 72) + %tmp31 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 76) + %tmp32 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 80) + %tmp33 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 84) + %tmp34 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 88) + %tmp35 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 92) %tmp36 = getelementptr <8 x i32>, <8 x i32> addrspace(2)* %arg2, i32 0 %tmp37 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp36, !tbaa !0 - %tmp38 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg1, i32 0 - %tmp39 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp38, !tbaa !0 + %tmp38 = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %arg1, i32 0 + %tmp39 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp38, !tbaa !0 %i.i = extractelement <2 x i32> %arg5, i32 0 %j.i = extractelement <2 x i32> %arg5, i32 1 %i.f.i = bitcast i32 %i.i to float @@ -85,7 +85,7 @@ main_body: %tmp46 = bitcast float %p2.i24 to i32 %tmp47 = insertelement <2 x i32> undef, i32 %tmp45, i32 0 %tmp48 = insertelement <2 x i32> %tmp47, i32 %tmp46, i32 1 - %tmp39.bc = bitcast <16 x i8> %tmp39 to <4 x i32> + %tmp39.bc = bitcast <4 x i32> %tmp39 to <4 x i32> %a.bc.i = bitcast <2 x i32> %tmp48 to <2 x float> %tmp1 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %a.bc.i, <8 x i32> %tmp37, <4 x i32> %tmp39.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp50 = extractelement <4 x float> %tmp1, i32 2 @@ -173,14 +173,14 @@ ENDIF24: ; preds = %IF25, %ENDIF ; We just want ot make sure the program doesn't crash ; CHECK-LABEL: {{^}}loop: -define amdgpu_ps void @loop(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { +define amdgpu_ps void @loop(<4 x i32> addrspace(2)* inreg %arg, <4 x i32> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { main_body: - %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0 - %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 - %tmp21 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 0) - %tmp22 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 4) - %tmp23 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 8) - %tmp24 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 12) + %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %arg, i32 0 + %tmp20 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp, !tbaa !0 + %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 0) + %tmp22 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 4) + %tmp23 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 8) + %tmp24 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 12) %tmp25 = fptosi float %tmp24 to i32 %tmp26 = bitcast i32 %tmp25 to float %tmp27 = bitcast float %tmp26 to i32 @@ -226,17 +226,17 @@ ENDIF: ; preds = %LOOP ; CHECK: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[SAMPLE_LO]]:[[SAMPLE_HI]]{{\]}} ; CHECK: exp ; CHECK: s_endpgm -define amdgpu_ps void @sample_v3([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 { +define amdgpu_ps void @sample_v3([17 x <4 x i32>] addrspace(2)* byval %arg, [32 x <4 x i32>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 { entry: - %tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg, i64 0, i32 0 - %tmp21 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 - %tmp22 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 16) + %tmp = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(2)* %arg, i64 0, i32 0 + %tmp21 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp, !tbaa !0 + %tmp22 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 16) %tmp23 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 0 %tmp24 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp23, !tbaa !0 - %tmp25 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 0 - %tmp26 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp25, !tbaa !0 + %tmp25 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 0 + %tmp26 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp25, !tbaa !0 %tmp27 = fcmp oeq float %tmp22, 0.000000e+00 - %tmp26.bc = bitcast <16 x i8> %tmp26 to <4 x i32> + %tmp26.bc = bitcast <4 x i32> %tmp26 to <4 x i32> br i1 %tmp27, label %if, label %else if: ; preds = %entry @@ -290,7 +290,7 @@ endif: ; preds = %if1, %if0, %entry ; This test is just checking that we don't crash / assertion fail. ; CHECK-LABEL: {{^}}copy2: ; CHECK: s_endpgm -define amdgpu_ps void @copy2([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 { +define amdgpu_ps void @copy2([17 x <4 x i32>] addrspace(2)* byval %arg, [32 x <4 x i32>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 { entry: br label %LOOP68 @@ -326,11 +326,11 @@ ENDIF69: ; preds = %LOOP68 ; [[END]]: ; CHECK: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+}}:[[ADD]]{{\]}} ; CHECK: s_endpgm -define amdgpu_ps void @sample_rsrc([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <4 x i32>] addrspace(2)* byval %arg2, [32 x <8 x i32>] addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) #0 { +define amdgpu_ps void @sample_rsrc([6 x <4 x i32>] addrspace(2)* byval %arg, [17 x <4 x i32>] addrspace(2)* byval %arg1, [16 x <4 x i32>] addrspace(2)* byval %arg2, [32 x <8 x i32>] addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) #0 { bb: - %tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg1, i32 0, i32 0 - %tmp22 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !3 - %tmp23 = call float @llvm.SI.load.const(<16 x i8> %tmp22, i32 16) + %tmp = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(2)* %arg1, i32 0, i32 0 + %tmp22 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp, !tbaa !3 + %tmp23 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp22, i32 16) %tmp25 = getelementptr [32 x <8 x i32>], [32 x <8 x i32>] addrspace(2)* %arg3, i32 0, i32 0 %tmp26 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp25, !tbaa !3 %tmp27 = getelementptr [16 x <4 x i32>], [16 x <4 x i32>] addrspace(2)* %arg2, i32 0, i32 0 @@ -420,7 +420,7 @@ declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0 declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1 declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 -declare float @llvm.SI.load.const(<16 x i8>, i32) #1 +declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/si-lod-bias.ll b/test/CodeGen/AMDGPU/si-lod-bias.ll index 3a7359ea4ff..42249806650 100644 --- a/test/CodeGen/AMDGPU/si-lod-bias.ll +++ b/test/CodeGen/AMDGPU/si-lod-bias.ll @@ -6,15 +6,15 @@ ; GCN-LABEL: {{^}}main: ; GCN: image_sample_b v{{\[[0-9]:[0-9]\]}}, v{{\[[0-9]:[0-9]\]}}, s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf -define amdgpu_ps void @main(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { +define amdgpu_ps void @main(<4 x i32> addrspace(2)* inreg %arg, <4 x i32> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { main_body: - %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0 - %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 - %tmp21 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 16) + %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %arg, i32 0 + %tmp20 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp, !tbaa !0 + %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 16) %tmp22 = getelementptr <8 x i32>, <8 x i32> addrspace(2)* %arg2, i32 0 %tmp23 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp22, !tbaa !0 - %tmp24 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg1, i32 0 - %tmp25 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp24, !tbaa !0 + %tmp24 = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %arg1, i32 0 + %tmp25 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp24, !tbaa !0 %i.i = extractelement <2 x i32> %arg5, i32 0 %j.i = extractelement <2 x i32> %arg5, i32 1 %i.f.i = bitcast i32 %i.i to float @@ -34,9 +34,8 @@ main_body: %tmp32 = insertelement <4 x i32> %tmp31, i32 %tmp29, i32 1 %tmp33 = insertelement <4 x i32> %tmp32, i32 %tmp30, i32 2 %tmp34 = insertelement <4 x i32> %tmp33, i32 undef, i32 3 - %tmp25.bc = bitcast <16 x i8> %tmp25 to <4 x i32> %tmp34.bc = bitcast <4 x i32> %tmp34 to <4 x float> - %tmp35 = call <4 x float> @llvm.amdgcn.image.sample.b.v4f32.v4f32.v8i32(<4 x float> %tmp34.bc, <8 x i32> %tmp23, <4 x i32> %tmp25.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) + %tmp35 = call <4 x float> @llvm.amdgcn.image.sample.b.v4f32.v4f32.v8i32(<4 x float> %tmp34.bc, <8 x i32> %tmp23, <4 x i32> %tmp25, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp36 = extractelement <4 x float> %tmp35, i32 0 %tmp37 = extractelement <4 x float> %tmp35, i32 1 %tmp38 = extractelement <4 x float> %tmp35, i32 2 @@ -49,7 +48,7 @@ declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #1 declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #1 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 declare <4 x float> @llvm.amdgcn.image.sample.b.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 -declare float @llvm.SI.load.const(<16 x i8>, i32) #1 +declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/si-sgpr-spill.ll b/test/CodeGen/AMDGPU/si-sgpr-spill.ll index 8731e74d63a..3e70f2c7782 100644 --- a/test/CodeGen/AMDGPU/si-sgpr-spill.ll +++ b/test/CodeGen/AMDGPU/si-sgpr-spill.ll @@ -24,81 +24,81 @@ ; GCN: s_endpgm ; TOVGPR: ScratchSize: 0{{$}} -define amdgpu_ps void @main([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) { +define amdgpu_ps void @main([17 x <4 x i32>] addrspace(2)* byval %arg, [32 x <4 x i32>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) { main_body: - %tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg, i64 0, i32 0 - %tmp21 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 - %tmp22 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 96) - %tmp23 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 100) - %tmp24 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 104) - %tmp25 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 112) - %tmp26 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 116) - %tmp27 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 120) - %tmp28 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 128) - %tmp29 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 132) - %tmp30 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 140) - %tmp31 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 144) - %tmp32 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 160) - %tmp33 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 176) - %tmp34 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 180) - %tmp35 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 184) - %tmp36 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 192) - %tmp37 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 196) - %tmp38 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 200) - %tmp39 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 208) - %tmp40 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 212) - %tmp41 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 216) - %tmp42 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 224) - %tmp43 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 240) - %tmp44 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 244) - %tmp45 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 248) - %tmp46 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 256) - %tmp47 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 272) - %tmp48 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 276) - %tmp49 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 280) - %tmp50 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 288) - %tmp51 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 292) - %tmp52 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 296) - %tmp53 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 304) - %tmp54 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 308) - %tmp55 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 312) - %tmp56 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 368) - %tmp57 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 372) - %tmp58 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 376) - %tmp59 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 384) + %tmp = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(2)* %arg, i64 0, i32 0 + %tmp21 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp, !tbaa !0 + %tmp22 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 96) + %tmp23 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 100) + %tmp24 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 104) + %tmp25 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 112) + %tmp26 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 116) + %tmp27 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 120) + %tmp28 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 128) + %tmp29 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 132) + %tmp30 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 140) + %tmp31 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 144) + %tmp32 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 160) + %tmp33 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 176) + %tmp34 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 180) + %tmp35 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 184) + %tmp36 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 192) + %tmp37 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 196) + %tmp38 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 200) + %tmp39 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 208) + %tmp40 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 212) + %tmp41 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 216) + %tmp42 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 224) + %tmp43 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 240) + %tmp44 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 244) + %tmp45 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 248) + %tmp46 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 256) + %tmp47 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 272) + %tmp48 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 276) + %tmp49 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 280) + %tmp50 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 288) + %tmp51 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 292) + %tmp52 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 296) + %tmp53 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 304) + %tmp54 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 308) + %tmp55 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 312) + %tmp56 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 368) + %tmp57 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 372) + %tmp58 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 376) + %tmp59 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 384) %tmp60 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 0 %tmp61 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp60, !tbaa !0 - %tmp62 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 0 - %tmp63 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp62, !tbaa !0 - %tmp63.bc = bitcast <16 x i8> %tmp63 to <4 x i32> + %tmp62 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 0 + %tmp63 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp62, !tbaa !0 + %tmp63.bc = bitcast <4 x i32> %tmp63 to <4 x i32> %tmp64 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 1 %tmp65 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp64, !tbaa !0 - %tmp66 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 1 - %tmp67 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp66, !tbaa !0 + %tmp66 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 1 + %tmp67 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp66, !tbaa !0 %tmp68 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 2 %tmp69 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp68, !tbaa !0 - %tmp70 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 2 - %tmp71 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp70, !tbaa !0 + %tmp70 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 2 + %tmp71 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp70, !tbaa !0 %tmp72 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 3 %tmp73 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp72, !tbaa !0 - %tmp74 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 3 - %tmp75 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp74, !tbaa !0 + %tmp74 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 3 + %tmp75 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp74, !tbaa !0 %tmp76 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 4 %tmp77 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp76, !tbaa !0 - %tmp78 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 4 - %tmp79 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp78, !tbaa !0 + %tmp78 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 4 + %tmp79 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp78, !tbaa !0 %tmp80 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 5 %tmp81 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp80, !tbaa !0 - %tmp82 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 5 - %tmp83 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp82, !tbaa !0 + %tmp82 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 5 + %tmp83 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp82, !tbaa !0 %tmp84 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 6 %tmp85 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp84, !tbaa !0 - %tmp86 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 6 - %tmp87 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp86, !tbaa !0 + %tmp86 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 6 + %tmp87 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp86, !tbaa !0 %tmp88 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 7 %tmp89 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp88, !tbaa !0 - %tmp90 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 7 - %tmp91 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp90, !tbaa !0 + %tmp90 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 7 + %tmp91 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp90, !tbaa !0 %i.i = extractelement <2 x i32> %arg6, i32 0 %j.i = extractelement <2 x i32> %arg6, i32 1 %i.f.i = bitcast i32 %i.i to float @@ -410,7 +410,7 @@ IF67: ; preds = %LOOP65 %tmp274 = insertelement <8 x i32> %tmp273, i32 %tmp268, i32 5 %tmp275 = insertelement <8 x i32> %tmp274, i32 undef, i32 6 %tmp276 = insertelement <8 x i32> %tmp275, i32 undef, i32 7 - %tmp67.bc = bitcast <16 x i8> %tmp67 to <4 x i32> + %tmp67.bc = bitcast <4 x i32> %tmp67 to <4 x i32> %tmp276.bc = bitcast <8 x i32> %tmp276 to <8 x float> %tmp277 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp276.bc, <8 x i32> %tmp65, <4 x i32> %tmp67.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp278 = extractelement <4 x float> %tmp277, i32 0 @@ -432,7 +432,7 @@ IF67: ; preds = %LOOP65 %tmp294 = insertelement <8 x i32> %tmp293, i32 %tmp288, i32 5 %tmp295 = insertelement <8 x i32> %tmp294, i32 undef, i32 6 %tmp296 = insertelement <8 x i32> %tmp295, i32 undef, i32 7 - %tmp83.bc = bitcast <16 x i8> %tmp83 to <4 x i32> + %tmp83.bc = bitcast <4 x i32> %tmp83 to <4 x i32> %tmp296.bc = bitcast <8 x i32> %tmp296 to <8 x float> %tmp297 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp296.bc, <8 x i32> %tmp81, <4 x i32> %tmp83.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp298 = extractelement <4 x float> %tmp297, i32 0 @@ -452,7 +452,7 @@ IF67: ; preds = %LOOP65 %tmp312 = insertelement <8 x i32> %tmp311, i32 %tmp306, i32 5 %tmp313 = insertelement <8 x i32> %tmp312, i32 undef, i32 6 %tmp314 = insertelement <8 x i32> %tmp313, i32 undef, i32 7 - %tmp79.bc = bitcast <16 x i8> %tmp79 to <4 x i32> + %tmp79.bc = bitcast <4 x i32> %tmp79 to <4 x i32> %tmp314.bc = bitcast <8 x i32> %tmp314 to <8 x float> %tmp315 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp314.bc, <8 x i32> %tmp77, <4 x i32> %tmp79.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp316 = extractelement <4 x float> %tmp315, i32 0 @@ -515,7 +515,7 @@ IF67: ; preds = %LOOP65 %tmp372 = insertelement <8 x i32> %tmp371, i32 %tmp366, i32 5 %tmp373 = insertelement <8 x i32> %tmp372, i32 undef, i32 6 %tmp374 = insertelement <8 x i32> %tmp373, i32 undef, i32 7 - %tmp71.bc = bitcast <16 x i8> %tmp71 to <4 x i32> + %tmp71.bc = bitcast <4 x i32> %tmp71 to <4 x i32> %tmp374.bc = bitcast <8 x i32> %tmp374 to <8 x float> %tmp375 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp374.bc, <8 x i32> %tmp69, <4 x i32> %tmp71.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp376 = extractelement <4 x float> %tmp375, i32 0 @@ -571,7 +571,7 @@ IF67: ; preds = %LOOP65 %tmp426 = insertelement <8 x i32> %tmp425, i32 %tmp420, i32 5 %tmp427 = insertelement <8 x i32> %tmp426, i32 undef, i32 6 %tmp428 = insertelement <8 x i32> %tmp427, i32 undef, i32 7 - %tmp87.bc = bitcast <16 x i8> %tmp87 to <4 x i32> + %tmp87.bc = bitcast <4 x i32> %tmp87 to <4 x i32> %tmp428.bc = bitcast <8 x i32> %tmp428 to <8 x float> %tmp429 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp428.bc, <8 x i32> %tmp85, <4 x i32> %tmp87.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp430 = extractelement <4 x float> %tmp429, i32 0 @@ -624,7 +624,7 @@ IF67: ; preds = %LOOP65 %tmp467 = insertelement <4 x i32> %tmp466, i32 %tmp464, i32 1 %tmp468 = insertelement <4 x i32> %tmp467, i32 %tmp465, i32 2 %tmp469 = insertelement <4 x i32> %tmp468, i32 undef, i32 3 - %tmp91.bc = bitcast <16 x i8> %tmp91 to <4 x i32> + %tmp91.bc = bitcast <4 x i32> %tmp91 to <4 x i32> %tmp469.bc = bitcast <4 x i32> %tmp469 to <4 x float> %tmp470 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tmp469.bc, <8 x i32> %tmp89, <4 x i32> %tmp91.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 %tmp471 = extractelement <4 x float> %tmp470, i32 0 @@ -727,7 +727,7 @@ IF67: ; preds = %LOOP65 %tmp568 = insertelement <8 x i32> %tmp567, i32 %tmp562, i32 5 %tmp569 = insertelement <8 x i32> %tmp568, i32 undef, i32 6 %tmp570 = insertelement <8 x i32> %tmp569, i32 undef, i32 7 - %tmp75.bc = bitcast <16 x i8> %tmp75 to <4 x i32> + %tmp75.bc = bitcast <4 x i32> %tmp75 to <4 x i32> %tmp570.bc = bitcast <8 x i32> %tmp570 to <8 x float> %tmp571 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp570.bc, <8 x i32> %tmp73, <4 x i32> %tmp75.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp572 = extractelement <4 x float> %tmp571, i32 0 @@ -778,149 +778,149 @@ ENDIF66: ; preds = %LOOP65 ; GCN-LABEL: {{^}}main1: ; GCN: s_endpgm ; TOVGPR: ScratchSize: 0{{$}} -define amdgpu_ps void @main1([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 { +define amdgpu_ps void @main1([17 x <4 x i32>] addrspace(2)* byval %arg, [32 x <4 x i32>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 { main_body: - %tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg, i64 0, i32 0 - %tmp21 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 - %tmp22 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 0) - %tmp23 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 4) - %tmp24 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 8) - %tmp25 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 12) - %tmp26 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 28) - %tmp27 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 48) - %tmp28 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 52) - %tmp29 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 56) - %tmp30 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 64) - %tmp31 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 68) - %tmp32 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 72) - %tmp33 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 76) - %tmp34 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 128) - %tmp35 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 132) - %tmp36 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 144) - %tmp37 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 148) - %tmp38 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 152) - %tmp39 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 160) - %tmp40 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 164) - %tmp41 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 168) - %tmp42 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 172) - %tmp43 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 176) - %tmp44 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 180) - %tmp45 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 184) - %tmp46 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 192) - %tmp47 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 196) - %tmp48 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 200) - %tmp49 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 208) - %tmp50 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 212) - %tmp51 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 216) - %tmp52 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 220) - %tmp53 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 236) - %tmp54 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 240) - %tmp55 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 244) - %tmp56 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 248) - %tmp57 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 252) - %tmp58 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 256) - %tmp59 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 260) - %tmp60 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 264) - %tmp61 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 268) - %tmp62 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 272) - %tmp63 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 276) - %tmp64 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 280) - %tmp65 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 284) - %tmp66 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 288) - %tmp67 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 292) - %tmp68 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 464) - %tmp69 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 468) - %tmp70 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 472) - %tmp71 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 496) - %tmp72 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 500) - %tmp73 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 504) - %tmp74 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 512) - %tmp75 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 516) - %tmp76 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 524) - %tmp77 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 532) - %tmp78 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 536) - %tmp79 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 540) - %tmp80 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 544) - %tmp81 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 548) - %tmp82 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 552) - %tmp83 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 556) - %tmp84 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 560) - %tmp85 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 564) - %tmp86 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 568) - %tmp87 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 572) - %tmp88 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 576) - %tmp89 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 580) - %tmp90 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 584) - %tmp91 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 588) - %tmp92 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 592) - %tmp93 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 596) - %tmp94 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 600) - %tmp95 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 604) - %tmp96 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 608) - %tmp97 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 612) - %tmp98 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 616) - %tmp99 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 624) - %tmp100 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 628) - %tmp101 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 632) - %tmp102 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 636) - %tmp103 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 640) - %tmp104 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 644) - %tmp105 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 648) - %tmp106 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 652) - %tmp107 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 656) - %tmp108 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 660) - %tmp109 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 664) - %tmp110 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 668) - %tmp111 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 672) - %tmp112 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 676) - %tmp113 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 680) - %tmp114 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 684) - %tmp115 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 688) - %tmp116 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 692) - %tmp117 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 696) - %tmp118 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 700) - %tmp119 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 704) - %tmp120 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 708) - %tmp121 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 712) - %tmp122 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 716) - %tmp123 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 864) - %tmp124 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 868) + %tmp = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(2)* %arg, i64 0, i32 0 + %tmp21 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp, !tbaa !0 + %tmp22 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 0) + %tmp23 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 4) + %tmp24 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 8) + %tmp25 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 12) + %tmp26 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 28) + %tmp27 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 48) + %tmp28 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 52) + %tmp29 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 56) + %tmp30 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 64) + %tmp31 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 68) + %tmp32 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 72) + %tmp33 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 76) + %tmp34 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 128) + %tmp35 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 132) + %tmp36 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 144) + %tmp37 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 148) + %tmp38 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 152) + %tmp39 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 160) + %tmp40 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 164) + %tmp41 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 168) + %tmp42 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 172) + %tmp43 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 176) + %tmp44 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 180) + %tmp45 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 184) + %tmp46 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 192) + %tmp47 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 196) + %tmp48 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 200) + %tmp49 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 208) + %tmp50 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 212) + %tmp51 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 216) + %tmp52 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 220) + %tmp53 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 236) + %tmp54 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 240) + %tmp55 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 244) + %tmp56 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 248) + %tmp57 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 252) + %tmp58 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 256) + %tmp59 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 260) + %tmp60 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 264) + %tmp61 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 268) + %tmp62 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 272) + %tmp63 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 276) + %tmp64 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 280) + %tmp65 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 284) + %tmp66 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 288) + %tmp67 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 292) + %tmp68 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 464) + %tmp69 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 468) + %tmp70 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 472) + %tmp71 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 496) + %tmp72 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 500) + %tmp73 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 504) + %tmp74 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 512) + %tmp75 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 516) + %tmp76 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 524) + %tmp77 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 532) + %tmp78 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 536) + %tmp79 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 540) + %tmp80 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 544) + %tmp81 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 548) + %tmp82 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 552) + %tmp83 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 556) + %tmp84 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 560) + %tmp85 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 564) + %tmp86 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 568) + %tmp87 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 572) + %tmp88 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 576) + %tmp89 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 580) + %tmp90 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 584) + %tmp91 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 588) + %tmp92 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 592) + %tmp93 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 596) + %tmp94 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 600) + %tmp95 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 604) + %tmp96 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 608) + %tmp97 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 612) + %tmp98 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 616) + %tmp99 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 624) + %tmp100 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 628) + %tmp101 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 632) + %tmp102 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 636) + %tmp103 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 640) + %tmp104 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 644) + %tmp105 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 648) + %tmp106 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 652) + %tmp107 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 656) + %tmp108 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 660) + %tmp109 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 664) + %tmp110 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 668) + %tmp111 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 672) + %tmp112 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 676) + %tmp113 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 680) + %tmp114 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 684) + %tmp115 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 688) + %tmp116 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 692) + %tmp117 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 696) + %tmp118 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 700) + %tmp119 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 704) + %tmp120 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 708) + %tmp121 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 712) + %tmp122 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 716) + %tmp123 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 864) + %tmp124 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 868) %tmp125 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 0 %tmp126 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp125, !tbaa !0 - %tmp127 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 0 - %tmp128 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp127, !tbaa !0 + %tmp127 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 0 + %tmp128 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp127, !tbaa !0 %tmp129 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 1 %tmp130 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp129, !tbaa !0 - %tmp131 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 1 - %tmp132 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp131, !tbaa !0 + %tmp131 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 1 + %tmp132 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp131, !tbaa !0 %tmp133 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 2 %tmp134 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp133, !tbaa !0 - %tmp135 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 2 - %tmp136 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp135, !tbaa !0 + %tmp135 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 2 + %tmp136 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp135, !tbaa !0 %tmp137 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 3 %tmp138 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp137, !tbaa !0 - %tmp139 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 3 - %tmp140 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp139, !tbaa !0 + %tmp139 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 3 + %tmp140 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp139, !tbaa !0 %tmp141 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 4 %tmp142 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp141, !tbaa !0 - %tmp143 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 4 - %tmp144 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp143, !tbaa !0 + %tmp143 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 4 + %tmp144 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp143, !tbaa !0 %tmp145 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 5 %tmp146 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp145, !tbaa !0 - %tmp147 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 5 - %tmp148 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp147, !tbaa !0 + %tmp147 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 5 + %tmp148 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp147, !tbaa !0 %tmp149 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 6 %tmp150 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp149, !tbaa !0 - %tmp151 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 6 - %tmp152 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp151, !tbaa !0 + %tmp151 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 6 + %tmp152 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp151, !tbaa !0 %tmp153 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 7 %tmp154 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp153, !tbaa !0 - %tmp155 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 7 - %tmp156 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp155, !tbaa !0 + %tmp155 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 7 + %tmp156 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp155, !tbaa !0 %tmp157 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 8 %tmp158 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp157, !tbaa !0 - %tmp159 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 8 - %tmp160 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp159, !tbaa !0 + %tmp159 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 8 + %tmp160 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp159, !tbaa !0 %tmp161 = fcmp ugt float %arg17, 0.000000e+00 %tmp162 = select i1 %tmp161, float 1.000000e+00, float 0.000000e+00 %i.i = extractelement <2 x i32> %arg6, i32 0 @@ -1144,7 +1144,7 @@ main_body: %tmp222 = bitcast float %p2.i126 to i32 %tmp223 = insertelement <2 x i32> undef, i32 %tmp221, i32 0 %tmp224 = insertelement <2 x i32> %tmp223, i32 %tmp222, i32 1 - %tmp132.bc = bitcast <16 x i8> %tmp132 to <4 x i32> + %tmp132.bc = bitcast <4 x i32> %tmp132 to <4 x i32> %tmp224.bc = bitcast <2 x i32> %tmp224 to <2 x float> %tmp225 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %tmp224.bc, <8 x i32> %tmp130, <4 x i32> %tmp132.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp226 = extractelement <4 x float> %tmp225, i32 0 @@ -1218,7 +1218,7 @@ LOOP: ; preds = %LOOP, %main_body %tmp279 = insertelement <4 x i32> %tmp278, i32 %tmp277, i32 1 %tmp280 = insertelement <4 x i32> %tmp279, i32 0, i32 2 %tmp281 = insertelement <4 x i32> %tmp280, i32 undef, i32 3 - %tmp148.bc = bitcast <16 x i8> %tmp148 to <4 x i32> + %tmp148.bc = bitcast <4 x i32> %tmp148 to <4 x i32> %tmp281.bc = bitcast <4 x i32> %tmp281 to <4 x float> %tmp282 = call <4 x float> @llvm.amdgcn.image.sample.l.v4f32.v4f32.v8i32(<4 x float> %tmp281.bc, <8 x i32> %tmp146, <4 x i32> %tmp148.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp283 = extractelement <4 x float> %tmp282, i32 3 @@ -1283,7 +1283,7 @@ IF189: ; preds = %LOOP %tmp339 = bitcast float %tmp335 to i32 %tmp340 = insertelement <2 x i32> undef, i32 %tmp338, i32 0 %tmp341 = insertelement <2 x i32> %tmp340, i32 %tmp339, i32 1 - %tmp136.bc = bitcast <16 x i8> %tmp136 to <4 x i32> + %tmp136.bc = bitcast <4 x i32> %tmp136 to <4 x i32> %a.bc.i = bitcast <2 x i32> %tmp341 to <2 x float> %tmp0 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %a.bc.i, <8 x i32> %tmp134, <4 x i32> %tmp136.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp343 = extractelement <4 x float> %tmp0, i32 0 @@ -1317,7 +1317,7 @@ IF189: ; preds = %LOOP %tmp359 = bitcast float %tmp337 to i32 %tmp360 = insertelement <2 x i32> undef, i32 %tmp358, i32 0 %tmp361 = insertelement <2 x i32> %tmp360, i32 %tmp359, i32 1 - %tmp152.bc = bitcast <16 x i8> %tmp152 to <4 x i32> + %tmp152.bc = bitcast <4 x i32> %tmp152 to <4 x i32> %a.bc.i3 = bitcast <2 x i32> %tmp361 to <2 x float> %tmp1 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %a.bc.i3, <8 x i32> %tmp150, <4 x i32> %tmp152.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp363 = extractelement <4 x float> %tmp1, i32 2 @@ -1329,7 +1329,7 @@ IF189: ; preds = %LOOP %tmp369 = bitcast float %tmp311 to i32 %tmp370 = insertelement <2 x i32> undef, i32 %tmp368, i32 0 %tmp371 = insertelement <2 x i32> %tmp370, i32 %tmp369, i32 1 - %tmp140.bc = bitcast <16 x i8> %tmp140 to <4 x i32> + %tmp140.bc = bitcast <4 x i32> %tmp140 to <4 x i32> %a.bc.i2 = bitcast <2 x i32> %tmp371 to <2 x float> %tmp2 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %a.bc.i2, <8 x i32> %tmp138, <4 x i32> %tmp140.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp373 = extractelement <4 x float> %tmp2, i32 0 @@ -1347,7 +1347,7 @@ IF189: ; preds = %LOOP %tmp383 = bitcast float %tmp321 to i32 %tmp384 = insertelement <2 x i32> undef, i32 %tmp382, i32 0 %tmp385 = insertelement <2 x i32> %tmp384, i32 %tmp383, i32 1 - %tmp144.bc = bitcast <16 x i8> %tmp144 to <4 x i32> + %tmp144.bc = bitcast <4 x i32> %tmp144 to <4 x i32> %a.bc.i1 = bitcast <2 x i32> %tmp385 to <2 x float> %tmp3 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %a.bc.i1, <8 x i32> %tmp142, <4 x i32> %tmp144.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp387 = extractelement <4 x float> %tmp3, i32 0 @@ -1446,7 +1446,7 @@ ENDIF197: ; preds = %IF198, %IF189 %tmp467 = bitcast float %tmp220 to i32 %tmp468 = insertelement <2 x i32> undef, i32 %tmp466, i32 0 %tmp469 = insertelement <2 x i32> %tmp468, i32 %tmp467, i32 1 - %tmp160.bc = bitcast <16 x i8> %tmp160 to <4 x i32> + %tmp160.bc = bitcast <4 x i32> %tmp160 to <4 x i32> %tmp469.bc = bitcast <2 x i32> %tmp469 to <2 x float> %tmp470 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %tmp469.bc, <8 x i32> %tmp158, <4 x i32> %tmp160.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp471 = extractelement <4 x float> %tmp470, i32 0 @@ -1465,7 +1465,7 @@ ENDIF197: ; preds = %IF198, %IF189 %tmp484 = bitcast float %p2.i138 to i32 %tmp485 = insertelement <2 x i32> undef, i32 %tmp483, i32 0 %tmp486 = insertelement <2 x i32> %tmp485, i32 %tmp484, i32 1 - %tmp156.bc = bitcast <16 x i8> %tmp156 to <4 x i32> + %tmp156.bc = bitcast <4 x i32> %tmp156 to <4 x i32> %tmp486.bc = bitcast <2 x i32> %tmp486 to <2 x float> %tmp487 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %tmp486.bc, <8 x i32> %tmp154, <4 x i32> %tmp156.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp488 = extractelement <4 x float> %tmp487, i32 0 @@ -1674,7 +1674,7 @@ ENDIF209: ; preds = %ELSE214, %ELSE211, %tmp657 = insertelement <4 x i32> %tmp656, i32 %tmp654, i32 1 %tmp658 = insertelement <4 x i32> %tmp657, i32 %tmp655, i32 2 %tmp659 = insertelement <4 x i32> %tmp658, i32 undef, i32 3 - %tmp128.bc = bitcast <16 x i8> %tmp128 to <4 x i32> + %tmp128.bc = bitcast <4 x i32> %tmp128 to <4 x i32> %tmp659.bc = bitcast <4 x i32> %tmp659 to <4 x float> %tmp660 = call <4 x float> @llvm.amdgcn.image.sample.l.v4f32.v4f32.v8i32(<4 x float> %tmp659.bc, <8 x i32> %tmp126, <4 x i32> %tmp128.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp661 = extractelement <4 x float> %tmp660, i32 0 @@ -1869,7 +1869,7 @@ declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 declare <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 declare <4 x float> @llvm.amdgcn.image.sample.l.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 -declare float @llvm.SI.load.const(<16 x i8>, i32) #1 +declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/si-spill-cf.ll b/test/CodeGen/AMDGPU/si-spill-cf.ll index 926702645d9..2a8ced59dde 100644 --- a/test/CodeGen/AMDGPU/si-spill-cf.ll +++ b/test/CodeGen/AMDGPU/si-spill-cf.ll @@ -9,73 +9,73 @@ define amdgpu_ps void @main() #0 { main_body: - %tmp = call float @llvm.SI.load.const(<16 x i8> undef, i32 16) - %tmp1 = call float @llvm.SI.load.const(<16 x i8> undef, i32 32) - %tmp2 = call float @llvm.SI.load.const(<16 x i8> undef, i32 80) - %tmp3 = call float @llvm.SI.load.const(<16 x i8> undef, i32 84) - %tmp4 = call float @llvm.SI.load.const(<16 x i8> undef, i32 88) - %tmp5 = call float @llvm.SI.load.const(<16 x i8> undef, i32 96) - %tmp6 = call float @llvm.SI.load.const(<16 x i8> undef, i32 100) - %tmp7 = call float @llvm.SI.load.const(<16 x i8> undef, i32 104) - %tmp8 = call float @llvm.SI.load.const(<16 x i8> undef, i32 112) - %tmp9 = call float @llvm.SI.load.const(<16 x i8> undef, i32 116) - %tmp10 = call float @llvm.SI.load.const(<16 x i8> undef, i32 120) - %tmp11 = call float @llvm.SI.load.const(<16 x i8> undef, i32 128) - %tmp12 = call float @llvm.SI.load.const(<16 x i8> undef, i32 132) - %tmp13 = call float @llvm.SI.load.const(<16 x i8> undef, i32 136) - %tmp14 = call float @llvm.SI.load.const(<16 x i8> undef, i32 144) - %tmp15 = call float @llvm.SI.load.const(<16 x i8> undef, i32 148) - %tmp16 = call float @llvm.SI.load.const(<16 x i8> undef, i32 152) - %tmp17 = call float @llvm.SI.load.const(<16 x i8> undef, i32 160) - %tmp18 = call float @llvm.SI.load.const(<16 x i8> undef, i32 164) - %tmp19 = call float @llvm.SI.load.const(<16 x i8> undef, i32 168) - %tmp20 = call float @llvm.SI.load.const(<16 x i8> undef, i32 176) - %tmp21 = call float @llvm.SI.load.const(<16 x i8> undef, i32 180) - %tmp22 = call float @llvm.SI.load.const(<16 x i8> undef, i32 184) - %tmp23 = call float @llvm.SI.load.const(<16 x i8> undef, i32 192) - %tmp24 = call float @llvm.SI.load.const(<16 x i8> undef, i32 196) - %tmp25 = call float @llvm.SI.load.const(<16 x i8> undef, i32 200) - %tmp26 = call float @llvm.SI.load.const(<16 x i8> undef, i32 208) - %tmp27 = call float @llvm.SI.load.const(<16 x i8> undef, i32 212) - %tmp28 = call float @llvm.SI.load.const(<16 x i8> undef, i32 216) - %tmp29 = call float @llvm.SI.load.const(<16 x i8> undef, i32 224) - %tmp30 = call float @llvm.SI.load.const(<16 x i8> undef, i32 228) - %tmp31 = call float @llvm.SI.load.const(<16 x i8> undef, i32 232) - %tmp32 = call float @llvm.SI.load.const(<16 x i8> undef, i32 240) - %tmp33 = call float @llvm.SI.load.const(<16 x i8> undef, i32 244) - %tmp34 = call float @llvm.SI.load.const(<16 x i8> undef, i32 248) - %tmp35 = call float @llvm.SI.load.const(<16 x i8> undef, i32 256) - %tmp36 = call float @llvm.SI.load.const(<16 x i8> undef, i32 260) - %tmp37 = call float @llvm.SI.load.const(<16 x i8> undef, i32 264) - %tmp38 = call float @llvm.SI.load.const(<16 x i8> undef, i32 272) - %tmp39 = call float @llvm.SI.load.const(<16 x i8> undef, i32 276) - %tmp40 = call float @llvm.SI.load.const(<16 x i8> undef, i32 280) - %tmp41 = call float @llvm.SI.load.const(<16 x i8> undef, i32 288) - %tmp42 = call float @llvm.SI.load.const(<16 x i8> undef, i32 292) - %tmp43 = call float @llvm.SI.load.const(<16 x i8> undef, i32 296) - %tmp44 = call float @llvm.SI.load.const(<16 x i8> undef, i32 304) - %tmp45 = call float @llvm.SI.load.const(<16 x i8> undef, i32 308) - %tmp46 = call float @llvm.SI.load.const(<16 x i8> undef, i32 312) - %tmp47 = call float @llvm.SI.load.const(<16 x i8> undef, i32 320) - %tmp48 = call float @llvm.SI.load.const(<16 x i8> undef, i32 324) - %tmp49 = call float @llvm.SI.load.const(<16 x i8> undef, i32 328) - %tmp50 = call float @llvm.SI.load.const(<16 x i8> undef, i32 336) - %tmp51 = call float @llvm.SI.load.const(<16 x i8> undef, i32 340) - %tmp52 = call float @llvm.SI.load.const(<16 x i8> undef, i32 344) - %tmp53 = call float @llvm.SI.load.const(<16 x i8> undef, i32 352) - %tmp54 = call float @llvm.SI.load.const(<16 x i8> undef, i32 356) - %tmp55 = call float @llvm.SI.load.const(<16 x i8> undef, i32 360) - %tmp56 = call float @llvm.SI.load.const(<16 x i8> undef, i32 368) - %tmp57 = call float @llvm.SI.load.const(<16 x i8> undef, i32 372) - %tmp58 = call float @llvm.SI.load.const(<16 x i8> undef, i32 376) - %tmp59 = call float @llvm.SI.load.const(<16 x i8> undef, i32 384) - %tmp60 = call float @llvm.SI.load.const(<16 x i8> undef, i32 388) - %tmp61 = call float @llvm.SI.load.const(<16 x i8> undef, i32 392) - %tmp62 = call float @llvm.SI.load.const(<16 x i8> undef, i32 400) - %tmp63 = call float @llvm.SI.load.const(<16 x i8> undef, i32 404) - %tmp64 = call float @llvm.SI.load.const(<16 x i8> undef, i32 408) - %tmp65 = call float @llvm.SI.load.const(<16 x i8> undef, i32 416) - %tmp66 = call float @llvm.SI.load.const(<16 x i8> undef, i32 420) + %tmp = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 16) + %tmp1 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 32) + %tmp2 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 80) + %tmp3 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 84) + %tmp4 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 88) + %tmp5 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 96) + %tmp6 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 100) + %tmp7 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 104) + %tmp8 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 112) + %tmp9 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 116) + %tmp10 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 120) + %tmp11 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 128) + %tmp12 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 132) + %tmp13 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 136) + %tmp14 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 144) + %tmp15 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 148) + %tmp16 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 152) + %tmp17 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 160) + %tmp18 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 164) + %tmp19 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 168) + %tmp20 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 176) + %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 180) + %tmp22 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 184) + %tmp23 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 192) + %tmp24 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 196) + %tmp25 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 200) + %tmp26 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 208) + %tmp27 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 212) + %tmp28 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 216) + %tmp29 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 224) + %tmp30 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 228) + %tmp31 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 232) + %tmp32 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 240) + %tmp33 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 244) + %tmp34 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 248) + %tmp35 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 256) + %tmp36 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 260) + %tmp37 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 264) + %tmp38 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 272) + %tmp39 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 276) + %tmp40 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 280) + %tmp41 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 288) + %tmp42 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 292) + %tmp43 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 296) + %tmp44 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 304) + %tmp45 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 308) + %tmp46 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 312) + %tmp47 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 320) + %tmp48 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 324) + %tmp49 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 328) + %tmp50 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 336) + %tmp51 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 340) + %tmp52 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 344) + %tmp53 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 352) + %tmp54 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 356) + %tmp55 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 360) + %tmp56 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 368) + %tmp57 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 372) + %tmp58 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 376) + %tmp59 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 384) + %tmp60 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 388) + %tmp61 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 392) + %tmp62 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 400) + %tmp63 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 404) + %tmp64 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 408) + %tmp65 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 416) + %tmp66 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 420) br label %LOOP LOOP: ; preds = %ENDIF2795, %main_body @@ -497,7 +497,7 @@ declare float @llvm.minnum.f32(float, float) #1 declare float @llvm.maxnum.f32(float, float) #1 declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 -declare float @llvm.SI.load.const(<16 x i8>, i32) #1 +declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/smrd.ll b/test/CodeGen/AMDGPU/smrd.ll index 50f72c67059..3f1e1cacb87 100644 --- a/test/CodeGen/AMDGPU/smrd.ll +++ b/test/CodeGen/AMDGPU/smrd.ll @@ -84,34 +84,34 @@ entry: ret void } -; SMRD load using the load.const intrinsic with an immediate offset +; SMRD load using the load.const.v4i32 intrinsic with an immediate offset ; GCN-LABEL: {{^}}smrd_load_const0: ; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 ; encoding: [0x04 ; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x10 -define amdgpu_ps void @smrd_load_const0(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { +define amdgpu_ps void @smrd_load_const0(<4 x i32> addrspace(2)* inreg %arg, <4 x i32> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { main_body: - %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0 - %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp - %tmp21 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 16) + %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %arg, i32 0 + %tmp20 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp + %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 16) call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0 ret void } -; SMRD load using the load.const intrinsic with the largest possible immediate +; SMRD load using the load.const.v4i32 intrinsic with the largest possible immediate ; offset. ; GCN-LABEL: {{^}}smrd_load_const1: ; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff ; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc -define amdgpu_ps void @smrd_load_const1(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { +define amdgpu_ps void @smrd_load_const1(<4 x i32> addrspace(2)* inreg %arg, <4 x i32> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { main_body: - %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0 - %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp - %tmp21 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 1020) + %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %arg, i32 0 + %tmp20 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp + %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 1020) call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0 ret void } -; SMRD load using the load.const intrinsic with an offset greater than the +; SMRD load using the load.const.v4i32 intrinsic with an offset greater than the ; largets possible immediate. ; immediate offset. ; GCN-LABEL: {{^}}smrd_load_const2: @@ -119,11 +119,11 @@ main_body: ; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]] ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100 ; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400 -define amdgpu_ps void @smrd_load_const2(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { +define amdgpu_ps void @smrd_load_const2(<4 x i32> addrspace(2)* inreg %arg, <4 x i32> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { main_body: - %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0 - %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp - %tmp21 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 1024) + %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %arg, i32 0 + %tmp20 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp + %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 1024) call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0 ret void } @@ -134,11 +134,11 @@ main_body: ; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff ; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc -define amdgpu_ps void @smrd_load_const3(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { +define amdgpu_ps void @smrd_load_const3(<4 x i32> addrspace(2)* inreg %arg, <4 x i32> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { main_body: - %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0 - %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp - %tmp21 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 1048572) + %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %arg, i32 0 + %tmp20 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp + %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 1048572) call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0 ret void } @@ -149,17 +149,17 @@ main_body: ; SIVI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000 ; GCN: s_endpgm -define amdgpu_ps void @smrd_load_const4(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { +define amdgpu_ps void @smrd_load_const4(<4 x i32> addrspace(2)* inreg %arg, <4 x i32> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { main_body: - %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0 - %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp - %tmp21 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 1048576) + %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %arg, i32 0 + %tmp20 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp + %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 1048576) call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0 ret void } declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 -declare float @llvm.SI.load.const(<16 x i8>, i32) #1 +declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/split-smrd.ll b/test/CodeGen/AMDGPU/split-smrd.ll index cdb1b1e3b50..5fc69067760 100644 --- a/test/CodeGen/AMDGPU/split-smrd.ll +++ b/test/CodeGen/AMDGPU/split-smrd.ll @@ -8,7 +8,7 @@ ; GCN: image_sample v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1 define amdgpu_ps void @split_smrd_add_worklist([34 x <8 x i32>] addrspace(2)* byval %arg) #0 { bb: - %tmp = call float @llvm.SI.load.const(<16 x i8> undef, i32 96) + %tmp = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 96) %tmp1 = bitcast float %tmp to i32 br i1 undef, label %bb2, label %bb3 @@ -31,7 +31,7 @@ bb3: ; preds = %bb declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1 declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0 declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 -declare float @llvm.SI.load.const(<16 x i8>, i32) #1 +declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll index c9c8583d5e8..ca2366a361f 100644 --- a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll +++ b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll @@ -27,17 +27,17 @@ ; GCN: NumVgprs: 256 ; GCN: ScratchSize: 1536 -define amdgpu_vs void @main([9 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <4 x i32>] addrspace(2)* byval %arg2, [34 x <8 x i32>] addrspace(2)* byval %arg3, [16 x <16 x i8>] addrspace(2)* byval %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10) #0 { +define amdgpu_vs void @main([9 x <4 x i32>] addrspace(2)* byval %arg, [17 x <4 x i32>] addrspace(2)* byval %arg1, [17 x <4 x i32>] addrspace(2)* byval %arg2, [34 x <8 x i32>] addrspace(2)* byval %arg3, [16 x <4 x i32>] addrspace(2)* byval %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10) #0 { bb: - %tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg1, i64 0, i64 0 - %tmp11 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, align 16, !tbaa !0 - %tmp12 = call float @llvm.SI.load.const(<16 x i8> %tmp11, i32 0) - %tmp13 = call float @llvm.SI.load.const(<16 x i8> %tmp11, i32 16) - %tmp14 = call float @llvm.SI.load.const(<16 x i8> %tmp11, i32 32) - %tmp15 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %arg4, i64 0, i64 0 - %tmp16 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp15, align 16, !tbaa !0 + %tmp = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(2)* %arg1, i64 0, i64 0 + %tmp11 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp, align 16, !tbaa !0 + %tmp12 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp11, i32 0) + %tmp13 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp11, i32 16) + %tmp14 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp11, i32 32) + %tmp15 = getelementptr [16 x <4 x i32>], [16 x <4 x i32>] addrspace(2)* %arg4, i64 0, i64 0 + %tmp16 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp15, align 16, !tbaa !0 %tmp17 = add i32 %arg5, %arg7 - %tmp16.cast = bitcast <16 x i8> %tmp16 to <4 x i32> + %tmp16.cast = bitcast <4 x i32> %tmp16 to <4 x i32> %tmp18 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %tmp16.cast, i32 %tmp17, i32 0, i1 false, i1 false) %tmp19 = extractelement <4 x float> %tmp18, i32 0 %tmp20 = extractelement <4 x float> %tmp18, i32 1 @@ -488,7 +488,7 @@ bb157: ; preds = %bb24 declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 -declare float @llvm.SI.load.const(<16 x i8>, i32) #1 +declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1 declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #2 attributes #0 = { nounwind }