mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-23 11:13:28 +01:00
53516796c3
This is an optimized approach for D94155. The previous code built a model in which the tile config register was a use of each AMX instruction. That caused a problem for tile config register spills: across function calls, an ldtilecfg instruction could be inserted before each AMX instruction that uses the tile config register, clobbering all tile data registers. To fix this issue, we remove the tile-config-register model. Instead, we analyze the AMX instructions between one call and the next, and insert ldtilecfg after the first call if we find any AMX instructions. Reviewed By: LuoYuanke Differential Revision: https://reviews.llvm.org/D95136
124 lines
6.0 KiB
LLVM
124 lines
6.0 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs | FileCheck %s

; Backing store for the AMX tile loads/stores below: three tiles laid out
; back-to-back at byte offsets 0, 1024 and 2048 (see the getelementptr
; constants in the calls), 16-byte aligned.
@buf = dso_local global [3072 x i8] zeroinitializer, align 16
; test1: straight-line AMX code ending in a tail call.  The checks pin the
; expected config handling: the tile palette is written to the stack and a
; single ldtilecfg is issued before the first tile instruction; no reload is
; needed afterwards, and tilerelease runs before the tail jump to @foo.
define dso_local void @test1(i16 signext %0, i16 signext %1) nounwind {
; CHECK-LABEL: test1:
; CHECK: # %bb.0:
; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0
; CHECK-NEXT: vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movb $1, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movw %si, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movw %si, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movl $buf, %eax
; CHECK-NEXT: movl $32, %ecx
; CHECK-NEXT: movw $8, %dx
; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm0
; CHECK-NEXT: movl $buf+1024, %eax
; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm1
; CHECK-NEXT: movl $buf+2048, %eax
; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm2
; CHECK-NEXT: tdpbssd %tmm1, %tmm0, %tmm2
; CHECK-NEXT: tilestored %tmm2, (%rax,%rcx)
; CHECK-NEXT: tilerelease
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: jmp foo # TAILCALL
  ; Load three tiles (%0 x 8, 8 x %1, %0 x %1), multiply-accumulate with
  ; tdpbssd, store the result back at offset 2048, then tail-call @foo.
%3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 8, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32)
%4 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32)
%5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32)
%6 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %5, x86_amx %3, x86_amx %4)
tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, x86_amx %6)
tail call void @foo()
ret void
}
; test2: AMX instructions appear AFTER a call, in both arms of a branch.
; This is the key case for D95136: the call to @foo clobbers the tile
; configuration, so the checks require a second ldtilecfg immediately after
; `callq foo` (in addition to the one emitted before it), rather than one
; ldtilecfg per AMX instruction.
define dso_local void @test2(i16 signext %0, i16 signext %1) nounwind {
; CHECK-LABEL: test2:
; CHECK: # %bb.0:
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: subq $72, %rsp
; CHECK-NEXT: movl %esi, %ebx
; CHECK-NEXT: movl %edi, %ebp
; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0
; CHECK-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq foo
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: testb %al, %al
; CHECK-NEXT: jne .LBB1_3
; CHECK-NEXT: # %bb.1: # %if.true
; CHECK-NEXT: movw $8, %ax
; CHECK-NEXT: tilezero %tmm0
; CHECK-NEXT: movl $32, %ecx
; CHECK-NEXT: movl $buf+1024, %edx
; CHECK-NEXT: tileloadd (%rdx,%rcx), %tmm1
; CHECK-NEXT: movl $buf+2048, %edx
; CHECK-NEXT: tileloadd (%rdx,%rcx), %tmm2
; CHECK-NEXT: tdpbssd %tmm2, %tmm1, %tmm0
; CHECK-NEXT: tilestored %tmm0, (%rdx,%rcx)
; CHECK-NEXT: jmp .LBB1_2
; CHECK-NEXT: .LBB1_3: # %if.false
; CHECK-NEXT: movl $buf, %eax
; CHECK-NEXT: movl $32, %ecx
; CHECK-NEXT: movw $8, %dx
; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm3
; CHECK-NEXT: movl $buf+1024, %eax
; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm4
; CHECK-NEXT: movl $buf+2048, %eax
; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm2
; CHECK-NEXT: tdpbssd %tmm2, %tmm4, %tmm3
; CHECK-NEXT: tilestored %tmm3, (%rax,%rcx)
; CHECK-NEXT: .LBB1_2: # %if.true
; CHECK-NEXT: addq $72, %rsp
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: tilerelease
; CHECK-NEXT: retq
  ; Call first, then branch into two AMX regions.  The `br i1 undef` is
  ; deliberate test-input style: both successors must be config-correct.
call void @foo()
br i1 undef, label %if.true, label %if.false

if.true:
; tilezero-sourced accumulator path.
%t1 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %0, i16 8)
%t2 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32)
%t3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32)
%t4 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %t1, x86_amx %t2, x86_amx %t3)
tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, x86_amx %t4)
br label %exit

if.false:
; tileloadd-sourced accumulator path (same shape as test1's body).
%t5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 8, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32)
%t6 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32)
%t7 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32)
%t8 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %t5, x86_amx %t6, x86_amx %t7)
tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, x86_amx %t8)
br label %exit

exit:
ret void
}
; External function used to force a call boundary in the tests above.
declare dso_local void @foo() nounwind

; AMX intrinsics under test; all consume the tile configuration that the
; pass being tested must keep valid (ldtilecfg placement).
declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)