llvm-mirror/test/CodeGen/X86/win-alloca-expander.ll
Kyle Butt 96c1e7e4f0 Codegen: Make chains from trellis-shaped CFGs
Lay out trellis-shaped CFGs optimally.
A trellis of the shape below:

  A     B
  |\   /|
  | \ / |
  |  X  |
  | / \ |
  |/   \|
  C     D

would be laid out as A; B->C; D by the current layout algorithm. Now we identify
trellises and lay them out either A->C; B->D or A->D; B->C. This scales to any
number of predecessors: a trellis is a group of 2 or more predecessor blocks
that all have the same successors.
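
In IR terms, a two-predecessor trellis like the diagram above looks like the
function below (a minimal sketch; the @left, @right, @c and @d helpers are
hypothetical, present only to give the blocks bodies):

define void @trellis(i1 %p, i1 %q) {
entry:
  br i1 %p, label %A, label %B
A:                                      ; first predecessor
  call void @left()
  br i1 %q, label %C, label %D
B:                                      ; second predecessor
  call void @right()
  br i1 %q, label %C, label %D
C:                                      ; shared successor
  call void @c()
  ret void
D:                                      ; shared successor
  call void @d()
  ret void
}
declare void @left()
declare void @right()
declare void @c()
declare void @d()

A and B both branch to the same pair {C, D}, so they form a trellis, and the
new code chains them as A->C; B->D or A->D; B->C rather than A; B->C; D.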

Because trellises are defined by their shared successors, tail duplication can
be used to extend existing trellises.

As an example consider the following CFG:

    B   D   F   H
   / \ / \ / \ / \
  A---C---E---G---Ret

Where A, C, E, and G are all small (currently 2 instructions).

The CFG-preserving layout is then A,B,C,D,E,F,G,H,Ret.

The current code will copy C into B, E into D, and G into F, yielding the layout
A,C,B(C),E,D(E),G,F(G),H,Ret

define void @straight_test(i32 %tag) {
entry:
  br label %test1
test1: ; A
  %tagbit1 = and i32 %tag, 1
  %tagbit1eq0 = icmp eq i32 %tagbit1, 0
  br i1 %tagbit1eq0, label %test2, label %optional1
optional1: ; B
  call void @a()
  br label %test2
test2: ; C
  %tagbit2 = and i32 %tag, 2
  %tagbit2eq0 = icmp eq i32 %tagbit2, 0
  br i1 %tagbit2eq0, label %test3, label %optional2
optional2: ; D
  call void @b()
  br label %test3
test3: ; E
  %tagbit3 = and i32 %tag, 4
  %tagbit3eq0 = icmp eq i32 %tagbit3, 0
  br i1 %tagbit3eq0, label %test4, label %optional3
optional3: ; F
  call void @c()
  br label %test4
test4: ; G
  %tagbit4 = and i32 %tag, 8
  %tagbit4eq0 = icmp eq i32 %tagbit4, 0
  br i1 %tagbit4eq0, label %exit, label %optional4
optional4: ; H
  call void @d()
  br label %exit
exit:
  ret void
}

Here is the layout after D27742:
straight_test:                          # @straight_test
; ... Prologue elided
; BB#0:                                 # %entry ; A (merged with test1)
; ... More prologue elided
	mr 30, 3
	andi. 3, 30, 1
	bc 12, 1, .LBB0_2
; BB#1:                                 # %test2 ; C
	rlwinm. 3, 30, 0, 30, 30
	beq	 0, .LBB0_3
	b .LBB0_4
.LBB0_2:                                # %optional1 ; B (copy of C)
	bl a
	nop
	rlwinm. 3, 30, 0, 30, 30
	bne	 0, .LBB0_4
.LBB0_3:                                # %test3 ; E
	rlwinm. 3, 30, 0, 29, 29
	beq	 0, .LBB0_5
	b .LBB0_6
.LBB0_4:                                # %optional2 ; D (copy of E)
	bl b
	nop
	rlwinm. 3, 30, 0, 29, 29
	bne	 0, .LBB0_6
.LBB0_5:                                # %test4 ; G
	rlwinm. 3, 30, 0, 28, 28
	beq	 0, .LBB0_8
	b .LBB0_7
.LBB0_6:                                # %optional3 ; F (copy of G)
	bl c
	nop
	rlwinm. 3, 30, 0, 28, 28
	beq	 0, .LBB0_8
.LBB0_7:                                # %optional4 ; H
	bl d
	nop
.LBB0_8:                                # %exit ; Ret
	ld 30, 96(1)                    # 8-byte Folded Reload
	addi 1, 1, 112
	ld 0, 16(1)
	mtlr 0
	blr

The tail duplication has produced some benefit, but it has also produced a
trellis which is not laid out optimally. With this patch, we improve the
layout of such trellises and decrease the computed cost of tail duplication
accordingly.

This patch produces the layout A,C,E,G,B,D,F,H,Ret. This layout does have
back edges, which is a negative, but it has a bigger compensating positive:
it handles long runs of skipped blocks much better than the original layout,
while both layouts handle runs of executed blocks equally well. Branch
prediction also improves if there is any correlation between successive
optional blocks.

Here is the resulting concrete layout:

straight_test:                          # @straight_test
; BB#0:                                 # %entry ; A (merged with test1)
	mr 30, 3
	andi. 3, 30, 1
	bc 12, 1, .LBB0_4
; BB#1:                                 # %test2 ; C
	rlwinm. 3, 30, 0, 30, 30
	bne	 0, .LBB0_5
.LBB0_2:                                # %test3 ; E
	rlwinm. 3, 30, 0, 29, 29
	bne	 0, .LBB0_6
.LBB0_3:                                # %test4 ; G
	rlwinm. 3, 30, 0, 28, 28
	bne	 0, .LBB0_7
	b .LBB0_8
.LBB0_4:                                # %optional1 ; B (Copy of C)
	bl a
	nop
	rlwinm. 3, 30, 0, 30, 30
	beq	 0, .LBB0_2
.LBB0_5:                                # %optional2 ; D (Copy of E)
	bl b
	nop
	rlwinm. 3, 30, 0, 29, 29
	beq	 0, .LBB0_3
.LBB0_6:                                # %optional3 ; F (Copy of G)
	bl c
	nop
	rlwinm. 3, 30, 0, 28, 28
	beq	 0, .LBB0_8
.LBB0_7:                                # %optional4 ; H
	bl d
	nop
.LBB0_8:                                # %exit

Differential Revision: https://reviews.llvm.org/D28522

llvm-svn: 295223
2017-02-15 19:49:14 +00:00


; RUN: llc < %s -mtriple=i686-pc-win32 | FileCheck %s
; RUN: llc < %s -mtriple=i686-pc-win32 -O0
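
; Summary of the lowering this test expects (inferred from the checks below):
; allocas smaller than the stack probe size (one 4096-byte page by default, or
; the "stack-probe-size" attribute when present) become direct %esp
; adjustments, preceded by a stack touch (the pushl %eax + subl pattern)
; whenever the distance to the last-touched stack location is unknown or
; would exceed a page; larger allocas call __chkstk to probe each page.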
%struct.S = type { [1024 x i8] }
%struct.T = type { [3000 x i8] }
%struct.U = type { [10000 x i8] }

define void @basics() {
; CHECK-LABEL: basics:
entry:
  br label %bb1
; Allocation move sizes should have been removed.
; CHECK-NOT: movl $1024
; CHECK-NOT: movl $3000
bb1:
  %p0 = alloca %struct.S
; The allocation is small enough not to require stack probing, but the %esp
; offset after the prologue is not known, so the stack must be touched before
; the pointer is adjusted.
; CHECK: pushl %eax
; CHECK: subl $1020, %esp
  %saved_stack = tail call i8* @llvm.stacksave()
  %p1 = alloca %struct.S
; We know the %esp offset from above, so there is no need to touch the stack
; before adjusting it.
; CHECK: subl $1024, %esp
  %p2 = alloca %struct.T
; The offset is now 2048 bytes, so allocating a T must touch the stack again.
; CHECK: pushl %eax
; CHECK: subl $2996, %esp
  call void @f(%struct.S* %p0)
; CHECK: calll
  %p3 = alloca %struct.T
; The call above touched the stack, so there is room for a T object.
; CHECK: subl $3000, %esp
  %p4 = alloca %struct.U
; The U object is large enough to require stack probing.
; CHECK: movl $10000, %eax
; CHECK: calll __chkstk
  %p5 = alloca %struct.T
; The stack probing above touched the tip of the stack, so there's room for a T.
; CHECK: subl $3000, %esp
  call void @llvm.stackrestore(i8* %saved_stack)
  %p6 = alloca %struct.S
; The stack restore means we lose track of the stack pointer and must probe.
; CHECK: pushl %eax
; CHECK: subl $1020, %esp
; Use the pointers so they're not optimized away.
  call void @f(%struct.S* %p1)
  call void @g(%struct.T* %p2)
  call void @g(%struct.T* %p3)
  call void @h(%struct.U* %p4)
  call void @g(%struct.T* %p5)
  ret void
}

define void @loop() {
; CHECK-LABEL: loop:
entry:
  br label %bb1
bb1:
  %p1 = alloca %struct.S
; The entry offset is unknown; touch-and-sub.
; CHECK: pushl %eax
; CHECK: subl $1020, %esp
  br label %loop1
loop1:
  %i1 = phi i32 [ 10, %bb1 ], [ %dec1, %loop1 ]
  %p2 = alloca %struct.S
; We know the incoming offset from bb1, but from the back-edge, we assume the
; worst, and therefore touch-and-sub to allocate.
; CHECK: pushl %eax
; CHECK: subl $1020, %esp
  %dec1 = sub i32 %i1, 1
  %cmp1 = icmp sgt i32 %i1, 0
  br i1 %cmp1, label %loop1, label %end
; CHECK: decl
; CHECK: jg
end:
  call void @f(%struct.S* %p1)
  call void @f(%struct.S* %p2)
  ret void
}

define void @probe_size_attribute() "stack-probe-size"="512" {
; CHECK-LABEL: probe_size_attribute:
entry:
  br label %bb1
bb1:
  %p0 = alloca %struct.S
; The allocation would be small enough not to require probing, were it not
; for the stack-probe-size attribute.
; CHECK: movl $1024, %eax
; CHECK: calll __chkstk
  call void @f(%struct.S* %p0)
  ret void
}

define void @cfg(i1 %x, i1 %y) {
; Test that the blocks are analyzed in the correct order.
; CHECK-LABEL: cfg:
entry:
  br i1 %x, label %bb1, label %bb3
bb1:
  %p1 = alloca %struct.S
; CHECK: pushl %eax
; CHECK: subl $1020, %esp
  br label %bb4
bb2:
  %p5 = alloca %struct.T
; CHECK: pushl %eax
; CHECK: subl $2996, %esp
  call void @g(%struct.T* %p5)
  ret void
bb3:
  %p2 = alloca %struct.T
; CHECK: pushl %eax
; CHECK: subl $2996, %esp
  br label %bb4
bb4:
  br i1 %y, label %bb5, label %bb2
bb5:
  %p4 = alloca %struct.S
; CHECK: subl $1024, %esp
  call void @f(%struct.S* %p4)
  ret void
}

declare void @f(%struct.S*)
declare void @g(%struct.T*)
declare void @h(%struct.U*)
declare i8* @llvm.stacksave()
declare void @llvm.stackrestore(i8*)