mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-24 19:52:54 +01:00
A few minor updates, removing implemented stuff and adding a couple of
new things. llvm-svn: 47458
This commit is contained in:
parent
b3c8d120dc
commit
123fc4b97d
@ -54,6 +54,17 @@ One better solution for 1LL << x is:
|
|||||||
|
|
||||||
But that requires good 8-bit subreg support.
|
But that requires good 8-bit subreg support.
|
||||||
|
|
||||||
|
Also, this might be better. It's an extra shift, but it's one instruction
|
||||||
|
shorter, and doesn't stress 8-bit subreg support.
|
||||||
|
(From http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01148.html,
|
||||||
|
but without the unnecessary and.)
|
||||||
|
movl %ecx, %eax
|
||||||
|
shrl $5, %eax
|
||||||
|
movl %eax, %edx
|
||||||
|
xorl $1, %edx
|
||||||
|
sall %cl, %eax
|
||||||
|
sall %cl, %edx
|
||||||
|
|
||||||
64-bit shifts (in general) expand to really bad code. Instead of using
|
64-bit shifts (in general) expand to really bad code. Instead of using
|
||||||
cmovs, we should expand to a conditional branch like GCC produces.
|
cmovs, we should expand to a conditional branch like GCC produces.
|
||||||
|
|
||||||
@ -67,6 +78,9 @@ into:
|
|||||||
xorl $1, %eax
|
xorl $1, %eax
|
||||||
ret
|
ret
|
||||||
|
|
||||||
|
(Although note that this isn't a legal way to express the code that llvm-gcc
|
||||||
|
currently generates for that function.)
|
||||||
|
|
||||||
//===---------------------------------------------------------------------===//
|
//===---------------------------------------------------------------------===//
|
||||||
|
|
||||||
Some isel ideas:
|
Some isel ideas:
|
||||||
@ -94,34 +108,6 @@ the coalescer how to deal with it though.
|
|||||||
|
|
||||||
//===---------------------------------------------------------------------===//
|
//===---------------------------------------------------------------------===//
|
||||||
|
|
||||||
Count leading zeros and count trailing zeros:
|
|
||||||
|
|
||||||
int clz(int X) { return __builtin_clz(X); }
|
|
||||||
int ctz(int X) { return __builtin_ctz(X); }
|
|
||||||
|
|
||||||
$ gcc t.c -S -o - -O3 -fomit-frame-pointer -masm=intel
|
|
||||||
clz:
|
|
||||||
bsr %eax, DWORD PTR [%esp+4]
|
|
||||||
xor %eax, 31
|
|
||||||
ret
|
|
||||||
ctz:
|
|
||||||
bsf %eax, DWORD PTR [%esp+4]
|
|
||||||
ret
|
|
||||||
|
|
||||||
however, check that these are defined for 0 and 32. Our intrinsics are, GCC's
|
|
||||||
aren't.
|
|
||||||
|
|
||||||
Another example (use predsimplify to eliminate a select):
|
|
||||||
|
|
||||||
int foo (unsigned long j) {
|
|
||||||
if (j)
|
|
||||||
return __builtin_ffs (j) - 1;
|
|
||||||
else
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
//===---------------------------------------------------------------------===//
|
|
||||||
|
|
||||||
It appears icc uses push for parameter passing. Need to investigate.
|
It appears icc uses push for parameter passing. Need to investigate.
|
||||||
|
|
||||||
//===---------------------------------------------------------------------===//
|
//===---------------------------------------------------------------------===//
|
||||||
@ -236,32 +222,6 @@ which is probably slower, but it's interesting at least :)
|
|||||||
|
|
||||||
//===---------------------------------------------------------------------===//
|
//===---------------------------------------------------------------------===//
|
||||||
|
|
||||||
The first BB of this code:
|
|
||||||
|
|
||||||
declare bool %foo()
|
|
||||||
int %bar() {
|
|
||||||
%V = call bool %foo()
|
|
||||||
br bool %V, label %T, label %F
|
|
||||||
T:
|
|
||||||
ret int 1
|
|
||||||
F:
|
|
||||||
call bool %foo()
|
|
||||||
ret int 12
|
|
||||||
}
|
|
||||||
|
|
||||||
compiles to:
|
|
||||||
|
|
||||||
_bar:
|
|
||||||
subl $12, %esp
|
|
||||||
call L_foo$stub
|
|
||||||
xorb $1, %al
|
|
||||||
testb %al, %al
|
|
||||||
jne LBB_bar_2 # F
|
|
||||||
|
|
||||||
It would be better to emit "cmp %al, 1" than a xor and test.
|
|
||||||
|
|
||||||
//===---------------------------------------------------------------------===//
|
|
||||||
|
|
||||||
We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl
|
We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl
|
||||||
We should leave these as libcalls for everything over a much lower threshold,
|
We should leave these as libcalls for everything over a much lower threshold,
|
||||||
since libc is hand tuned for medium and large mem ops (avoiding RFO for large
|
since libc is hand tuned for medium and large mem ops (avoiding RFO for large
|
||||||
@ -483,19 +443,24 @@ shorter than movl + leal.
|
|||||||
|
|
||||||
//===---------------------------------------------------------------------===//
|
//===---------------------------------------------------------------------===//
|
||||||
|
|
||||||
Implement CTTZ, CTLZ with bsf and bsr. GCC produces:
|
__builtin_ffs codegen is messy.
|
||||||
|
|
||||||
int ctz_(unsigned X) { return __builtin_ctz(X); }
|
|
||||||
int clz_(unsigned X) { return __builtin_clz(X); }
|
|
||||||
int ffs_(unsigned X) { return __builtin_ffs(X); }
|
int ffs_(unsigned X) { return __builtin_ffs(X); }
|
||||||
|
|
||||||
_ctz_:
|
llvm produces:
|
||||||
bsfl 4(%esp), %eax
|
ffs_:
|
||||||
ret
|
movl 4(%esp), %ecx
|
||||||
_clz_:
|
bsfl %ecx, %eax
|
||||||
bsrl 4(%esp), %eax
|
movl $32, %edx
|
||||||
xorl $31, %eax
|
cmove %edx, %eax
|
||||||
|
incl %eax
|
||||||
|
xorl %edx, %edx
|
||||||
|
testl %ecx, %ecx
|
||||||
|
cmove %edx, %eax
|
||||||
ret
|
ret
|
||||||
|
|
||||||
|
vs gcc:
|
||||||
|
|
||||||
_ffs_:
|
_ffs_:
|
||||||
movl $-1, %edx
|
movl $-1, %edx
|
||||||
bsfl 4(%esp), %eax
|
bsfl 4(%esp), %eax
|
||||||
@ -503,6 +468,15 @@ _ffs_:
|
|||||||
addl $1, %eax
|
addl $1, %eax
|
||||||
ret
|
ret
|
||||||
|
|
||||||
|
Another example of __builtin_ffs (use predsimplify to eliminate a select):
|
||||||
|
|
||||||
|
int foo (unsigned long j) {
|
||||||
|
if (j)
|
||||||
|
return __builtin_ffs (j) - 1;
|
||||||
|
else
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
//===---------------------------------------------------------------------===//
|
//===---------------------------------------------------------------------===//
|
||||||
|
|
||||||
It appears gcc places string data with linkonce linkage in
|
It appears gcc places string data with linkonce linkage in
|
||||||
@ -1062,6 +1036,8 @@ Should compile to:
|
|||||||
setae %al
|
setae %al
|
||||||
ret
|
ret
|
||||||
|
|
||||||
|
FIXME: That code looks wrong; bool return is normally defined as zext.
|
||||||
|
|
||||||
on x86-64, not:
|
on x86-64, not:
|
||||||
|
|
||||||
__Z11no_overflowjj:
|
__Z11no_overflowjj:
|
||||||
@ -1208,35 +1184,44 @@ void compare (long long foo) {
|
|||||||
|
|
||||||
to:
|
to:
|
||||||
|
|
||||||
_compare:
|
compare:
|
||||||
subl $12, %esp
|
subl $4, %esp
|
||||||
cmpl $0, 16(%esp)
|
cmpl $0, 8(%esp)
|
||||||
setne %al
|
setne %al
|
||||||
movzbw %al, %ax
|
movzbw %al, %ax
|
||||||
cmpl $1, 20(%esp)
|
cmpl $1, 12(%esp)
|
||||||
setg %cl
|
setg %cl
|
||||||
movzbw %cl, %cx
|
movzbw %cl, %cx
|
||||||
cmove %ax, %cx
|
cmove %ax, %cx
|
||||||
movw %cx, %ax
|
testb $1, %cl
|
||||||
testb $1, %al
|
jne .LBB1_2 # UnifiedReturnBlock
|
||||||
je LBB1_2 # cond_true
|
.LBB1_1: # ifthen
|
||||||
|
call abort
|
||||||
|
.LBB1_2: # UnifiedReturnBlock
|
||||||
|
addl $4, %esp
|
||||||
|
ret
|
||||||
|
|
||||||
(also really horrible code on ppc). This is due to the expand code for 64-bit
|
(also really horrible code on ppc). This is due to the expand code for 64-bit
|
||||||
compares. GCC produces multiple branches, which is much nicer:
|
compares. GCC produces multiple branches, which is much nicer:
|
||||||
|
|
||||||
_compare:
|
compare:
|
||||||
pushl %ebp
|
subl $12, %esp
|
||||||
movl %esp, %ebp
|
movl 20(%esp), %edx
|
||||||
subl $8, %esp
|
movl 16(%esp), %eax
|
||||||
movl 8(%ebp), %eax
|
decl %edx
|
||||||
movl 12(%ebp), %edx
|
jle .L7
|
||||||
subl $1, %edx
|
.L5:
|
||||||
jg L5
|
addl $12, %esp
|
||||||
L7:
|
ret
|
||||||
jl L4
|
.p2align 4,,7
|
||||||
|
.L7:
|
||||||
|
jl .L4
|
||||||
cmpl $0, %eax
|
cmpl $0, %eax
|
||||||
jbe L4
|
.p2align 4,,8
|
||||||
L5:
|
ja .L5
|
||||||
|
.L4:
|
||||||
|
.p2align 4,,9
|
||||||
|
call abort
|
||||||
|
|
||||||
//===---------------------------------------------------------------------===//
|
//===---------------------------------------------------------------------===//
|
||||||
|
|
||||||
@ -1380,7 +1365,7 @@ Should compile into:
|
|||||||
|
|
||||||
_foo:
|
_foo:
|
||||||
movzwl 4(%esp), %eax
|
movzwl 4(%esp), %eax
|
||||||
orb $-1, %al ;; 'orl 255' is also fine :)
|
orl $255, %eax
|
||||||
ret
|
ret
|
||||||
|
|
||||||
instead of:
|
instead of:
|
||||||
@ -1550,6 +1535,48 @@ See PR2053 for more details.
|
|||||||
|
|
||||||
//===---------------------------------------------------------------------===//
|
//===---------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
We should investigate using cdq/cltd (effect: edx = sar eax, 31)
|
||||||
|
more aggressively; it should cost the same as a move+shift on any modern
|
||||||
|
processor, but it's a lot shorter. Downside is that it puts more
|
||||||
|
pressure on register allocation because it has fixed operands.
|
||||||
|
|
||||||
|
Example:
|
||||||
|
int abs(int x) {return x < 0 ? -x : x;}
|
||||||
|
|
||||||
|
gcc compiles this to the following when using march/mtune=pentium2/3/4/m/etc.:
|
||||||
|
abs:
|
||||||
|
movl 4(%esp), %eax
|
||||||
|
cltd
|
||||||
|
xorl %edx, %eax
|
||||||
|
subl %edx, %eax
|
||||||
|
ret
|
||||||
|
|
||||||
|
//===---------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
Consider:
|
||||||
|
|
||||||
|
#include <inttypes.h>
|
||||||
|
uint64_t a;
|
||||||
|
uint16_t b;
|
||||||
|
uint64_t mul(void) {
|
||||||
|
return a * b;
|
||||||
|
}
|
||||||
|
|
||||||
|
Currently, we generate the following:
|
||||||
|
|
||||||
|
mul:
|
||||||
|
movzwl b, %ecx
|
||||||
|
movl %ecx, %eax
|
||||||
|
mull a
|
||||||
|
imull a+4, %ecx
|
||||||
|
addl %edx, %ecx
|
||||||
|
movl %ecx, %edx
|
||||||
|
ret
|
||||||
|
|
||||||
|
llvm should be able to commute the addl so that the movl isn't necessary.
|
||||||
|
|
||||||
|
//===---------------------------------------------------------------------===//
|
||||||
|
|
||||||
Consider:
|
Consider:
|
||||||
int test(unsigned long a, unsigned long b) { return -(a < b); }
|
int test(unsigned long a, unsigned long b) { return -(a < b); }
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user