A few minor updates, removing implemented stuff and adding a couple of
new things.

llvm-svn: 47458
Eli Friedman 2008-02-21 21:16:49 +00:00
parent b3c8d120dc
commit 123fc4b97d


@@ -54,6 +54,17 @@ One better solution for 1LL << x is:
But that requires good 8-bit subreg support.
Also, this might be better. It's an extra shift, but it's one instruction
shorter, and doesn't stress 8-bit subreg support.
(From http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01148.html,
but without the unnecessary and.)
movl %ecx, %eax
shrl $5, %eax
movl %eax, %edx
xorl $1, %edx
sall %cl, %eax
sall %cl, %edx
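For reference, a C sketch of what that sequence computes; the function name and
the hi/lo word layout are illustrative assumptions, not part of the original note.

#include <stdint.h>

/* Sketch of the branchless 1LL << x expansion above, assuming 32-bit words
   and 0 <= x < 64: bit 5 of x selects which word receives 1 << (x & 31). */
uint64_t shift_one_left(unsigned x) {
  uint32_t sel = x >> 5;               /* shrl $5: 1 iff x >= 32 */
  uint32_t hi = sel << (x & 31);       /* sall %cl on one copy */
  uint32_t lo = (sel ^ 1) << (x & 31); /* sall %cl on the other copy */
  return ((uint64_t)hi << 32) | lo;
}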
64-bit shifts (in general) expand to really bad code. Instead of using
cmovs, we should expand to a conditional branch like GCC produces.
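What the suggested branch-based expansion might look like in C (hypothetical
helper, assuming the 64-bit value is split into two 32-bit words):

#include <stdint.h>

/* Hypothetical sketch of a branch-based expansion of a variable 64-bit left
   shift on a 32-bit target, instead of a cmov-heavy sequence. */
uint64_t shl64(uint32_t lo, uint32_t hi, unsigned n) {
  if (n >= 32) {          /* the whole low word shifts out */
    hi = lo << (n - 32);
    lo = 0;
  } else if (n != 0) {    /* guard n == 0: lo >> 32 would be undefined */
    hi = (hi << n) | (lo >> (32 - n));
    lo <<= n;
  }
  return ((uint64_t)hi << 32) | lo;
}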
@@ -67,6 +78,9 @@ into:
xorl $1, %eax
ret
(Although note that this isn't a legal way to express the code that llvm-gcc
currently generates for that function.)
//===----------------------------------------------------------------------===//
Some isel ideas:
@@ -94,34 +108,6 @@ the coalescer how to deal with it though.
//===----------------------------------------------------------------------===//
Count leading zeros and count trailing zeros:
int clz(int X) { return __builtin_clz(X); }
int ctz(int X) { return __builtin_ctz(X); }
$ gcc t.c -S -o - -O3 -fomit-frame-pointer -masm=intel
clz:
bsr %eax, DWORD PTR [%esp+4]
xor %eax, 31
ret
ctz:
bsf %eax, DWORD PTR [%esp+4]
ret
however, check that these are defined for 0 and 32. Our intrinsics are, GCC's
aren't.
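To illustrate the definedness gap (the guarded wrappers below are hypothetical,
not part of this note):

/* __builtin_clz(0) and __builtin_ctz(0) are undefined in GCC, so lowering
   them straight to bsr/bsf is only safe when the operand is known non-zero.
   Hypothetical wrappers that pin down a result for 0: */
int clz_guarded(unsigned x) { return x ? __builtin_clz(x) : 32; }
int ctz_guarded(unsigned x) { return x ? __builtin_ctz(x) : 32; }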
Another example (use predsimplify to eliminate a select):
int foo (unsigned long j) {
if (j)
return __builtin_ffs (j) - 1;
else
return 0;
}
//===---------------------------------------------------------------------===//
It appears icc uses push for parameter passing. Need to investigate.
//===----------------------------------------------------------------------===//
@@ -236,32 +222,6 @@ which is probably slower, but it's interesting at least :)
//===----------------------------------------------------------------------===//
The first BB of this code:
declare bool %foo()
int %bar() {
%V = call bool %foo()
br bool %V, label %T, label %F
T:
ret int 1
F:
call bool %foo()
ret int 12
}
compiles to:
_bar:
subl $12, %esp
call L_foo$stub
xorb $1, %al
testb %al, %al
jne LBB_bar_2 # F
It would be better to emit "cmp %al, 1" than a xor and test.
//===---------------------------------------------------------------------===//
We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl
We should leave these as libcalls for everything over a much lower threshold,
since libc is hand tuned for medium and large mem ops (avoiding RFO for large
@@ -483,19 +443,24 @@ shorter than movl + leal.
//===----------------------------------------------------------------------===//
__builtin_ffs codegen is messy.
int ffs_(unsigned X) { return __builtin_ffs(X); }
llvm produces:
ffs_:
movl 4(%esp), %ecx
bsfl %ecx, %eax
movl $32, %edx
cmove %edx, %eax
incl %eax
xorl %edx, %edx
testl %ecx, %ecx
cmove %edx, %eax
ret
vs gcc:
_ffs_:
movl $-1, %edx
bsfl 4(%esp), %eax
@@ -503,6 +468,15 @@ _ffs_:
addl $1, %eax
ret
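For context, a reference version of what __builtin_ffs has to compute; the
zero-input case is why the llvm sequence above needs the extra test/cmove pair.
This helper is illustrative only.

/* Reference semantics of ffs: 1-based index of the least significant set
   bit, or 0 when the input is 0. */
int ffs_ref(unsigned x) {
  if (x == 0)
    return 0;
  int n = 1;
  while (!(x & 1)) {
    x >>= 1;
    n++;
  }
  return n;
}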
Another example of __builtin_ffs (use predsimplify to eliminate a select):
int foo (unsigned long j) {
if (j)
return __builtin_ffs (j) - 1;
else
return 0;
}
//===----------------------------------------------------------------------===//
It appears gcc places string data with linkonce linkage in
@@ -1062,6 +1036,8 @@ Should compile to:
setae %al
ret
FIXME: That code looks wrong; bool return is normally defined as zext.
on x86-64, not:
__Z11no_overflowjj:
@@ -1208,35 +1184,44 @@ void compare (long long foo) {
to:
compare:
subl $4, %esp
cmpl $0, 8(%esp)
setne %al
movzbw %al, %ax
cmpl $1, 12(%esp)
setg %cl
movzbw %cl, %cx
cmove %ax, %cx
testb $1, %cl
jne .LBB1_2 # UnifiedReturnBlock
.LBB1_1: # ifthen
call abort
.LBB1_2: # UnifiedReturnBlock
addl $4, %esp
ret
(also really horrible code on ppc). This is due to the expand code for 64-bit
compares. GCC produces multiple branches, which is much nicer:
compare:
subl $12, %esp
movl 20(%esp), %edx
movl 16(%esp), %eax
decl %edx
jle .L7
.L5:
addl $12, %esp
ret
.p2align 4,,7
.L7:
jl .L4
cmpl $0, %eax
.p2align 4,,8
ja .L5
.L4:
.p2align 4,,9
call abort
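A C sketch of the word-wise, branch-based comparison gcc is doing here
(hypothetical helper; assumes the 64-bit operand is already split into 32-bit
words):

#include <stdint.h>

/* Signed 64-bit '<' against a constant on a 32-bit target, expanded with
   branches: decide on the high words first, fall back to an unsigned
   comparison of the low words only when the high words are equal. */
int lt64(int32_t hi, uint32_t lo, int32_t chi, uint32_t clo) {
  if (hi != chi)
    return hi < chi;
  return lo < clo;
}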
//===----------------------------------------------------------------------===//
@@ -1380,7 +1365,7 @@ Should compile into:
_foo:
movzwl 4(%esp), %eax
orl $255, %eax
ret
instead of:
@@ -1550,6 +1535,48 @@ See PR2053 for more details.
//===----------------------------------------------------------------------===//
We should investigate using cdq/cltd (effect: edx = sar eax, 31)
more aggressively; it should cost the same as a move+shift on any modern
processor, but it's a lot shorter. Downside is that it puts more
pressure on register allocation because it has fixed operands.
Example:
int abs(int x) {return x < 0 ? -x : x;}
gcc compiles this to the following when using march/mtune=pentium2/3/4/m/etc.:
abs:
movl 4(%esp), %eax
cltd
xorl %edx, %eax
subl %edx, %eax
ret
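In C, the branchless trick that gcc output implements looks roughly like this
(assumes arithmetic right shift of a negative int; the helper name is made up):

/* Branchless abs: mask is all-ones when x is negative (what cltd leaves in
   %edx), and (x ^ mask) - mask negates x exactly when mask is -1. */
int abs_branchless(int x) {
  int mask = x >> 31;       /* cltd: sign bit replicated into every bit */
  return (x ^ mask) - mask; /* xorl %edx, %eax ; subl %edx, %eax */
}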
//===---------------------------------------------------------------------===//
Consider:
#include <inttypes.h>
uint64_t a;
uint16_t b;
uint64_t mul(void) {
return a * b;
}
Currently, we generate the following:
mul:
movzwl b, %ecx
movl %ecx, %eax
mull a
imull a+4, %ecx
addl %edx, %ecx
movl %ecx, %edx
ret
llvm should be able to commute the addl so that the movl isn't necessary.
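For reference, the word-wise decomposition that sequence implements
(illustrative helper; assumes a is split into 32-bit halves, matching the
a and a+4 loads):

#include <stdint.h>

/* Illustrative decomposition of the 64 x 16 multiply above: a full
   32x32->64 multiply of the low half (mull), plus the high half times b
   folded into the upper word (imull + addl). */
uint64_t mul_ref(uint32_t a_lo, uint32_t a_hi, uint16_t b) {
  uint64_t lo_prod = (uint64_t)a_lo * b;                    /* mull a */
  uint32_t hi_word = (uint32_t)(lo_prod >> 32) + a_hi * b;  /* imull a+4, addl */
  return ((uint64_t)hi_word << 32) | (uint32_t)lo_prod;
}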
//===---------------------------------------------------------------------===//
Consider:
int test(unsigned long a, unsigned long b) { return -(a < b); }