A few minor updates, removing implemented stuff and adding a couple of
new things.

llvm-svn: 47458
Eli Friedman 2008-02-21 21:16:49 +00:00
parent b3c8d120dc
commit 123fc4b97d


@@ -54,6 +54,17 @@ One better solution for 1LL << x is:
But that requires good 8-bit subreg support.
Also, this might be better. It's an extra shift, but it's one instruction
shorter, and doesn't stress 8-bit subreg support.
(From http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01148.html,
but without the unnecessary and.)
movl %ecx, %eax
shrl $5, %eax
movl %eax, %edx
xorl $1, %edx
sall %cl, %eax
sall %cl, %edx
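For reference, a C sketch of what that sequence computes; the function name and
the hi/lo word layout are illustrative assumptions, not part of the original note.

#include <stdint.h>

/* Sketch of the branchless 1LL << x expansion above, assuming 32-bit words
   and 0 <= x < 64: bit 5 of x selects which word receives 1 << (x & 31). */
uint64_t shift_one_left(unsigned x) {
  uint32_t sel = x >> 5;               /* shrl $5: 1 iff x >= 32 */
  uint32_t hi = sel << (x & 31);       /* sall %cl on one copy */
  uint32_t lo = (sel ^ 1) << (x & 31); /* sall %cl on the other copy */
  return ((uint64_t)hi << 32) | lo;
}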
64-bit shifts (in general) expand to really bad code. Instead of using
cmovs, we should expand to a conditional branch like GCC produces.
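What the suggested branch-based expansion might look like in C (hypothetical
helper, assuming the 64-bit value is split into two 32-bit words):

#include <stdint.h>

/* Hypothetical sketch of a branch-based expansion of a variable 64-bit left
   shift on a 32-bit target, instead of a cmov-heavy sequence. */
uint64_t shl64(uint32_t lo, uint32_t hi, unsigned n) {
  if (n >= 32) {          /* the whole low word shifts out */
    hi = lo << (n - 32);
    lo = 0;
  } else if (n != 0) {    /* guard n == 0: lo >> 32 would be undefined */
    hi = (hi << n) | (lo >> (32 - n));
    lo <<= n;
  }
  return ((uint64_t)hi << 32) | lo;
}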
@@ -67,6 +78,9 @@ into:
xorl $1, %eax
ret
(Although note that this isn't a legal way to express the code that llvm-gcc
currently generates for that function.)
//===----------------------------------------------------------------------===//
Some isel ideas:
@@ -94,34 +108,6 @@ the coalescer how to deal with it though.
//===----------------------------------------------------------------------===//
Count leading zeros and count trailing zeros:
int clz(int X) { return __builtin_clz(X); }
int ctz(int X) { return __builtin_ctz(X); }
$ gcc t.c -S -o - -O3 -fomit-frame-pointer -masm=intel
clz:
bsr %eax, DWORD PTR [%esp+4]
xor %eax, 31
ret
ctz:
bsf %eax, DWORD PTR [%esp+4]
ret
however, check that these are defined for 0 and 32. Our intrinsics are, GCC's
aren't.
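To illustrate the definedness gap (the guarded wrappers below are hypothetical,
not part of this note):

/* __builtin_clz(0) and __builtin_ctz(0) are undefined in GCC, so lowering
   them straight to bsr/bsf is only safe when the operand is known non-zero.
   Hypothetical wrappers that pin down a result for 0: */
int clz_guarded(unsigned x) { return x ? __builtin_clz(x) : 32; }
int ctz_guarded(unsigned x) { return x ? __builtin_ctz(x) : 32; }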
Another example (use predsimplify to eliminate a select):
int foo (unsigned long j) {
if (j)
return __builtin_ffs (j) - 1;
else
return 0;
}
//===---------------------------------------------------------------------===//
It appears icc uses push for parameter passing. Need to investigate.
//===----------------------------------------------------------------------===//
@@ -236,32 +222,6 @@ which is probably slower, but it's interesting at least :)
//===----------------------------------------------------------------------===//
The first BB of this code:
declare bool %foo()
int %bar() {
%V = call bool %foo()
br bool %V, label %T, label %F
T:
ret int 1
F:
call bool %foo()
ret int 12
}
compiles to:
_bar:
subl $12, %esp
call L_foo$stub
xorb $1, %al
testb %al, %al
jne LBB_bar_2 # F
It would be better to emit "cmp %al, 1" than a xor and test.
//===---------------------------------------------------------------------===//
We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl
We should leave these as libcalls for everything over a much lower threshold,
since libc is hand tuned for medium and large mem ops (avoiding RFO for large
@@ -483,19 +443,24 @@ shorter than movl + leal.
//===----------------------------------------------------------------------===//
__builtin_ffs codegen is messy.
int ffs_(unsigned X) { return __builtin_ffs(X); }
llvm produces:
ffs_:
movl 4(%esp), %ecx
bsfl %ecx, %eax
movl $32, %edx
cmove %edx, %eax
incl %eax
xorl %edx, %edx
testl %ecx, %ecx
cmove %edx, %eax
ret
vs gcc:
_ffs_:
movl $-1, %edx
bsfl 4(%esp), %eax
@@ -503,6 +468,15 @@ _ffs_:
addl $1, %eax
ret
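For context, a reference version of what __builtin_ffs has to compute; the
zero-input case is why the llvm sequence above needs the extra test/cmove pair.
This helper is illustrative only.

/* Reference semantics of ffs: 1-based index of the least significant set
   bit, or 0 when the input is 0. */
int ffs_ref(unsigned x) {
  if (x == 0)
    return 0;
  int n = 1;
  while (!(x & 1)) {
    x >>= 1;
    n++;
  }
  return n;
}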
Another example of __builtin_ffs (use predsimplify to eliminate a select):
int foo (unsigned long j) {
if (j)
return __builtin_ffs (j) - 1;
else
return 0;
}
//===----------------------------------------------------------------------===//
It appears gcc places string data with linkonce linkage in
@@ -1062,6 +1036,8 @@ Should compile to:
setae %al
ret
FIXME: That code looks wrong; bool return is normally defined as zext.
on x86-64, not:
__Z11no_overflowjj:
@@ -1208,35 +1184,44 @@ void compare (long long foo) {
to:
compare:
subl $4, %esp
cmpl $0, 8(%esp)
setne %al
movzbw %al, %ax
cmpl $1, 12(%esp)
setg %cl
movzbw %cl, %cx
cmove %ax, %cx
testb $1, %cl
jne .LBB1_2 # UnifiedReturnBlock
.LBB1_1: # ifthen
call abort
.LBB1_2: # UnifiedReturnBlock
addl $4, %esp
ret
(also really horrible code on ppc). This is due to the expand code for 64-bit
compares. GCC produces multiple branches, which is much nicer:
compare:
subl $12, %esp
movl 20(%esp), %edx
movl 16(%esp), %eax
decl %edx
jle .L7
.L5:
addl $12, %esp
ret
.p2align 4,,7
.L7:
jl .L4
cmpl $0, %eax
.p2align 4,,8
ja .L5
.L4:
.p2align 4,,9
call abort
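A C sketch of the word-wise, branch-based comparison gcc is doing here
(hypothetical helper; assumes the 64-bit operand is already split into 32-bit
words):

#include <stdint.h>

/* Signed 64-bit '<' against a constant on a 32-bit target, expanded with
   branches: decide on the high words first, fall back to an unsigned
   comparison of the low words only when the high words are equal. */
int lt64(int32_t hi, uint32_t lo, int32_t chi, uint32_t clo) {
  if (hi != chi)
    return hi < chi;
  return lo < clo;
}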
//===----------------------------------------------------------------------===//
@@ -1380,7 +1365,7 @@ Should compile into:
_foo:
movzwl 4(%esp), %eax
orl $255, %eax
ret
instead of:
@@ -1550,6 +1535,48 @@ See PR2053 for more details.
//===----------------------------------------------------------------------===//
We should investigate using cdq/cltd (effect: edx = sar eax, 31)
more aggressively; it should cost the same as a move+shift on any modern
processor, but it's a lot shorter. Downside is that it puts more
pressure on register allocation because it has fixed operands.
Example:
int abs(int x) {return x < 0 ? -x : x;}
gcc compiles this to the following when using march/mtune=pentium2/3/4/m/etc.:
abs:
movl 4(%esp), %eax
cltd
xorl %edx, %eax
subl %edx, %eax
ret
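In C, the branchless trick that gcc output implements looks roughly like this
(assumes arithmetic right shift of a negative int; the helper name is made up):

/* Branchless abs: mask is all-ones when x is negative (what cltd leaves in
   %edx), and (x ^ mask) - mask negates x exactly when mask is -1. */
int abs_branchless(int x) {
  int mask = x >> 31;       /* cltd: sign bit replicated into every bit */
  return (x ^ mask) - mask; /* xorl %edx, %eax ; subl %edx, %eax */
}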
//===---------------------------------------------------------------------===//
Consider:
#include <inttypes.h>
uint64_t a;
uint16_t b;
uint64_t mul(void) {
return a * b;
}
Currently, we generate the following:
mul:
movzwl b, %ecx
movl %ecx, %eax
mull a
imull a+4, %ecx
addl %edx, %ecx
movl %ecx, %edx
ret
llvm should be able to commute the addl so that the movl isn't necessary.
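For reference, the word-wise decomposition that sequence implements
(illustrative helper; assumes a is split into 32-bit halves, matching the
a and a+4 loads):

#include <stdint.h>

/* Illustrative decomposition of the 64 x 16 multiply above: a full
   32x32->64 multiply of the low half (mull), plus the high half times b
   folded into the upper word (imull + addl). */
uint64_t mul_ref(uint32_t a_lo, uint32_t a_hi, uint16_t b) {
  uint64_t lo_prod = (uint64_t)a_lo * b;                    /* mull a */
  uint32_t hi_word = (uint32_t)(lo_prod >> 32) + a_hi * b;  /* imull a+4, addl */
  return ((uint64_t)hi_word << 32) | (uint32_t)lo_prod;
}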
//===---------------------------------------------------------------------===//
Consider:
int test(unsigned long a, unsigned long b) { return -(a < b); }