; llvm-mirror/test/CodeGen/X86/store-narrow.ll

; rdar://7860110
; RUN: llc -asm-verbose=false < %s | FileCheck %s -check-prefix=X64
; RUN: llc -march=x86 -asm-verbose=false < %s | FileCheck %s -check-prefix=X32
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-apple-darwin10.2"

; Replace the low byte (bits 0-7) of the i32 at %a0 with %a1.
; The load/and/or/store sequence is expected to collapse into a single byte
; store at offset 0 in both the 64-bit and the 32-bit run.
define void @test1(i32* nocapture %a0, i8 zeroext %a1) nounwind ssp {
entry:
  %word = load i32* %a0, align 4
  %kept = and i32 %word, -256          ; mask 0xFFFFFF00 clears bits 0-7
  %byte = zext i8 %a1 to i32
  %merged = or i32 %byte, %kept
  store i32 %merged, i32* %a0, align 4
  ret void

; X64: test1:
; X64: movb	%sil, (%rdi)

; X32: test1:
; X32: movb	8(%esp), %al
; X32: movb	%al, (%{{.*}})
}

; Replace byte 1 (bits 8-15) of the i32 at %a0 with %a1.
; Expected result in both runs: a single byte store at offset 1.
define void @test2(i32* nocapture %a0, i8 zeroext %a1) nounwind ssp {
entry:
  %word = load i32* %a0, align 4
  %kept = and i32 %word, -65281        ; mask 0xFFFF00FF clears bits 8-15
  %byte = zext i8 %a1 to i32
  %placed = shl i32 %byte, 8           ; move the new byte into bits 8-15
  %merged = or i32 %kept, %placed
  store i32 %merged, i32* %a0, align 4
  ret void
; X64: test2:
; X64: movb	%sil, 1(%rdi)

; X32: test2:
; X32: movb	8(%esp), %al
; X32: movb	%al, 1(%{{.*}})
}

; Replace the low half (bits 0-15) of the i32 at %a0 with %a1.
; Expected result in both runs: a single 16-bit store at offset 0.
define void @test3(i32* nocapture %a0, i16 zeroext %a1) nounwind ssp {
entry:
  %word = load i32* %a0, align 4
  %kept = and i32 %word, -65536        ; mask 0xFFFF0000 clears bits 0-15
  %half = zext i16 %a1 to i32
  %merged = or i32 %kept, %half
  store i32 %merged, i32* %a0, align 4
  ret void
; X64: test3:
; X64: movw	%si, (%rdi)

; X32: test3:
; X32: movw	8(%esp), %ax
; X32: movw	%ax, (%{{.*}})
}

; Replace the high half (bits 16-31) of the i32 at %a0 with %a1.
; Expected result in both runs: a single 16-bit store at offset 2.
define void @test4(i32* nocapture %a0, i16 zeroext %a1) nounwind ssp {
entry:
  %word = load i32* %a0, align 4
  %kept = and i32 %word, 65535         ; mask 0x0000FFFF clears bits 16-31
  %half = zext i16 %a1 to i32
  %placed = shl i32 %half, 16          ; move the new half into bits 16-31
  %merged = or i32 %kept, %placed
  store i32 %merged, i32* %a0, align 4
  ret void
; X64: test4:
; X64: movw	%si, 2(%rdi)

; X32: test4:
; X32: movl	8(%esp), %eax
; X32: movw	%ax, 2(%{{.*}})
}

; Replace bits 16-31 of the i64 at %a0 with %a1.
; Expected result in both runs: a single 16-bit store at offset 2.
define void @test5(i64* nocapture %a0, i16 zeroext %a1) nounwind ssp {
entry:
  %quad = load i64* %a0, align 4
  %kept = and i64 %quad, -4294901761   ; mask 0xFFFFFFFF0000FFFF clears bits 16-31
  %half = zext i16 %a1 to i64
  %placed = shl i64 %half, 16          ; move the new half into bits 16-31
  %merged = or i64 %kept, %placed
  store i64 %merged, i64* %a0, align 4
  ret void
; X64: test5:
; X64: movw	%si, 2(%rdi)

; X32: test5:
; X32: movzwl	8(%esp), %eax
; X32: movw	%ax, 2(%{{.*}})
}

; Replace byte 5 (bits 40-47) of the i64 at %a0 with %a1.
; Expected result in both runs: a single byte store at offset 5.
define void @test6(i64* nocapture %a0, i8 zeroext %a1) nounwind ssp {
entry:
  %quad = load i64* %a0, align 4
  %kept = and i64 %quad, -280375465082881    ; mask 0xFFFF00FFFFFFFFFF clears bits 40-47
  %byte = zext i8 %a1 to i64
  %placed = shl i64 %byte, 40                ; move the new byte into bits 40-47
  %merged = or i64 %kept, %placed
  store i64 %merged, i64* %a0, align 4
  ret void
; X64: test6:
; X64: movb	%sil, 5(%rdi)


; X32: test6:
; X32: movb	8(%esp), %al
; X32: movb	%al, 5(%{{.*}})
}

; Same byte-5 replacement as test6, but an independent load through %P2 is
; issued before the load/and/or/store sequence and its value is returned.
; The single byte store at offset 5 is still expected in both runs.
define i32 @test7(i64* nocapture %a0, i8 zeroext %a1, i32* %P2) nounwind {
entry:
  %unrelated = load i32* %P2
  %quad = load i64* %a0, align 4
  %kept = and i64 %quad, -280375465082881    ; mask 0xFFFF00FFFFFFFFFF clears bits 40-47
  %byte = zext i8 %a1 to i64
  %placed = shl i64 %byte, 40                ; move the new byte into bits 40-47
  %merged = or i64 %kept, %placed
  store i64 %merged, i64* %a0, align 4
  ret i32 %unrelated
; X64: test7:
; X64: movb	%sil, 5(%rdi)


; X32: test7:
; X32: movb	8(%esp), %cl
; X32: movb	%cl, 5(%{{.*}})
}

; PR7833

; 32-bit global with internal linkage, initialised to -1 (all bits set).
; test8 and test9 below both load from it and store back to it.
@g_16 = internal global i32 -1

; Negative case for store narrowing (PR7833): a store of 0 to @g_16 sits
; between the load and the final store of (loaded value | 1). The 64-bit
; checks below require the exact sequence full-width load, store of 0, or on
; the register, full-width store back, i.e. no narrowed store.
; X64: test8:
; X64-NEXT: movl _g_16(%rip), %eax
; X64-NEXT: movl $0, _g_16(%rip)
; X64-NEXT: orl  $1, %eax
; X64-NEXT: movl %eax, _g_16(%rip)
; X64-NEXT: ret
define void @test8() nounwind {
  ; Value of @g_16 before it is overwritten with 0.
  %tmp = load i32* @g_16
  ; Intervening store to the same location as the load above.
  store i32 0, i32* @g_16
  %or = or i32 %tmp, 1
  store i32 %or, i32* @g_16
  ret void
}

; Positive counterpart of test8: the same load / or-with-1 / store on @g_16,
; but with no store in between. The 64-bit checks below require it to become a
; single byte-wide or-to-memory instruction followed directly by the return.
; X64: test9:
; X64-NEXT: orb $1, _g_16(%rip)
; X64-NEXT: ret
define void @test9() nounwind {
  %tmp = load i32* @g_16
  ; Only bit 0 is set by the or.
  %or = or i32 %tmp, 1
  store i32 %or, i32* @g_16
  ret void
}

; rdar://8494845 + PR8244
; Loads an i8, sign-extends it to i32, shifts right logically by 8 and
; truncates back to i8. The returned byte is therefore taken from bits 8-15 of
; the sign-extended value, not from the loaded byte itself. The 64-bit checks
; below require a sign-extending byte load followed by the 8-bit right shift.
; X64: test10:
; X64-NEXT: movsbl	(%rdi), %eax
; X64-NEXT: shrl	$8, %eax
; X64-NEXT: ret
define i8 @test10(i8* %P) nounwind ssp {
entry:
  %tmp = load i8* %P, align 1
  ; Bits 8-31 of %conv are copies of the sign bit of %tmp.
  %conv = sext i8 %tmp to i32
  %shr3 = lshr i32 %conv, 8
  ; Keeps the low 8 bits of the shifted value (bits 8-15 of %conv).
  %conv2 = trunc i32 %shr3 to i8
  ret i8 %conv2
}
Implement rdar://7860110 (also in target/readme.txt) narrowing a load/or/and/store sequence into a narrower store when it is safe. Daniel tells me that clang will start producing this sort of thing with bitfields, and this does trigger a few dozen times on 176.gcc produced by llvm-gcc even now. This compiles code like CodeGen/X86/2009-05-28-DAGCombineCrash.ll into: movl %eax, 36(%rdi) instead of: movl $4294967295, %eax ## imm = 0xFFFFFFFF andq 32(%rdi), %rax shlq $32, %rcx addq %rax, %rcx movq %rcx, 32(%rdi) and each of the testcases into a single store. Each of them used to compile into craziness like this: _test4: movl $65535, %eax ## imm = 0xFFFF andl (%rdi), %eax shll $16, %esi addl %eax, %esi movl %esi, (%rdi) ret llvm-svn: 101343 2010-04-15 06:48:01 +02:00			`; rdar://7860110`
Don't narrow the load and store in a load+twiddle+store sequence unless there are clearly no stores between the load and the store. This fixes this miscompile reported as PR7833. This breaks the test/CodeGen/X86/narrow_op-2.ll optimization, which is safe, but awkward to prove safe. Move it to X86's README.txt. llvm-svn: 112861 2010-09-02 23:18:42 +02:00			`; RUN: llc -asm-verbose=false < %s \| FileCheck %s -check-prefix=X64`
			`; RUN: llc -march=x86 -asm-verbose=false < %s \| FileCheck %s -check-prefix=X32`
Implement rdar://7860110 (also in target/readme.txt) narrowing a load/or/and/store sequence into a narrower store when it is safe. Daniel tells me that clang will start producing this sort of thing with bitfields, and this does trigger a few dozen times on 176.gcc produced by llvm-gcc even now. This compiles code like CodeGen/X86/2009-05-28-DAGCombineCrash.ll into: movl %eax, 36(%rdi) instead of: movl $4294967295, %eax ## imm = 0xFFFFFFFF andq 32(%rdi), %rax shlq $32, %rcx addq %rax, %rcx movq %rcx, 32(%rdi) and each of the testcases into a single store. Each of them used to compile into craziness like this: _test4: movl $65535, %eax ## imm = 0xFFFF andl (%rdi), %eax shll $16, %esi addl %eax, %esi movl %esi, (%rdi) ret llvm-svn: 101343 2010-04-15 06:48:01 +02:00			`target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"`
			`target triple = "x86_64-apple-darwin10.2"`

			`define void @test1(i32* nocapture %a0, i8 zeroext %a1) nounwind ssp {`
			`entry:`
			`%A = load i32* %a0, align 4`
			`%B = and i32 %A, -256 ; 0xFFFFFF00`
			`%C = zext i8 %a1 to i32`
			`%D = or i32 %C, %B`
			`store i32 %D, i32* %a0, align 4`
			`ret void`

teach codegen to turn trunc(zextload) into load when possible. This doesn't occur much at all, it only seems to be formed in the case when the trunc optimization kicks in due to phase ordering. In that case it saves a few bytes on x86-32. llvm-svn: 101350 2010-04-15 07:40:59 +02:00			`; X64: test1:`
			`; X64: movb %sil, (%rdi)`

			`; X32: test1:`
			`; X32: movb 8(%esp), %al`
			`; X32: movb %al, (%{{.*}})`
Implement rdar://7860110 (also in target/readme.txt) narrowing a load/or/and/store sequence into a narrower store when it is safe. Daniel tells me that clang will start producing this sort of thing with bitfields, and this does trigger a few dozen times on 176.gcc produced by llvm-gcc even now. This compiles code like CodeGen/X86/2009-05-28-DAGCombineCrash.ll into: movl %eax, 36(%rdi) instead of: movl $4294967295, %eax ## imm = 0xFFFFFFFF andq 32(%rdi), %rax shlq $32, %rcx addq %rax, %rcx movq %rcx, 32(%rdi) and each of the testcases into a single store. Each of them used to compile into craziness like this: _test4: movl $65535, %eax ## imm = 0xFFFF andl (%rdi), %eax shll $16, %esi addl %eax, %esi movl %esi, (%rdi) ret llvm-svn: 101343 2010-04-15 06:48:01 +02:00			`}`

			`define void @test2(i32* nocapture %a0, i8 zeroext %a1) nounwind ssp {`
			`entry:`
			`%A = load i32* %a0, align 4`
			`%B = and i32 %A, -65281 ; 0xFFFF00FF`
			`%C = zext i8 %a1 to i32`
			`%CS = shl i32 %C, 8`
			`%D = or i32 %B, %CS`
			`store i32 %D, i32* %a0, align 4`
			`ret void`
teach codegen to turn trunc(zextload) into load when possible. This doesn't occur much at all, it only seems to be formed in the case when the trunc optimization kicks in due to phase ordering. In that case it saves a few bytes on x86-32. llvm-svn: 101350 2010-04-15 07:40:59 +02:00			`; X64: test2:`
			`; X64: movb %sil, 1(%rdi)`

			`; X32: test2:`
			`; X32: movb 8(%esp), %al`
			`; X32: movb %al, 1(%{{.*}})`
Implement rdar://7860110 (also in target/readme.txt) narrowing a load/or/and/store sequence into a narrower store when it is safe. Daniel tells me that clang will start producing this sort of thing with bitfields, and this does trigger a few dozen times on 176.gcc produced by llvm-gcc even now. This compiles code like CodeGen/X86/2009-05-28-DAGCombineCrash.ll into: movl %eax, 36(%rdi) instead of: movl $4294967295, %eax ## imm = 0xFFFFFFFF andq 32(%rdi), %rax shlq $32, %rcx addq %rax, %rcx movq %rcx, 32(%rdi) and each of the testcases into a single store. Each of them used to compile into craziness like this: _test4: movl $65535, %eax ## imm = 0xFFFF andl (%rdi), %eax shll $16, %esi addl %eax, %esi movl %esi, (%rdi) ret llvm-svn: 101343 2010-04-15 06:48:01 +02:00			`}`

			`define void @test3(i32* nocapture %a0, i16 zeroext %a1) nounwind ssp {`
			`entry:`
			`%A = load i32* %a0, align 4`
			`%B = and i32 %A, -65536 ; 0xFFFF0000`
			`%C = zext i16 %a1 to i32`
			`%D = or i32 %B, %C`
			`store i32 %D, i32* %a0, align 4`
			`ret void`
teach codegen to turn trunc(zextload) into load when possible. This doesn't occur much at all, it only seems to be formed in the case when the trunc optimization kicks in due to phase ordering. In that case it saves a few bytes on x86-32. llvm-svn: 101350 2010-04-15 07:40:59 +02:00			`; X64: test3:`
			`; X64: movw %si, (%rdi)`

			`; X32: test3:`
			`; X32: movw 8(%esp), %ax`
			`; X32: movw %ax, (%{{.*}})`
Implement rdar://7860110 (also in target/readme.txt) narrowing a load/or/and/store sequence into a narrower store when it is safe. Daniel tells me that clang will start producing this sort of thing with bitfields, and this does trigger a few dozen times on 176.gcc produced by llvm-gcc even now. This compiles code like CodeGen/X86/2009-05-28-DAGCombineCrash.ll into: movl %eax, 36(%rdi) instead of: movl $4294967295, %eax ## imm = 0xFFFFFFFF andq 32(%rdi), %rax shlq $32, %rcx addq %rax, %rcx movq %rcx, 32(%rdi) and each of the testcases into a single store. Each of them used to compile into craziness like this: _test4: movl $65535, %eax ## imm = 0xFFFF andl (%rdi), %eax shll $16, %esi addl %eax, %esi movl %esi, (%rdi) ret llvm-svn: 101343 2010-04-15 06:48:01 +02:00			`}`

			`define void @test4(i32* nocapture %a0, i16 zeroext %a1) nounwind ssp {`
			`entry:`
			`%A = load i32* %a0, align 4`
			`%B = and i32 %A, 65535 ; 0x0000FFFF`
			`%C = zext i16 %a1 to i32`
			`%CS = shl i32 %C, 16`
			`%D = or i32 %B, %CS`
			`store i32 %D, i32* %a0, align 4`
			`ret void`
teach codegen to turn trunc(zextload) into load when possible. This doesn't occur much at all, it only seems to be formed in the case when the trunc optimization kicks in due to phase ordering. In that case it saves a few bytes on x86-32. llvm-svn: 101350 2010-04-15 07:40:59 +02:00			`; X64: test4:`
			`; X64: movw %si, 2(%rdi)`

			`; X32: test4:`
Reapply r106634, now that the bug it exposed is fixed. llvm-svn: 106746 2010-06-24 16:30:44 +02:00			`; X32: movl 8(%esp), %eax`
teach codegen to turn trunc(zextload) into load when possible. This doesn't occur much at all, it only seems to be formed in the case when the trunc optimization kicks in due to phase ordering. In that case it saves a few bytes on x86-32. llvm-svn: 101350 2010-04-15 07:40:59 +02:00			`; X32: movw %ax, 2(%{{.*}})`
Implement rdar://7860110 (also in target/readme.txt) narrowing a load/or/and/store sequence into a narrower store when it is safe. Daniel tells me that clang will start producing this sort of thing with bitfields, and this does trigger a few dozen times on 176.gcc produced by llvm-gcc even now. This compiles code like CodeGen/X86/2009-05-28-DAGCombineCrash.ll into: movl %eax, 36(%rdi) instead of: movl $4294967295, %eax ## imm = 0xFFFFFFFF andq 32(%rdi), %rax shlq $32, %rcx addq %rax, %rcx movq %rcx, 32(%rdi) and each of the testcases into a single store. Each of them used to compile into craziness like this: _test4: movl $65535, %eax ## imm = 0xFFFF andl (%rdi), %eax shll $16, %esi addl %eax, %esi movl %esi, (%rdi) ret llvm-svn: 101343 2010-04-15 06:48:01 +02:00			`}`

			`define void @test5(i64* nocapture %a0, i16 zeroext %a1) nounwind ssp {`
			`entry:`
			`%A = load i64* %a0, align 4`
			`%B = and i64 %A, -4294901761 ; 0xFFFFFFFF0000FFFF`
			`%C = zext i16 %a1 to i64`
			`%CS = shl i64 %C, 16`
			`%D = or i64 %B, %CS`
			`store i64 %D, i64* %a0, align 4`
			`ret void`
teach codegen to turn trunc(zextload) into load when possible. This doesn't occur much at all, it only seems to be formed in the case when the trunc optimization kicks in due to phase ordering. In that case it saves a few bytes on x86-32. llvm-svn: 101350 2010-04-15 07:40:59 +02:00			`; X64: test5:`
			`; X64: movw %si, 2(%rdi)`

			`; X32: test5:`
Enable i16 to i32 promotion by default. llvm-svn: 102493 2010-04-28 10:30:49 +02:00			`; X32: movzwl 8(%esp), %eax`
teach codegen to turn trunc(zextload) into load when possible. This doesn't occur much at all, it only seems to be formed in the case when the trunc optimization kicks in due to phase ordering. In that case it saves a few bytes on x86-32. llvm-svn: 101350 2010-04-15 07:40:59 +02:00			`; X32: movw %ax, 2(%{{.*}})`
Implement rdar://7860110 (also in target/readme.txt) narrowing a load/or/and/store sequence into a narrower store when it is safe. Daniel tells me that clang will start producing this sort of thing with bitfields, and this does trigger a few dozen times on 176.gcc produced by llvm-gcc even now. This compiles code like CodeGen/X86/2009-05-28-DAGCombineCrash.ll into: movl %eax, 36(%rdi) instead of: movl $4294967295, %eax ## imm = 0xFFFFFFFF andq 32(%rdi), %rax shlq $32, %rcx addq %rax, %rcx movq %rcx, 32(%rdi) and each of the testcases into a single store. Each of them used to compile into craziness like this: _test4: movl $65535, %eax ## imm = 0xFFFF andl (%rdi), %eax shll $16, %esi addl %eax, %esi movl %esi, (%rdi) ret llvm-svn: 101343 2010-04-15 06:48:01 +02:00			`}`

			`define void @test6(i64* nocapture %a0, i8 zeroext %a1) nounwind ssp {`
			`entry:`
			`%A = load i64* %a0, align 4`
			`%B = and i64 %A, -280375465082881 ; 0xFFFF00FFFFFFFFFF`
			`%C = zext i8 %a1 to i64`
			`%CS = shl i64 %C, 40`
			`%D = or i64 %B, %CS`
			`store i64 %D, i64* %a0, align 4`
			`ret void`
teach codegen to turn trunc(zextload) into load when possible. This doesn't occur much at all, it only seems to be formed in the case when the trunc optimization kicks in due to phase ordering. In that case it saves a few bytes on x86-32. llvm-svn: 101350 2010-04-15 07:40:59 +02:00			`; X64: test6:`
			`; X64: movb %sil, 5(%rdi)`


			`; X32: test6:`
			`; X32: movb 8(%esp), %al`
			`; X32: movb %al, 5(%{{.*}})`
Implement rdar://7860110 (also in target/readme.txt) narrowing a load/or/and/store sequence into a narrower store when it is safe. Daniel tells me that clang will start producing this sort of thing with bitfields, and this does trigger a few dozen times on 176.gcc produced by llvm-gcc even now. This compiles code like CodeGen/X86/2009-05-28-DAGCombineCrash.ll into: movl %eax, 36(%rdi) instead of: movl $4294967295, %eax ## imm = 0xFFFFFFFF andq 32(%rdi), %rax shlq $32, %rcx addq %rax, %rcx movq %rcx, 32(%rdi) and each of the testcases into a single store. Each of them used to compile into craziness like this: _test4: movl $65535, %eax ## imm = 0xFFFF andl (%rdi), %eax shll $16, %esi addl %eax, %esi movl %esi, (%rdi) ret llvm-svn: 101343 2010-04-15 06:48:01 +02:00			`}`
enhance the load/store narrowing optimization to handle a tokenfactor in between the load/store. This allows us to optimize test7 into: _test7: ## @test7 ## BB#0: ## %entry movl (%rdx), %eax ## kill: SIL<def> ESI<kill> movb %sil, 5(%rdi) ret instead of: _test7: ## @test7 ## BB#0: ## %entry movl 4(%esp), %ecx movl $-65281, %eax ## imm = 0xFFFFFFFFFFFF00FF andl 4(%ecx), %eax movzbl 8(%esp), %edx shll $8, %edx addl %eax, %edx movl 12(%esp), %eax movl (%eax), %eax movl %edx, 4(%ecx) ret llvm-svn: 101355 2010-04-15 08:10:49 +02:00
			`define i32 @test7(i64* nocapture %a0, i8 zeroext %a1, i32* %P2) nounwind {`
			`entry:`
			`%OtherLoad = load i32 *%P2`
			`%A = load i64* %a0, align 4`
			`%B = and i64 %A, -280375465082881 ; 0xFFFF00FFFFFFFFFF`
			`%C = zext i8 %a1 to i64`
			`%CS = shl i64 %C, 40`
			`%D = or i64 %B, %CS`
			`store i64 %D, i64* %a0, align 4`
			`ret i32 %OtherLoad`
			`; X64: test7:`
			`; X64: movb %sil, 5(%rdi)`


			`; X32: test7:`
			`; X32: movb 8(%esp), %cl`
			`; X32: movb %cl, 5(%{{.*}})`
			`}`

Don't narrow the load and store in a load+twiddle+store sequence unless there are clearly no stores between the load and the store. This fixes this miscompile reported as PR7833. This breaks the test/CodeGen/X86/narrow_op-2.ll optimization, which is safe, but awkward to prove safe. Move it to X86's README.txt. llvm-svn: 112861 2010-09-02 23:18:42 +02:00			`; PR7833`

			`@g_16 = internal global i32 -1`

			`; X64: test8:`
			`; X64-NEXT: movl _g_16(%rip), %eax`
			`; X64-NEXT: movl $0, _g_16(%rip)`
			`; X64-NEXT: orl $1, %eax`
			`; X64-NEXT: movl %eax, _g_16(%rip)`
			`; X64-NEXT: ret`
			`define void @test8() nounwind {`
			`%tmp = load i32* @g_16`
			`store i32 0, i32* @g_16`
			`%or = or i32 %tmp, 1`
			`store i32 %or, i32* @g_16`
			`ret void`
			`}`

			`; X64: test9:`
			`; X64-NEXT: orb $1, _g_16(%rip)`
			`; X64-NEXT: ret`
			`define void @test9() nounwind {`
			`%tmp = load i32* @g_16`
			`%or = or i32 %tmp, 1`
			`store i32 %or, i32* @g_16`
			`ret void`
			`}`
fix rdar://8494845 + PR8244 - a miscompile exposed by my patch in r101350 llvm-svn: 115294 2010-10-01 07:36:09 +02:00
			`; rdar://8494845 + PR8244`
			`; X64: test10:`
			`; X64-NEXT: movsbl (%rdi), %eax`
			`; X64-NEXT: shrl $8, %eax`
			`; X64-NEXT: ret`
			`define i8 @test10(i8* %P) nounwind ssp {`
			`entry:`
			`%tmp = load i8* %P, align 1`
			`%conv = sext i8 %tmp to i32`
			`%shr3 = lshr i32 %conv, 8`
			`%conv2 = trunc i32 %shr3 to i8`
			`ret i8 %conv2`
			`}`