llvm-mirror/test/CodeGen/X86/store-narrow.ll

; rdar://7860110
; Store-narrowing test. Every function below loads an integer, replaces one
; byte- or halfword-sized field of it with a zero-extended argument (and to
; clear the field, optional shl to position the new value, or to merge), and
; stores the result back to the same address. The FileCheck lines expect the
; field to be written with a single narrow store (movb / movw).
; The file is compiled twice: once for the module's own x86_64 triple (checks
; using the X64 prefix) and once with -march=x86 (checks using the X32 prefix).
; RUN: llc < %s | FileCheck %s -check-prefix=X64
; RUN: llc -march=x86 < %s | FileCheck %s -check-prefix=X32
; The leading 'e' in the datalayout selects little-endian byte order, so bits
; 8*N .. 8*N+7 of a stored integer live at byte offset N; the offsets in the
; expected stores below follow from that.
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-apple-darwin10.2"

; test1: overwrite bits 0-7 of the i32 at %a0 with the i8 argument %a1.
; The and-mask clears exactly the bits that the zero-extended byte fills in,
; so the checks expect a lone byte store to offset 0 of the pointer. In the
; 32-bit run the byte is first loaded from 8(%esp) into %al.
; Here the new byte is the first operand of the 'or'; the other tests in this
; file pass the masked value first.
define void @test1(i32* nocapture %a0, i8 zeroext %a1) nounwind ssp {
entry:
  %A = load i32* %a0, align 4
  %B = and i32 %A, -256     ; 0xFFFFFF00
  %C = zext i8 %a1 to i32
  %D = or i32 %C, %B
  store i32 %D, i32* %a0, align 4
  ret void
  
; X64: test1:
; X64: movb	%sil, (%rdi)

; X32: test1:
; X32: movb	8(%esp), %al
; X32: movb	%al, (%{{.*}})
}

; test2: overwrite bits 8-15 of the i32 at %a0 with the i8 argument %a1.
; The mask clears only that byte and the zero-extended argument is shifted
; left by 8 before being or'ed in, so the checks expect a single byte store
; to offset 1 of the pointer.
define void @test2(i32* nocapture %a0, i8 zeroext %a1) nounwind ssp {
entry:
  %A = load i32* %a0, align 4
  %B = and i32 %A, -65281    ; 0xFFFF00FF
  %C = zext i8 %a1 to i32
  %CS = shl i32 %C, 8
  %D = or i32 %B, %CS
  store i32 %D, i32* %a0, align 4
  ret void
; X64: test2:
; X64: movb	%sil, 1(%rdi)

; X32: test2:
; X32: movb	8(%esp), %al
; X32: movb	%al, 1(%{{.*}})
}

; test3: overwrite bits 0-15 of the i32 at %a0 with the i16 argument %a1.
; The mask keeps the upper half of the loaded word and the zero-extended
; argument supplies the lower half, so the checks expect a single 16-bit
; store (movw) to offset 0 of the pointer.
define void @test3(i32* nocapture %a0, i16 zeroext %a1) nounwind ssp {
entry:
  %A = load i32* %a0, align 4
  %B = and i32 %A, -65536    ; 0xFFFF0000
  %C = zext i16 %a1 to i32
  %D = or i32 %B, %C
  store i32 %D, i32* %a0, align 4
  ret void
; X64: test3:
; X64: movw	%si, (%rdi)

; X32: test3:
; X32: movw	8(%esp), %ax
; X32: movw	%ax, (%{{.*}})
}

; test4: overwrite bits 16-31 of the i32 at %a0 with the i16 argument %a1.
; The mask keeps the lower half of the loaded word; the zero-extended argument
; is shifted left by 16 and or'ed in, so the checks expect a single 16-bit
; store to offset 2 of the pointer. In the 32-bit run the argument is expected
; to be fetched with a zero-extending 32-bit load (movzwl) before %ax is stored.
define void @test4(i32* nocapture %a0, i16 zeroext %a1) nounwind ssp {
entry:
  %A = load i32* %a0, align 4
  %B = and i32 %A, 65535    ; 0x0000FFFF
  %C = zext i16 %a1 to i32
  %CS = shl i32 %C, 16
  %D = or i32 %B, %CS
  store i32 %D, i32* %a0, align 4
  ret void
; X64: test4:
; X64: movw	%si, 2(%rdi)

; X32: test4:
; X32: movzwl	8(%esp), %eax
; X32: movw	%ax, 2(%{{.*}})
}

; test5: overwrite bits 16-31 of the i64 at %a0 with the i16 argument %a1.
; Same field position as test4, but the containing integer is 64 bits wide
; (loaded and stored with align 4). The checks still expect a single 16-bit
; store to offset 2 of the pointer in both runs.
define void @test5(i64* nocapture %a0, i16 zeroext %a1) nounwind ssp {
entry:
  %A = load i64* %a0, align 4
  %B = and i64 %A, -4294901761    ; 0xFFFFFFFF0000FFFF
  %C = zext i16 %a1 to i64
  %CS = shl i64 %C, 16
  %D = or i64 %B, %CS
  store i64 %D, i64* %a0, align 4
  ret void
; X64: test5:
; X64: movw	%si, 2(%rdi)

; X32: test5:
; X32: movzwl	8(%esp), %eax
; X32: movw	%ax, 2(%{{.*}})
}

; test6: overwrite bits 40-47 of the i64 at %a0 with the i8 argument %a1.
; The mask clears only that byte and the zero-extended argument is shifted
; left by 40 before being or'ed in, so the checks expect a single byte store
; to offset 5 of the pointer in both runs.
define void @test6(i64* nocapture %a0, i8 zeroext %a1) nounwind ssp {
entry:
  %A = load i64* %a0, align 4
  %B = and i64 %A, -280375465082881    ; 0xFFFF00FFFFFFFFFF
  %C = zext i8 %a1 to i64
  %CS = shl i64 %C, 40
  %D = or i64 %B, %CS
  store i64 %D, i64* %a0, align 4
  ret void
; X64: test6:
; X64: movb	%sil, 5(%rdi)


; X32: test6:
; X32: movb	8(%esp), %al
; X32: movb	%al, 5(%{{.*}})
}

; test7: same byte replacement as test6 (bits 40-47 of the i64 at %a0), but
; with an additional load from a third pointer %P2 issued before the
; load/modify/store sequence; the loaded value is the function's return value.
; The checks expect the single byte store to offset 5 to be produced even with
; that extra memory operation present. In the 32-bit run the byte is expected
; in %cl rather than %al.
define i32 @test7(i64* nocapture %a0, i8 zeroext %a1, i32* %P2) nounwind {
entry:
  %OtherLoad = load i32 *%P2
  %A = load i64* %a0, align 4
  %B = and i64 %A, -280375465082881    ; 0xFFFF00FFFFFFFFFF
  %C = zext i8 %a1 to i64
  %CS = shl i64 %C, 40
  %D = or i64 %B, %CS
  store i64 %D, i64* %a0, align 4
  ret i32 %OtherLoad
; X64: test7:
; X64: movb	%sil, 5(%rdi)


; X32: test7:
; X32: movb	8(%esp), %cl
; X32: movb	%cl, 5(%{{.*}})
}
Implement rdar://7860110 (also in target/readme.txt) narrowing a load/or/and/store sequence into a narrower store when it is safe. Daniel tells me that clang will start producing this sort of thing with bitfields, and this does trigger a few dozen times on 176.gcc produced by llvm-gcc even now. This compiles code like CodeGen/X86/2009-05-28-DAGCombineCrash.ll into: movl %eax, 36(%rdi) instead of: movl $4294967295, %eax ## imm = 0xFFFFFFFF andq 32(%rdi), %rax shlq $32, %rcx addq %rax, %rcx movq %rcx, 32(%rdi) and each of the testcases into a single store. Each of them used to compile into craziness like this: _test4: movl $65535, %eax ## imm = 0xFFFF andl (%rdi), %eax shll $16, %esi addl %eax, %esi movl %esi, (%rdi) ret llvm-svn: 101343 2010-04-15 06:48:01 +02:00			`; rdar://7860110`
teach codegen to turn trunc(zextload) into load when possible. This doesn't occur much at all, it only seems to be formed in the case when the trunc optimization kicks in due to phase ordering. In that case it saves a few bytes on x86-32. llvm-svn: 101350 2010-04-15 07:40:59 +02:00			`; RUN: llc < %s \| FileCheck %s -check-prefix=X64`
			`; RUN: llc -march=x86 < %s \| FileCheck %s -check-prefix=X32`
Implement rdar://7860110 (also in target/readme.txt) narrowing a load/or/and/store sequence into a narrower store when it is safe. Daniel tells me that clang will start producing this sort of thing with bitfields, and this does trigger a few dozen times on 176.gcc produced by llvm-gcc even now. This compiles code like CodeGen/X86/2009-05-28-DAGCombineCrash.ll into: movl %eax, 36(%rdi) instead of: movl $4294967295, %eax ## imm = 0xFFFFFFFF andq 32(%rdi), %rax shlq $32, %rcx addq %rax, %rcx movq %rcx, 32(%rdi) and each of the testcases into a single store. Each of them used to compile into craziness like this: _test4: movl $65535, %eax ## imm = 0xFFFF andl (%rdi), %eax shll $16, %esi addl %eax, %esi movl %esi, (%rdi) ret llvm-svn: 101343 2010-04-15 06:48:01 +02:00			`target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"`
			`target triple = "x86_64-apple-darwin10.2"`

			`define void @test1(i32* nocapture %a0, i8 zeroext %a1) nounwind ssp {`
			`entry:`
			`%A = load i32* %a0, align 4`
			`%B = and i32 %A, -256 ; 0xFFFFFF00`
			`%C = zext i8 %a1 to i32`
			`%D = or i32 %C, %B`
			`store i32 %D, i32* %a0, align 4`
			`ret void`

teach codegen to turn trunc(zextload) into load when possible. This doesn't occur much at all, it only seems to be formed in the case when the trunc optimization kicks in due to phase ordering. In that case it saves a few bytes on x86-32. llvm-svn: 101350 2010-04-15 07:40:59 +02:00			`; X64: test1:`
			`; X64: movb %sil, (%rdi)`

			`; X32: test1:`
			`; X32: movb 8(%esp), %al`
			`; X32: movb %al, (%{{.*}})`
Implement rdar://7860110 (also in target/readme.txt) narrowing a load/or/and/store sequence into a narrower store when it is safe. Daniel tells me that clang will start producing this sort of thing with bitfields, and this does trigger a few dozen times on 176.gcc produced by llvm-gcc even now. This compiles code like CodeGen/X86/2009-05-28-DAGCombineCrash.ll into: movl %eax, 36(%rdi) instead of: movl $4294967295, %eax ## imm = 0xFFFFFFFF andq 32(%rdi), %rax shlq $32, %rcx addq %rax, %rcx movq %rcx, 32(%rdi) and each of the testcases into a single store. Each of them used to compile into craziness like this: _test4: movl $65535, %eax ## imm = 0xFFFF andl (%rdi), %eax shll $16, %esi addl %eax, %esi movl %esi, (%rdi) ret llvm-svn: 101343 2010-04-15 06:48:01 +02:00			`}`

			`define void @test2(i32* nocapture %a0, i8 zeroext %a1) nounwind ssp {`
			`entry:`
			`%A = load i32* %a0, align 4`
			`%B = and i32 %A, -65281 ; 0xFFFF00FF`
			`%C = zext i8 %a1 to i32`
			`%CS = shl i32 %C, 8`
			`%D = or i32 %B, %CS`
			`store i32 %D, i32* %a0, align 4`
			`ret void`
teach codegen to turn trunc(zextload) into load when possible. This doesn't occur much at all, it only seems to be formed in the case when the trunc optimization kicks in due to phase ordering. In that case it saves a few bytes on x86-32. llvm-svn: 101350 2010-04-15 07:40:59 +02:00			`; X64: test2:`
			`; X64: movb %sil, 1(%rdi)`

			`; X32: test2:`
			`; X32: movb 8(%esp), %al`
			`; X32: movb %al, 1(%{{.*}})`
Implement rdar://7860110 (also in target/readme.txt) narrowing a load/or/and/store sequence into a narrower store when it is safe. Daniel tells me that clang will start producing this sort of thing with bitfields, and this does trigger a few dozen times on 176.gcc produced by llvm-gcc even now. This compiles code like CodeGen/X86/2009-05-28-DAGCombineCrash.ll into: movl %eax, 36(%rdi) instead of: movl $4294967295, %eax ## imm = 0xFFFFFFFF andq 32(%rdi), %rax shlq $32, %rcx addq %rax, %rcx movq %rcx, 32(%rdi) and each of the testcases into a single store. Each of them used to compile into craziness like this: _test4: movl $65535, %eax ## imm = 0xFFFF andl (%rdi), %eax shll $16, %esi addl %eax, %esi movl %esi, (%rdi) ret llvm-svn: 101343 2010-04-15 06:48:01 +02:00			`}`

			`define void @test3(i32* nocapture %a0, i16 zeroext %a1) nounwind ssp {`
			`entry:`
			`%A = load i32* %a0, align 4`
			`%B = and i32 %A, -65536 ; 0xFFFF0000`
			`%C = zext i16 %a1 to i32`
			`%D = or i32 %B, %C`
			`store i32 %D, i32* %a0, align 4`
			`ret void`
teach codegen to turn trunc(zextload) into load when possible. This doesn't occur much at all, it only seems to be formed in the case when the trunc optimization kicks in due to phase ordering. In that case it saves a few bytes on x86-32. llvm-svn: 101350 2010-04-15 07:40:59 +02:00			`; X64: test3:`
			`; X64: movw %si, (%rdi)`

			`; X32: test3:`
			`; X32: movw 8(%esp), %ax`
			`; X32: movw %ax, (%{{.*}})`
Implement rdar://7860110 (also in target/readme.txt) narrowing a load/or/and/store sequence into a narrower store when it is safe. Daniel tells me that clang will start producing this sort of thing with bitfields, and this does trigger a few dozen times on 176.gcc produced by llvm-gcc even now. This compiles code like CodeGen/X86/2009-05-28-DAGCombineCrash.ll into: movl %eax, 36(%rdi) instead of: movl $4294967295, %eax ## imm = 0xFFFFFFFF andq 32(%rdi), %rax shlq $32, %rcx addq %rax, %rcx movq %rcx, 32(%rdi) and each of the testcases into a single store. Each of them used to compile into craziness like this: _test4: movl $65535, %eax ## imm = 0xFFFF andl (%rdi), %eax shll $16, %esi addl %eax, %esi movl %esi, (%rdi) ret llvm-svn: 101343 2010-04-15 06:48:01 +02:00			`}`

			`define void @test4(i32* nocapture %a0, i16 zeroext %a1) nounwind ssp {`
			`entry:`
			`%A = load i32* %a0, align 4`
			`%B = and i32 %A, 65535 ; 0x0000FFFF`
			`%C = zext i16 %a1 to i32`
			`%CS = shl i32 %C, 16`
			`%D = or i32 %B, %CS`
			`store i32 %D, i32* %a0, align 4`
			`ret void`
teach codegen to turn trunc(zextload) into load when possible. This doesn't occur much at all, it only seems to be formed in the case when the trunc optimization kicks in due to phase ordering. In that case it saves a few bytes on x86-32. llvm-svn: 101350 2010-04-15 07:40:59 +02:00			`; X64: test4:`
			`; X64: movw %si, 2(%rdi)`

			`; X32: test4:`
Enable i16 to i32 promotion by default. llvm-svn: 102493 2010-04-28 10:30:49 +02:00			`; X32: movzwl 8(%esp), %eax`
teach codegen to turn trunc(zextload) into load when possible. This doesn't occur much at all, it only seems to be formed in the case when the trunc optimization kicks in due to phase ordering. In that case it saves a few bytes on x86-32. llvm-svn: 101350 2010-04-15 07:40:59 +02:00			`; X32: movw %ax, 2(%{{.*}})`
Implement rdar://7860110 (also in target/readme.txt) narrowing a load/or/and/store sequence into a narrower store when it is safe. Daniel tells me that clang will start producing this sort of thing with bitfields, and this does trigger a few dozen times on 176.gcc produced by llvm-gcc even now. This compiles code like CodeGen/X86/2009-05-28-DAGCombineCrash.ll into: movl %eax, 36(%rdi) instead of: movl $4294967295, %eax ## imm = 0xFFFFFFFF andq 32(%rdi), %rax shlq $32, %rcx addq %rax, %rcx movq %rcx, 32(%rdi) and each of the testcases into a single store. Each of them used to compile into craziness like this: _test4: movl $65535, %eax ## imm = 0xFFFF andl (%rdi), %eax shll $16, %esi addl %eax, %esi movl %esi, (%rdi) ret llvm-svn: 101343 2010-04-15 06:48:01 +02:00			`}`

			`define void @test5(i64* nocapture %a0, i16 zeroext %a1) nounwind ssp {`
			`entry:`
			`%A = load i64* %a0, align 4`
			`%B = and i64 %A, -4294901761 ; 0xFFFFFFFF0000FFFF`
			`%C = zext i16 %a1 to i64`
			`%CS = shl i64 %C, 16`
			`%D = or i64 %B, %CS`
			`store i64 %D, i64* %a0, align 4`
			`ret void`
teach codegen to turn trunc(zextload) into load when possible. This doesn't occur much at all, it only seems to be formed in the case when the trunc optimization kicks in due to phase ordering. In that case it saves a few bytes on x86-32. llvm-svn: 101350 2010-04-15 07:40:59 +02:00			`; X64: test5:`
			`; X64: movw %si, 2(%rdi)`

			`; X32: test5:`
Enable i16 to i32 promotion by default. llvm-svn: 102493 2010-04-28 10:30:49 +02:00			`; X32: movzwl 8(%esp), %eax`
teach codegen to turn trunc(zextload) into load when possible. This doesn't occur much at all, it only seems to be formed in the case when the trunc optimization kicks in due to phase ordering. In that case it saves a few bytes on x86-32. llvm-svn: 101350 2010-04-15 07:40:59 +02:00			`; X32: movw %ax, 2(%{{.*}})`
Implement rdar://7860110 (also in target/readme.txt) narrowing a load/or/and/store sequence into a narrower store when it is safe. Daniel tells me that clang will start producing this sort of thing with bitfields, and this does trigger a few dozen times on 176.gcc produced by llvm-gcc even now. This compiles code like CodeGen/X86/2009-05-28-DAGCombineCrash.ll into: movl %eax, 36(%rdi) instead of: movl $4294967295, %eax ## imm = 0xFFFFFFFF andq 32(%rdi), %rax shlq $32, %rcx addq %rax, %rcx movq %rcx, 32(%rdi) and each of the testcases into a single store. Each of them used to compile into craziness like this: _test4: movl $65535, %eax ## imm = 0xFFFF andl (%rdi), %eax shll $16, %esi addl %eax, %esi movl %esi, (%rdi) ret llvm-svn: 101343 2010-04-15 06:48:01 +02:00			`}`

			`define void @test6(i64* nocapture %a0, i8 zeroext %a1) nounwind ssp {`
			`entry:`
			`%A = load i64* %a0, align 4`
			`%B = and i64 %A, -280375465082881 ; 0xFFFF00FFFFFFFFFF`
			`%C = zext i8 %a1 to i64`
			`%CS = shl i64 %C, 40`
			`%D = or i64 %B, %CS`
			`store i64 %D, i64* %a0, align 4`
			`ret void`
teach codegen to turn trunc(zextload) into load when possible. This doesn't occur much at all, it only seems to be formed in the case when the trunc optimization kicks in due to phase ordering. In that case it saves a few bytes on x86-32. llvm-svn: 101350 2010-04-15 07:40:59 +02:00			`; X64: test6:`
			`; X64: movb %sil, 5(%rdi)`


			`; X32: test6:`
			`; X32: movb 8(%esp), %al`
			`; X32: movb %al, 5(%{{.*}})`
Implement rdar://7860110 (also in target/readme.txt) narrowing a load/or/and/store sequence into a narrower store when it is safe. Daniel tells me that clang will start producing this sort of thing with bitfields, and this does trigger a few dozen times on 176.gcc produced by llvm-gcc even now. This compiles code like CodeGen/X86/2009-05-28-DAGCombineCrash.ll into: movl %eax, 36(%rdi) instead of: movl $4294967295, %eax ## imm = 0xFFFFFFFF andq 32(%rdi), %rax shlq $32, %rcx addq %rax, %rcx movq %rcx, 32(%rdi) and each of the testcases into a single store. Each of them used to compile into craziness like this: _test4: movl $65535, %eax ## imm = 0xFFFF andl (%rdi), %eax shll $16, %esi addl %eax, %esi movl %esi, (%rdi) ret llvm-svn: 101343 2010-04-15 06:48:01 +02:00			`}`
enhance the load/store narrowing optimization to handle a tokenfactor in between the load/store. This allows us to optimize test7 into: _test7: ## @test7 ## BB#0: ## %entry movl (%rdx), %eax ## kill: SIL<def> ESI<kill> movb %sil, 5(%rdi) ret instead of: _test7: ## @test7 ## BB#0: ## %entry movl 4(%esp), %ecx movl $-65281, %eax ## imm = 0xFFFFFFFFFFFF00FF andl 4(%ecx), %eax movzbl 8(%esp), %edx shll $8, %edx addl %eax, %edx movl 12(%esp), %eax movl (%eax), %eax movl %edx, 4(%ecx) ret llvm-svn: 101355 2010-04-15 08:10:49 +02:00
			`define i32 @test7(i64* nocapture %a0, i8 zeroext %a1, i32* %P2) nounwind {`
			`entry:`
			`%OtherLoad = load i32 *%P2`
			`%A = load i64* %a0, align 4`
			`%B = and i64 %A, -280375465082881 ; 0xFFFF00FFFFFFFFFF`
			`%C = zext i8 %a1 to i64`
			`%CS = shl i64 %C, 40`
			`%D = or i64 %B, %CS`
			`store i64 %D, i64* %a0, align 4`
			`ret i32 %OtherLoad`
			`; X64: test7:`
			`; X64: movb %sil, 5(%rdi)`


			`; X32: test7:`
			`; X32: movb 8(%esp), %cl`
			`; X32: movb %cl, 5(%{{.*}})`
			`}`