[Bug tree-optimization/45972] [4.6 Regression] tree check fail in use_pred_not_overlap_with_undef_path_pred
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=45972 davidxl changed: What|Removed |Added Status|NEW |RESOLVED Resolution||FIXED --- Comment #3 from davidxl 2010-10-12 22:34:20 UTC --- Fixed in r165402.
[Bug target/46200] [4.6 Regression] optimization regression in simple pointer loop
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=46200 davidxl changed: What|Removed |Added Status|NEW |ASSIGNED CC||davidxl at gcc dot gnu.org --- Comment #5 from davidxl 2010-10-28 19:01:16 UTC --- Confirmed. The problem seems to be in the cost computation for loop exit tests -- the cost associated with the iv update seems to be double-counted (already considered as iv cost, but included again in the testing cost). If this is the root cause, it has been there since day one, but was exposed by the ivopt enhancement patch. David
[Bug rtl-optimization/46235] New: inefficient bittest code generation
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=46235 Summary: inefficient bittest code generation Product: gcc Version: 4.6.0 Status: UNCONFIRMED Keywords: missed-optimization Severity: normal Priority: P3 Component: rtl-optimization AssignedTo: unassig...@gcc.gnu.org ReportedBy: davi...@gcc.gnu.org CC: xinlian...@gmail.com Test case: int foo(int a, int x, int y) { if (a & (1 << x)) return a; return 1; } Trunk gcc generates: foo: .LFB0: .cfi_startproc movl %edi, %eax movl %edi, %edx movl %esi, %ecx sarl %cl, %edx andl $1, %edx movl $1, %edx cmove %edx, %eax ret Trunk llvm (with clang) generates: foo: .Leh_func_begin0: btl %esi, %edi movl $1, %eax cmovbl %edi, %eax ret
[Bug tree-optimization/46236] New: Local aggregate not eliminated
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=46236 Summary: Local aggregate not eliminated Product: gcc Version: 4.6.0 Status: UNCONFIRMED Keywords: missed-optimization Severity: normal Priority: P3 Component: tree-optimization AssignedTo: unassig...@gcc.gnu.org ReportedBy: davi...@gcc.gnu.org CC: xinlian...@gmail.com Simple Test case: struct A { int a[100]; }; const struct A aa = {1,1,1}; int foo(int i) { int s = 0; struct A a; a = aa; s = a.a[i]; if (i > 5) s+=a.a[i]; return s; } // Trunk gcc generates: (O2) foo: .LFB0: .cfi_startproc subq $280, %rsp .cfi_def_cfa_offset 288 movl %edi, %edx xorl %eax, %eax leaq -120(%rsp), %rdi cmpl $6, %edx movl $50, %ecx rep stosq movl $1, -120(%rsp) movl $1, -116(%rsp) movl $1, -112(%rsp) movslq %edx, %rax movl -120(%rsp,%rax,4), %eax leal (%rax,%rax), %ecx cmovge %ecx, %eax addq $280, %rsp .cfi_def_cfa_offset 8 ret // Trunk LLVM generates: (O2) foo: .Leh_func_begin0: movslq %edi, %rcx movl aa(,%rcx,4), %eax cmpl $6, %ecx jl .LBB0_2 addl %eax, %eax .LBB0_2: ret
[Bug rtl-optimization/46279] New: cmov not hoisted out of the loop
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=46279 Summary: cmov not hoisted out of the loop Product: gcc Version: 4.6.0 Status: UNCONFIRMED Keywords: missed-optimization Severity: normal Priority: P3 Component: rtl-optimization AssignedTo: unassig...@gcc.gnu.org ReportedBy: davi...@gcc.gnu.org Simple test case: extern int gen_int(int); extern void ref_int_p(int*); void kernel3 () { int i; int j; int k; int l; int m; int a[200]; j = gen_int (0); k = gen_int (0); for (i = 0; i < 200; i++) { if (j < k) a[i] = 1; else a[i] = j; } ref_int_p (&a[0]); return; } Code generated by trunk gcc at O2: kernel3: .LFB0: .cfi_startproc pushq %rbx .cfi_def_cfa_offset 16 .cfi_offset 3, -16 xorl %edi, %edi subq $800, %rsp .cfi_def_cfa_offset 816 call gen_int xorl %edi, %edi movl %eax, %ebx call gen_int movq %rsp, %rdx leaq 800(%rsp), %rdi movl $1, %esi .p2align 4,,10 .p2align 3 .L4: cmpl %eax, %ebx movl %esi, %ecx cmovge %ebx, %ecx movl %ecx, (%rdx) addq $4, %rdx cmpq %rdi, %rdx jne .L4 movq %rsp, %rdi call ref_int_p addq $800, %rsp .cfi_def_cfa_offset 16 popq %rbx .cfi_def_cfa_offset 8 ret The loop header is L4. LLVM generates: .Leh_func_begin0: pushq %rbx .Ltmp0: subq $800, %rsp .Ltmp1: xorl %edi, %edi callq gen_int movl %eax, %ebx xorl %edi, %edi callq gen_int cmpl %eax, %ebx movl $1, %eax cmovgel %ebx, %eax xorl %ecx, %ecx .align 16, 0x90 .LBB0_1: movl %eax, (%rsp,%rcx,4) incq %rcx cmpq $200, %rcx jne .LBB0_1 leaq (%rsp), %rdi callq ref_int_p addq $800, %rsp popq %rbx The loop (LBB0_1) is much tighter. David
[Bug tree-optimization/46281] New: Inefficient unswitching (too many copies)
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=46281 Summary: Inefficient unswitching (too many copies) Product: gcc Version: 4.6.0 Status: UNCONFIRMED Keywords: missed-optimization Severity: normal Priority: P3 Component: tree-optimization AssignedTo: unassig...@gcc.gnu.org ReportedBy: davi...@gcc.gnu.org Compiling the following program with -O3: extern int gen_int(int); extern void ref_int_p(int*); void kernel3 () { int i; int j; int k; int l; int m; int a[200]; j = gen_int (0); k = gen_int (0); l = gen_int (0); m = gen_int (0); for (i = 0; i < 200; i++) { if (j < k || j < l || j < m ) // || j << 3 || k << 4) a[i] = 1; else a[i] -= j; } ref_int_p (&a[0]); return; } Gcc unswitches the loop, but generates three copies of the loop -- it should only generate 2 copies. LLVM correctly generates two copies. David
[Bug rtl-optimization/46265] Missing ifcvt
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=46265 davidxl changed: What|Removed |Added CC||davidxl at gcc dot gnu.org --- Comment #2 from davidxl 2010-11-03 05:24:54 UTC --- The following is another example where gcc fails to ifcvt, while llvm performs ifcvt and hoists the conditional assignment out of the loop. 2 extern int gen_int(int); 3 extern void ref_int_p(int*); 4 5 void kernel3 () 6 { 7 int i; 8 int j; 9 int k; 10 int l; 11 int m; 12 int a[200]; 13 14 j = gen_int (0); 15 k = gen_int (0); 16 l = gen_int (0); 17 m = gen_int (0); 18 19 for (i = 0; i < 200; i++) 20 { 21 if (j < k || j < l || j < m ) 22 a[i] = 1; 23 else 24 a[i] = j; 25 } 26 27 ref_int_p (&a[0]); 28 29 return; 30 }
[Bug rtl-optimization/46265] Missing ifcvt
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=46265 --- Comment #3 from davidxl 2010-11-03 05:59:30 UTC --- Another example where gcc fails to ifcvt (it succeeds only if there is a single statement in each of the if and else blocks). void ref_int_p(int *); void foo (int j, int k) { int i; int a[200], b[100]; i = 0; for ( ; i < 100; i++) { if (j < k) { a[i] = j; b[i] = j; } else { a[i] = 1; b[i] = 1; } } ref_int_p (&a[0]); ref_int_p (&b[0]); return; } It (the loop) should be generated like: LBB0_1: cmpl %esi, %edi movl $1, %ecx cmovll %edi, %ecx movl %ecx, -800(%rbp,%rax,4) movl %ecx, -1200(%rbp,%rax,4) incq %rax cmpq $100, %rax jne .LBB0_1 David
[Bug target/46200] [4.6 Regression] optimization regression in simple pointer loop
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=46200 davidxl changed: What|Removed |Added Status|ASSIGNED|RESOLVED Resolution||FIXED --- Comment #6 from davidxl 2010-11-03 22:40:56 UTC --- Fixed in r166280.
[Bug tree-optimization/46306] New: inefficient code generated for array accesses
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=46306 Summary: inefficient code generated for array accesses Product: gcc Version: 4.6.0 Status: UNCONFIRMED Keywords: missed-optimization Severity: normal Priority: P3 Component: tree-optimization AssignedTo: unassig...@gcc.gnu.org ReportedBy: davi...@gcc.gnu.org //Example: int foo (int i, int *p, int t) { int p2 = p[i]; int temp = 0; int temp2 = 1; int temp3 = 4; if (p[i+1] > t) { temp = p2; temp2 = p2 + 2; temp3 = p2 + 3; } return p[temp] + p [temp2] + p[temp3]; } Two problems are seen in the code generated by trunk gcc at -O2: 1) all the shift operations are redundant and should be folded in as the stride in the memory operand 2) unnecessary code duplication (may be handled by a pass that converts memory accesses with linear addresses into target memrefs in straight-line code) foo: .LFB0: .cfi_startproc movslq %edi, %rdi movl (%rsi,%rdi,4), %eax cmpl %edx, 4(%rsi,%rdi,4) jle .L3 movslq %eax, %rdi leal 2(%rax), %ecx salq $2, %rdi leal 3(%rax), %edx movslq %ecx, %rcx movl (%rsi,%rdi), %eax salq $2, %rcx movslq %edx, %rdx addl (%rsi,%rcx), %eax salq $2, %rdx addl (%rsi,%rdx), %eax ret .p2align 4,,10 .p2align 3 .L3: movl $16, %edx movl $4, %ecx xorl %edi, %edi movl (%rsi,%rdi), %eax addl (%rsi,%rcx), %eax addl (%rsi,%rdx), %eax ret // The following code is generated by another compiler -- not ideal, but better: foo: .Leh_func_begin0: pushq %rbp .Ltmp0: movq %rsp, %rbp .Ltmp1: movslq %edi, %rax leal 1(%rax), %ecx movslq %ecx, %rcx cmpl %edx, (%rsi,%rcx,4) jg .LBB0_2 movl $1, %eax xorl %ecx, %ecx movl $4, %edx jmp .LBB0_3 .LBB0_2: movslq (%rsi,%rax,4), %rcx leal 3(%rcx), %eax movslq %eax, %rdx leal 2(%rcx), %eax movslq %eax, %rax .LBB0_3: movl (%rsi,%rax,4), %eax addl (%rsi,%rcx,4), %eax addl (%rsi,%rdx,4), %eax popq %rbp ret