simple example -O3 optimizer problem?

2014-07-18 Thread Dennis Luehring

tested following code with

http://gcc.godbolt.org/
tested with
g++-4.8 (Ubuntu 4.8.1.2ubuntu1~12.04) 4.8.1
g++ (GCC) 4.9.0 20130909 (experimental)

and the result with -O3 + defined USE_ITER seems to be a little bit long

--

// Adds `a` to the caller-owned accumulator `dummy` (passed by reference).
static void foo(int a, int& dummy)
{
  dummy += a;
}

#define USE_ITER

// Reproducer: sums 10 ints read starting at the address of the `argv`
// parameter and returns the total. USE_ITER selects which of two equivalent
// loop forms is compiled (pointer iteration vs. indexing).
int main(int argc, char** argv)
{
  //--
  //anti-optimizer
  // The address of `argv` is reinterpreted as an int array so that the
  // summed values are not compile-time constants.
  // NOTE(review): reading 10 ints through this cast goes beyond the `argv`
  // object itself; the thread questions whether this is undefined behaviour.
  int dummy = 0;
  int* array = (int*)&argv;
  //--

#if defined(USE_ITER)
  // Pointer-iterator form: walk the half-open range [array, array + 10).
  int* pend = &array[10];
  for(int* p = &array[0]; p < pend; ++p) foo(*p, dummy);
#else
  // Index form: the same 10 elements addressed by subscript.
  for(int i = 0; i < 10; ++i) foo(array[i], dummy);
#endif

  return dummy;
}

-

with -O2
with/without define USE_ITER

result:

main:
    lea rdx, [rsp-8]
    lea rcx, [rsp+32]
    mov QWORD PTR [rsp-8], rsi
    xor eax, eax
.L3:
    add eax, DWORD PTR [rdx]
    add rdx, 4
    cmp rdx, rcx
    jb .L3
    rep; ret

with -O3

without define USE_ITER

main:
    mov rax, rsi
    shr rax, 32
    add eax, esi
    add eax, DWORD PTR [rsp]
    add eax, DWORD PTR [rsp+4]
    add eax, DWORD PTR [rsp+8]
    add eax, DWORD PTR [rsp+12]
    add eax, DWORD PTR [rsp+16]
    add eax, DWORD PTR [rsp+20]
    add eax, DWORD PTR [rsp+24]
    add eax, DWORD PTR [rsp+28]
    ret

with define USE_ITER

main:
    lea rdi, [rsp-16]
    lea rax, [rsp+27]
    mov QWORD PTR [rsp-16], rsi
    lea r8, [rsp+24]
    mov ecx, 1
    lea rdx, [rdi+4]
    lea rsi, [rdi+1]
    sub rax, rdx
    mov rdx, rdi
    shr rax, 2
    add rax, 1
    cmp rsi, r8
    cmovbe rcx, rax
    and edx, 15
    shr rdx, 2
    cmp rcx, rdx
    cmovbe rdx, rcx
    cmp rax, 8
    ja .L30
.L2:
    mov rdx, rcx
.L11:
    cmp rdx, 1
    mov eax, DWORD PTR [rsp-16]
    je .L13
    add eax, DWORD PTR [rsp-12]
    cmp rdx, 2
    je .L14
    add eax, DWORD PTR [rsp-8]
    cmp rdx, 3
    je .L15
    add eax, DWORD PTR [rsp-4]
    cmp rdx, 4
    je .L16
    add eax, DWORD PTR [rsp]
    cmp rdx, 5
    je .L17
    add eax, DWORD PTR [rsp+4]
    cmp rdx, 6
    je .L18
    add eax, DWORD PTR [rsp+8]
    cmp rdx, 7
    je .L19
    add eax, DWORD PTR [rsp+12]
    lea rsi, [rsp+16]
.L4:
    cmp rcx, rdx
    je .L23
.L3:
    sub rcx, rdx
    mov r9, rcx
    shr r9, 2
    lea r10, [0+r9*4]
    test r10, r10
    je .L6
    lea rdx, [rdi+rdx*4]
    cmp r9, 1
    movdqu xmm0, XMMWORD PTR [rdx]
    jbe .L7
    movdqu xmm1, XMMWORD PTR [rdx+16]
    cmp r9, 2
    paddd xmm0, xmm1
    je .L7
    movdqu xmm1, XMMWORD PTR [rdx+32]
    paddd xmm0, xmm1
.L7:
    movdqa xmm2, xmm0
    lea rsi, [rsi+r10*4]
    psrldq xmm2, 8
    paddd xmm0, xmm2
    movdqa xmm3, xmm0
    psrldq xmm3, 4
    paddd xmm0, xmm3
    movd edx, xmm0
    add eax, edx
    cmp rcx, r10
    je .L23
.L6:
    lea rdx, [rsi+4]
    add eax, DWORD PTR [rsi]
    cmp r8, rdx
    jbe .L23
    lea rdx, [rsi+8]
    add eax, DWORD PTR [rsi+4]
    cmp r8, rdx
    jbe .L31
    add eax, DWORD PTR [rsi+8]
    ret
.L23:
    rep; ret
.L30:
    cmp rsi, r8
    ja .L2
    xor eax, eax
    test rdx, rdx
    mov rsi, rdi
    je .L3
    jmp .L11
.L31:
    ret
.L16:
    mov rsi, rsp
    jmp .L4
.L17:
    lea rsi, [rsp+4]
    jmp .L4
.L18:
    lea rsi, [rsp+8]
    jmp .L4
.L19:
    lea rsi, [rsp+12]
    jmp .L4
.L14:
    lea rsi, [rsp-8]
    jmp .L4
.L15:
    lea rsi, [rsp-4]
    jmp .L4
.L13:
    lea rsi, [rdi+4]
    jmp .L4




Re: simple example -O3 optimizer problem?

2014-07-18 Thread Andrew Haley
On 18/07/14 08:30, Dennis Luehring wrote:
>int* array = (int*)&argv;

This looks like undefined behaviour.  Don't you get a warning?

Andrew.



Re: simple example -O3 optimizer problem?

2014-07-18 Thread Dennis Luehring

Am 18.07.2014 10:29, schrieb Andrew Haley:

On 18/07/14 08:30, Dennis Luehring wrote:
>int* array = (int*)&argv;

This looks like undefined behaviour.  Don't you get a warning?

Andrew.



no warning - it's a valid typed pointer to the stack and I don't care what
the values are

it's just an anti-optimizer trick, nothing more




Re: simple example -O3 optimizer problem?

2014-07-18 Thread Andrew Haley
On 07/18/2014 09:40 AM, Dennis Luehring wrote:
> Am 18.07.2014 10:29, schrieb Andrew Haley:
>> On 18/07/14 08:30, Dennis Luehring wrote:
>>>int* array = (int*)&argv;
>>
>> This looks like undefined behaviour.  Don't you get a warning?
> 
> no warning - its an valid typed pointer to stack and i don't care what 
> the values are
> its just an anti-optimizer trick nothing more

And if you give it the correct type, does that make any difference?

Andrew.



Re: simple example -O3 optimizer problem?

2014-07-18 Thread Dennis Luehring

Am 18.07.2014 11:14, schrieb Andrew Haley:

On 07/18/2014 09:40 AM, Dennis Luehring wrote:
> Am 18.07.2014 10:29, schrieb Andrew Haley:
>> On 18/07/14 08:30, Dennis Luehring wrote:
>>>int* array = (int*)&argv;
>>
>> This looks like undefined behaviour.  Don't you get a warning?
>
> no warning - its an valid typed pointer to stack and i don't care what
> the values are
> its just an anti-optimizer trick nothing more

And if you give it the correct type, doe that make any difference?




// Adds `a` to the caller-owned accumulator `dummy` (passed by reference).
static void foo(int a, int& dummy)
{
  dummy += a;
}

#define USE_CHAR
#define USE_ITER

// Extended reproducer: sums 10 elements of type T and returns the total.
// USE_CHAR selects the element type and data source; USE_ITER selects which
// of two equivalent loop forms is compiled.
int main(int argc, char** argv)
{
  //--
  //anti-optimizer
  // The input is taken from argv so that the summed values are not
  // compile-time constants.
  int dummy = 0;
#if defined(USE_CHAR)
  // Correctly typed variant: read chars from the first argument string.
  typedef char T;
  T* array = argv[0];
#else
  // Original variant: reinterpret the address of `argv` as an int array.
  typedef int T;
  T* array = (int*)&argv; // undefined?
#endif
  //--

#if defined(USE_ITER)
  // Pointer-iterator form: walk the half-open range [array, array + 10).
  T* pend = &array[10];
  for(T* p = &array[0]; p < pend; ++p) foo(*p, dummy);
#else
  // Index form: the same 10 elements addressed by subscript.
  for(int i = 0; i < 10; ++i) foo(array[i], dummy);
#endif

  return dummy;
}

with USE_CHAR, with/without USE_ITER, -O3 gives:

main:
    mov rdx, QWORD PTR [rsi]
    movsx ecx, BYTE PTR [rdx]
    movsx eax, BYTE PTR [rdx+1]
    add eax, ecx
    movsx ecx, BYTE PTR [rdx+2]
    add eax, ecx
    movsx ecx, BYTE PTR [rdx+3]
    add eax, ecx
    movsx ecx, BYTE PTR [rdx+4]
    add eax, ecx
    movsx ecx, BYTE PTR [rdx+5]
    add eax, ecx
    movsx ecx, BYTE PTR [rdx+6]
    add eax, ecx
    movsx ecx, BYTE PTR [rdx+7]
    add eax, ecx
    movsx ecx, BYTE PTR [rdx+8]
    movsx edx, BYTE PTR [rdx+9]
    add eax, ecx
    add eax, edx
    ret

without USE_CHAR, without USE_ITER, -O3 gives:

main:
    mov rax, rsi
    shr rax, 32
    add eax, esi
    add eax, DWORD PTR [rsp]
    add eax, DWORD PTR [rsp+4]
    add eax, DWORD PTR [rsp+8]
    add eax, DWORD PTR [rsp+12]
    add eax, DWORD PTR [rsp+16]
    add eax, DWORD PTR [rsp+20]
    add eax, DWORD PTR [rsp+24]
    add eax, DWORD PTR [rsp+28]
    ret

without USE_CHAR, with USE_ITER, -O3 gives

main:
    lea rdi, [rsp-16]
    lea rax, [rsp+27]
    mov QWORD PTR [rsp-16], rsi
    lea r8, [rsp+24]
    mov ecx, 1
    lea rdx, [rdi+4]
    lea rsi, [rdi+1]
    sub rax, rdx
    mov rdx, rdi
    shr rax, 2
    add rax, 1
    cmp rsi, r8
    cmovbe rcx, rax
    and edx, 15
    shr rdx, 2
    cmp rcx, rdx
    cmovbe rdx, rcx
    cmp rax, 8
    ja .L30
.L2:
    mov rdx, rcx
.L11:
    cmp rdx, 1
    mov eax, DWORD PTR [rsp-16]
    je .L13
    add eax, DWORD PTR [rsp-12]
    cmp rdx, 2
    je .L14
    add eax, DWORD PTR [rsp-8]
    cmp rdx, 3
    je .L15
    add eax, DWORD PTR [rsp-4]
    cmp rdx, 4
    je .L16
    add eax, DWORD PTR [rsp]
    cmp rdx, 5
    je .L17
    add eax, DWORD PTR [rsp+4]
    cmp rdx, 6
    je .L18
    add eax, DWORD PTR [rsp+8]
    cmp rdx, 7
    je .L19
    add eax, DWORD PTR [rsp+12]
    lea rsi, [rsp+16]
.L4:
    cmp rcx, rdx
    je .L23
.L3:
    sub rcx, rdx
    mov r9, rcx
    shr r9, 2
    lea r10, [0+r9*4]
    test r10, r10
    je .L6
    lea rdx, [rdi+rdx*4]
    cmp r9, 1
    movdqu xmm0, XMMWORD PTR [rdx]
    jbe .L7
    movdqu xmm1, XMMWORD PTR [rdx+16]
    cmp r9, 2
    paddd xmm0, xmm1
    je .L7
    movdqu xmm1, XMMWORD PTR [rdx+32]
    paddd xmm0, xmm1
.L7:
    movdqa xmm2, xmm0
    lea rsi, [rsi+r10*4]
    psrldq xmm2, 8
    paddd xmm0, xmm2
    movdqa xmm3, xmm0
    psrldq xmm3, 4
    paddd xmm0, xmm3
    movd edx, xmm0
    add eax, edx
    cmp rcx, r10
    je .L23
.L6:
    lea rdx, [rsi+4]
    add eax, DWORD PTR [rsi]
    cmp r8, rdx
    jbe .L23
    lea rdx, [rsi+8]
    add eax, DWORD PTR [rsi+4]
    cmp r8, rdx
    jbe .L31
    add eax, DWORD PTR [rsi+8]
    ret
.L23:
    rep; ret
.L30:
    cmp rsi, r8
    ja .L2
    xor eax, eax
    test rdx, rdx
    mov rsi, rdi
    je .L3
    jmp .L11
.L31:
    ret
.L16:
    mov rsi, rsp
    jmp .L4
.L17:
    lea rsi, [rsp+4]
    jmp .L4
.L18:
    lea rsi, [rsp+8]
    jmp .L4
.L19:
    lea rsi, [rsp+12]
    jmp .L4
.L14:
    lea rsi, [rsp-8]
    jmp .L4
.L15:
    lea rsi, [rsp-4]
    jmp .L4
.L13:
    lea rsi, [rdi+4]
    jmp .L4

but I still don't get what is undefined about (int*)&argv - I
understand that the values are undefined (pointing anywhere in the stack),
but why should the -O3 optimization generate such an amount of code due to
this "undefined behavior"?


strangely, clang 3.4.1 behaves differently

with USE_CHAR, without USE_ITER, -O3 gives:

main:   # @main
    movq (%rsi), %rax
    movsbl (%rax), %ecx
    movsbl 1(%rax), %edx
    addl %ecx, %edx
    movsbl 2(%rax), %ecx
    addl %edx, %ecx
    movsbl 3(%rax), %edx
    addl %ecx, %edx
    movsbl 4(

Re: What would it take to always force indirect inlining?

2014-07-18 Thread Martin Jambor
Hi,

On Thu, Jul 17, 2014 at 12:26:43PM -0500, Daniel Santos wrote:
> I've recently discovered that a function marked always_inline but
> called by pointer won't always be inlined. What would it take to
> assure that this either always happens or generates an error?

Generally, that is the case.  Direct non-inlined calls of
always_inline functions thus indicate that the destination becomes
known only after inlining in the optimization pipeline.  Without more
context, especially without knowing how exactly you call the my_quicksort
function, it is impossible to say why.  If you cannot provide us with
the source code (I would be interested if you can), you can try
compiling with -fdump-tree-all -fdump-ipa-all-details and go through
the generated dump files to find out at what point it becomes known.
The ipa-cp and ipa-inline dumps also contain information about the
inlining context of your function.

Martin

> Unfortunately, it's breaking (well, failing to properly optimize)
> some code where I need the optimizer to see what's in the inline
> function (which is a constant at compile time) so it can optimize it
> into a REP MOVSx loop on x86 or similar on other archs. I kinda
> designed the function so that it would work that way, but it ends up
> making a function call and then can't optimize any further.
> 
> 
> 
> static __always_inline void my_copy(const struct qsort_def *def,
> void *dest, const void *src) {
> const struct size_type __aligned(ALIGN_SIZE) *s = src;
> struct size_type __aligned(ALIGN_SIZE) *d = dest;
> //fprintf(stderr, "copy: d=%p, s=%p\n", d, s);
> *d = *s;
> 0020  mov(%rdx),%rax
> 0023  mov%rax,(%rsi)
> 0026  retq
> 
> ...
> 
> static __always_inline __flatten void
> _quicksort_ror(const struct qsort_def *def, void *left, void *right,
> void *tmp, size_t tmp_size) {
> const size_t size = def->size;
> char *r = right;
> char *l = left;
> const ssize_t dist = (r - l) / (ssize_t)def->size; /* left
> to right offset */
> 03c1  sub %rbx,%rdx
> 03c4  test %rdx,%rdx
> 03c7  lea 0x7(%rdx),%r12
> 03cb  cmovns %rdx,%r12
> 
> if (size <= tmp_size) {
> ssize_t i;
> char *left_minus_one = l - size;
> 
> def->copy(def, tmp, r);
> 03cf  mov %r13,%rdx
> static __always_inline __flatten void
> _quicksort_ror(const struct qsort_def *def, void *left, void *right,
> void *tmp, size_t tmp_size) {
> const size_t size = def->size;
> char *r = right;
> char *l = left;
> const ssize_t dist = (r - l) / (ssize_t)def->size; /* left
> to right offset */
> 03d2  sar $0x3,%r12
> 
> if (size <= tmp_size) {
> ssize_t i;
> char *left_minus_one = l - size;
> 
> def->copy(def, tmp, r);
> 03d6  callq 0020
> 
> /* rep movs-friendly loop */
> for (i = dist; i; --i) {
> 03db  test %r12,%r12
> 03de  je 041d
> 
> 03e0  lea 0x0(,%r12,8),%rdx
> 03e8  lea (%rbx,%rdx,1),%r14
> 03ec  add %rdx,%r15
> 03ef  xchg   %ax,%ax
> 03f1  data32 data32 data32
> data32 data32 nopw %cs:0x0(%rax,%rax,1)
> def->copy(def, &l[i * size],
> &left_minus_one[i * size]);
> 0400  mov %r15,%rdx
> 0403  mov %r14,%rsi
> 0406  mov $0x0,%edi
> 407: R_X86_64_32.rodata+0x20
> 040b  callq 0020
> 
> 0410  sub $0x8,%r14
> 0414  sub $0x8,%r15
> ssize_t i;
> char *left_minus_one = l - size;
> 
> def->copy(def, tmp, r);
> /* rep movs-friendly loop */
> for (i = dist; i; --i) {
> 0418  dec%r12
> 041b  jne 0400
> 
> def->copy(def, &l[i * size],
> &left_minus_one[i * size]);
> }
> 
> def->copy(def, left, tmp);
> 041d  mov -0x450(%rbp),%rdx
> 0424  mov %rbx,%rsi
> 0427  mov $0x0,%edi
> 428: R_X86_64_32.rodata+0x20
> 042c  callq 0020
> 
> 0431  jmpq 0378
> 
> 
> 
> 
> If the optimizer had the body of my_copy above, it should be able to
> use two pointers (one for l and another for left_minus_one) and a
> single index as long as size is either 1, 2, 4 or 8.  All and all, I
> need to refine my strategy, but if I can solve this little part, it
> will help greatly.
> 
> Thanks,
> Daniel


Re: mn10300, invariants on DEP_PRO/DEP_CON and on TARGET_SCHED_ADJUST_COST params

2014-07-18 Thread Jeff Law

On 07/08/14 14:21, David Malcolm wrote:

> [CCing nickc, who wrote the mn10300 hook in question]
>
> I'm experimenting with separating out instructions from expressions in
> RTL; see [1] for more info on that.
>
> I noticed that mn10300 has this implementation of a target hook:
>    #define TARGET_SCHED_ADJUST_COST mn10300_adjust_sched_cost
>
> Within mn10300_adjust_sched_cost (where "insn" and "dep" are the first
> and third parameters respectively), there's this code:
>
>    if (GET_CODE (insn) == PARALLEL)
>      insn = XVECEXP (insn, 0, 0);
>
>    if (GET_CODE (dep) == PARALLEL)
>      dep = XVECEXP (dep, 0, 0);
>
> However, I believe that these params of this hook ("insn") always
> satisfy INSN_CHAIN_CODE_P, and so can't have code PARALLEL.  [Nick: did
> those conditionals ever get triggered, or was this defensive coding?]
>
> Specifically, the hook is called from haifa-sched.c:dep_cost_1 on the
> DEP_CON and DEP_PRO of a dep_t.
>
> It's my belief that DEP_CON and DEP_PRO always satisfy INSN_CHAIN_CODE_P
> - and on every other config so far that seems to be the case.
>
> Is my belief about DEP_CON/DEP_PRO correct?  (or, at least, consistent
> with other gcc developers' views on the matter :))  My patch kit [2] has
> this expressed in the type system as of [3], so if I'm incorrect about
> this I'd prefer to know ASAP.
>
> Similarly, do the first and third params of TARGET_SCHED_ADJUST_COST
> also satisfy INSN_CHAIN_CODE_P?

I suspect these should be
if (GET_CODE (PATTERN (insn)) == PARALLEL)

and similarly for DEP

That way they're doing something sensible for define_insns which have 
patterns that are PARALLELs (where typically the first element is the 
only one that is interesting).


Feel free to make that change independent of the RTL classes work you're 
doing and consider it pre-approved with some sensible sanity testing.


Jeff



Re: What would it take to always force indirect inlining?

2014-07-18 Thread Daniel Santos


On 07/18/2014 04:55 AM, Martin Jambor wrote:

Hi,

On Thu, Jul 17, 2014 at 12:26:43PM -0500, Daniel Santos wrote:

I've recently discovered that a function marked always_inline but
called by pointer won't always be inlined. What would it take to
assure that this either always happens or generates an error?

Generally, that is the case.  Direct non-inlined calls of
always_inline functions thus indicate that the destination become
known only after inlining in the optimization pipeline.  Without more
context, especially without knowing how exactly you call my_quicksort
function, it is impossible to say why.  If you cannot povide us with
the source code (I would be interested if you can), you can try
compiling with -fdump-tree-all -fdump-ipa-all-details and go through
the generated dump files to find out at what point it becomes known.
ipa-cp and ipa-inline dumps also contain information about what
inlining context of your function.

Martin


Thank you. I most certainly don't mind sharing all of the sources, they 
are just not in a refined state. I have a little time to import them 
into github now, but if I run out of time, I'll get them in later today.


Daniel


Successfull bootstrap 4.9.1 (GCC) native x86_64-w64-mingw32

2014-07-18 Thread Rainer Emrich
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

msys2 x86_64-w64-mingw32 setup on windows 7 system with an Intel(R) Xeon(R) E5
1660 v2 cpu, see http://sourceforge.net/p/mingw-w64/mailman/message/32493707/
mingw-w64 is trunk as of 21st of May.

testsuite results see
https://gcc.gnu.org/ml/gcc-testresults/2014-07/msg01531.html

Rainer
-BEGIN PGP SIGNATURE-
Version: GnuPG v2.0.22 (MingW32)
Comment: Using GnuPG with Thunderbird - http://www.enigmail.net/

iQEcBAEBAgAGBQJTyVlYAAoJEB3HOsWs+KJbdacH/RSH4T+36nEG1/pSW/yUMmwV
TRhg1ZvduL8MRmLVaPJE1xOU6SdqY/GQTDTKNCsUGG+Vk7OZ6hiioVivn2fpW4NT
GC/IWxsPbvtOYc15h6bpn/xbfo3W5dQGhrsbnzx0GQIhM/Vmyzwr/wMdRMilcXPC
Qt89v8tcsS72XaPBdl6p6r1WnptyH1wWNTJs9Z+O3OELi2Wuvi3yJ1fOboTUOenQ
Ld6vts6uaO3jBbJM1JITvQ/csV620GwG7T/xmW1IbOdirf49S2/sQBLSRoDIp31L
lpbVlRHC3EVjofp0v77F3jHt1pON57EuaG09fb45f7e+v4kxq1RcBRwzBwPVf9g=
=1YOA
-END PGP SIGNATURE-


Re: What would it take to always force indirect inlining?

2014-07-18 Thread Daniel Santos


On 07/18/2014 04:55 AM, Martin Jambor wrote:

Hi,

On Thu, Jul 17, 2014 at 12:26:43PM -0500, Daniel Santos wrote:

I've recently discovered that a function marked always_inline but
called by pointer won't always be inlined. What would it take to
assure that this either always happens or generates an error?

Generally, that is the case.  Direct non-inlined calls of
always_inline functions thus indicate that the destination become
known only after inlining in the optimization pipeline.  Without more
context, especially without knowing how exactly you call my_quicksort
function, it is impossible to say why.  If you cannot povide us with
the source code (I would be interested if you can), you can try
compiling with -fdump-tree-all -fdump-ipa-all-details and go through
the generated dump files to find out at what point it becomes known.
ipa-cp and ipa-inline dumps also contain information about what
inlining context of your function.

Martin


Hello. I've given it a new home: https://github.com/daniel-santos/cmeta. 
I'm using cmake for the first time and I'm not sure that I like it. For 
now, you have to *manually* specify your CFLAGS. This is what I'm using:


CFLAGS="-std=gnu11 -march=native -g3 -pipe -Wall -Wextra -Wcast-align 
-Wno-unused-parameter -O2 -DNDEBUG"


My -march=native is a Phenom 9850 and I did this on gcc 4.8.3. I have
4.9.0 installed but haven't examined that one just yet; I will
shortly. Also please note that this is currently not the world's
prettiest code! (sorry, I guess I'm vain)


Daniel


Re: Question about sysroot and fixincludes

2014-07-18 Thread Andrew Hsieh
See bug entry for more details: b.android.com/73728

On Thu, Jul 17, 2014 at 8:39 AM, Andrew Hsieh  wrote:
> Bionic headers prior to android-L (for L-preview) aren't changed
> except for bug fixes since last major update in android-9 (gingerbread
> era), the API level used to build all 32-bit NDK toolchains, so it
> would be interesting to see how fixincluded differs should we use
> different API level > 9.
>
> Since JB, bionic is overhauled but not stable enough for NDK until
> android-L.  It's very possible that fixincluded for android-9 is wrong
> for android-L.  Do you have an example?  We can fix up bionic headers for
> all levels in NDK to make fixincluded consistent if not gone
> completely
>
> On Wed, Jul 16, 2014 at 8:08 PM, Alexander Ivchenko  
> wrote:
>> Hi, I have a question about sysroot and fixincludes.
>>
>> On Android there are different API levels (like android-9, android-10
>> etc) that match different versions of OS. Gcc from NDK is configured
>> using sysroot for android-9 and the convenient way for compiling for,
>> say, android-19 was by providing the sysroot to android-19 as a
>> command line option (--sysroot).
>>
>> However, the header from the sysroot with which gcc was configured
>> could be "fixincluded", and, when I provide a different sysroot as a
>> command line option, "fixincluded" header could replace the actual
>> header from the specified sysroot - that is the root-cause of certain
>> problems.
>>
>> Should search in 'include-fixed' be disabled when sysroot command line
>> option is specified?
>>
>> --Alexander
>
>
>
> --
> Thanks,
> Andrew



-- 
Thanks,
Andrew