Hi,

On Thu, Jul 17, 2014 at 12:26:43PM -0500, Daniel Santos wrote:
> I've recently discovered that a function marked always_inline but
> called by pointer won't always be inlined. What would it take to
> assure that this either always happens or generates an error?

Generally, that is the case.  A direct, non-inlined call to an
always_inline function therefore indicates that the call destination
became known only after the inlining phase of the optimization
pipeline.  Without more context, especially without knowing how
exactly you call your my_quicksort function, it is impossible to say
why.  If you cannot provide us with the source code (I would be
interested if you can), you can try compiling with -fdump-tree-all
-fdump-ipa-all-details and go through the generated dump files to
find out at what point the destination becomes known.  The ipa-cp and
ipa-inline dumps also contain information about the inlining context
of your function.
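
For illustration, the pattern below (a minimal sketch with made-up
names, not your actual code) reproduces the situation: the call
target is hidden behind a struct member, so whether the always_inline
request can be honoured depends on when the optimizers turn the
indirect call into a direct one:

    /* sketch.c -- hypothetical reproducer */
    struct ops {
            void (*copy)(void *dest, const void *src);
    };

    static inline __attribute__((always_inline))
    void copy8(void *dest, const void *src)
    {
            *(unsigned long *)dest = *(const unsigned long *)src;
    }

    static const struct ops my_ops = { .copy = copy8 };

    void run(void *d, const void *s)
    {
            /* indirect call; it can only be inlined once the
               optimizers prove that my_ops.copy == copy8 */
            my_ops.copy(d, s);
    }

Compiling that with

    gcc -O2 -c sketch.c -fdump-tree-all -fdump-ipa-all-details

produces the dump files mentioned above next to the object file.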

Martin

> Unfortunately, it's breaking (well, failing to optimize properly)
> some code where I need the optimizer to see what's in the inline
> function (whose copy size is a compile-time constant) so it can
> optimize it into a REP MOVSx loop on x86 or similar on other archs,
> as illustrated below.  I kinda designed the function so that it
> would work that way, but it ends up making a function call and then
> can't optimize any further.
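> 
> To illustrate what I mean (hand-written example, not part of the
> real code): when the copy size is a compile-time constant and the
> optimizer can see it, gcc expands the copy inline instead of
> emitting a call:
> 
>         /* illustration only: constant-size copy */
>         static void copy_elem(void *d, const void *s)
>         {
>                 __builtin_memcpy(d, s, 8);  /* expands to a single
>                                                64-bit load/store */
>         }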
> 
> 
> 
> static __always_inline void my_copy(const struct qsort_def *def,
>                                     void *dest, const void *src) {
>         const struct size_type __aligned(ALIGN_SIZE) *s = src;
>         struct size_type __aligned(ALIGN_SIZE) *d = dest;
> //        fprintf(stderr, "copy: d=%p, s=%p\n", d, s);
>         *d = *s;
> 0000000000000020 <my_copy> mov    (%rdx),%rax
> 0000000000000023 <my_copy+0x3> mov    %rax,(%rsi)
> 0000000000000026 <my_copy+0x6> retq
> 
> ...
> 
> static __always_inline __flatten void
> _quicksort_ror(const struct qsort_def *def, void *left, void *right,
>                void *tmp, size_t tmp_size) {
>         const size_t size = def->size;
>         char *r = right;
>         char *l = left;
>         const ssize_t dist = (r - l) / (ssize_t)def->size; /* left to right offset */
> 00000000000003c1 <my_quicksort.isra.0+0x221> sub %rbx,%rdx
> 00000000000003c4 <my_quicksort.isra.0+0x224> test %rdx,%rdx
> 00000000000003c7 <my_quicksort.isra.0+0x227> lea 0x7(%rdx),%r12
> 00000000000003cb <my_quicksort.isra.0+0x22b> cmovns %rdx,%r12
> 
>         if (size <= tmp_size) {
>                 ssize_t i;
>                 char *left_minus_one = l - size;
> 
>                 def->copy(def, tmp, r);
> 00000000000003cf <my_quicksort.isra.0+0x22f> mov %r13,%rdx
> static __always_inline __flatten void
> _quicksort_ror(const struct qsort_def *def, void *left, void *right,
>                void *tmp, size_t tmp_size) {
>         const size_t size = def->size;
>         char *r = right;
>         char *l = left;
>         const ssize_t dist = (r - l) / (ssize_t)def->size; /* left to right offset */
> 00000000000003d2 <my_quicksort.isra.0+0x232> sar $0x3,%r12
> 
>         if (size <= tmp_size) {
>                 ssize_t i;
>                 char *left_minus_one = l - size;
> 
>                 def->copy(def, tmp, r);
> 00000000000003d6 <my_quicksort.isra.0+0x236> callq 0000000000000020 <my_copy>
>                 /* rep movs-friendly loop */
>                 for (i = dist; i; --i) {
> 00000000000003db <my_quicksort.isra.0+0x23b> test %r12,%r12
> 00000000000003de <my_quicksort.isra.0+0x23e> je 000000000000041d <my_quicksort.isra.0+0x27d>
> 00000000000003e0 <my_quicksort.isra.0+0x240> lea 0x0(,%r12,8),%rdx
> 00000000000003e8 <my_quicksort.isra.0+0x248> lea (%rbx,%rdx,1),%r14
> 00000000000003ec <my_quicksort.isra.0+0x24c> add %rdx,%r15
> 00000000000003ef <my_quicksort.isra.0+0x24f> xchg   %ax,%ax
> 00000000000003f1 <my_quicksort.isra.0+0x251> data32 data32 data32 data32 data32 nopw %cs:0x0(%rax,%rax,1)
>                         def->copy(def, &l[i * size], &left_minus_one[i * size]);
> 0000000000000400 <my_quicksort.isra.0+0x260> mov %r15,%rdx
> 0000000000000403 <my_quicksort.isra.0+0x263> mov %r14,%rsi
> 0000000000000406 <my_quicksort.isra.0+0x266> mov $0x0,%edi
>                         407: R_X86_64_32        .rodata+0x20
> 000000000000040b <my_quicksort.isra.0+0x26b> callq 0000000000000020 <my_copy>
> 0000000000000410 <my_quicksort.isra.0+0x270> sub $0x8,%r14
> 0000000000000414 <my_quicksort.isra.0+0x274> sub $0x8,%r15
>                 ssize_t i;
>                 char *left_minus_one = l - size;
> 
>                 def->copy(def, tmp, r);
>                 /* rep movs-friendly loop */
>                 for (i = dist; i; --i) {
> 0000000000000418 <my_quicksort.isra.0+0x278> dec    %r12
> 000000000000041b <my_quicksort.isra.0+0x27b> jne 0000000000000400 <my_quicksort.isra.0+0x260>
>                         def->copy(def, &l[i * size], &left_minus_one[i * size]);
>                 }
> 
>                 def->copy(def, left, tmp);
> 000000000000041d <my_quicksort.isra.0+0x27d> mov -0x450(%rbp),%rdx
> 0000000000000424 <my_quicksort.isra.0+0x284> mov %rbx,%rsi
> 0000000000000427 <my_quicksort.isra.0+0x287> mov $0x0,%edi
>                         428: R_X86_64_32        .rodata+0x20
> 000000000000042c <my_quicksort.isra.0+0x28c> callq 0000000000000020 <my_copy>
> 0000000000000431 <my_quicksort.isra.0+0x291> jmpq 0000000000000378 <my_quicksort.isra.0+0x1d8>
> 
> 
> 
> If the optimizer had the body of my_copy above, it should be able to
> use two pointers (one for l and another for left_minus_one) and a
> single index, as long as size is 1, 2, 4 or 8 (see the sketch
> below).  All in all, I need to refine my strategy, but if I can
> solve this little part, it will help greatly.
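> 
> For the size == 8 case, the inlined loop I'm hoping the optimizer
> would see is roughly this (hand-written sketch, using the fact that
> left_minus_one == l - size):
> 
>         /* hypothetical post-inlining form, size == 8 */
>         unsigned long *d = (unsigned long *)l;
>         for (i = dist; i; --i)
>                 d[i] = d[i - 1];  /* shift one element right */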
> 
> Thanks,
> Daniel
