I've recently discovered that a function marked always_inline but called through a pointer won't always be inlined. What would it take to ensure that it either always is, or that an error is generated? Unfortunately, this is breaking (well, failing to properly optimize) some code where I need the optimizer to see what's in the inline function (which is a constant at compile time) so it can optimize it into a REP MOVSx loop on x86 or similar on other archs. I designed the function with exactly that in mind, but as it stands it ends up making a function call and the compiler can't optimize any further.
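
To make the pattern concrete, here is a stripped-down sketch of the shape of the code (names and types here are illustrative, not my real code): the callee is marked always_inline, but it is only ever reached through a function pointer stored in a const struct, so the compiler has to constant-propagate that pointer before it has anything it can inline.

#include <stddef.h>

struct copy_ops {
        void (*copy)(void *dest, const void *src, size_t size);
};

/* always_inline only helps at call sites where the callee is known;
 * taking the function's address still forces an out-of-line copy. */
static inline __attribute__((always_inline))
void copy8(void *dest, const void *src, size_t size)
{
        (void)size;
        *(unsigned long long *)dest = *(const unsigned long long *)src;
}

static const struct copy_ops ops = { .copy = copy8 };

void demo_copy(void *dest, const void *src)
{
        /* Unless the optimizer resolves ops.copy to copy8 and then inlines
         * it, this ends up as a call to the out-of-line copy8 body. */
        ops.copy(dest, src, sizeof(unsigned long long));
}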


static __always_inline void my_copy(const struct qsort_def *def, void *dest, const void *src) {
        const struct size_type __aligned(ALIGN_SIZE) *s = src;
        struct size_type __aligned(ALIGN_SIZE) *d = dest;
//        fprintf(stderr, "copy: d=%p, s=%p\n", d, s);
        *d = *s;
0000000000000020 <my_copy> mov    (%rdx),%rax
0000000000000023 <my_copy+0x3> mov    %rax,(%rsi)
0000000000000026 <my_copy+0x6> retq

...

static __always_inline __flatten void
_quicksort_ror(const struct qsort_def *def, void *left, void *right, void *tmp, size_t tmp_size) {
        const size_t size = def->size;
        char *r = right;
        char *l = left;
        const ssize_t dist = (r - l) / (ssize_t)def->size; /* left to right offset */
00000000000003c1 <my_quicksort.isra.0+0x221> sub %rbx,%rdx
00000000000003c4 <my_quicksort.isra.0+0x224> test %rdx,%rdx
00000000000003c7 <my_quicksort.isra.0+0x227> lea 0x7(%rdx),%r12
00000000000003cb <my_quicksort.isra.0+0x22b> cmovns %rdx,%r12

        if (size <= tmp_size) {
                ssize_t i;
                char *left_minus_one = l - size;

                def->copy(def, tmp, r);
00000000000003cf <my_quicksort.isra.0+0x22f> mov %r13,%rdx
static __always_inline __flatten void
_quicksort_ror(const struct qsort_def *def, void *left, void *right, void *tmp, size_t tmp_size) {
        const size_t size = def->size;
        char *r = right;
        char *l = left;
        const ssize_t dist = (r - l) / (ssize_t)def->size; /* left to right offset */
00000000000003d2 <my_quicksort.isra.0+0x232> sar $0x3,%r12

        if (size <= tmp_size) {
                ssize_t i;
                char *left_minus_one = l - size;

                def->copy(def, tmp, r);
00000000000003d6 <my_quicksort.isra.0+0x236> callq 0000000000000020 <my_copy>
                /* rep movs-friendly loop */
                for (i = dist; i; --i) {
00000000000003db <my_quicksort.isra.0+0x23b> test %r12,%r12
00000000000003de <my_quicksort.isra.0+0x23e> je 000000000000041d <my_quicksort.isra.0+0x27d>
00000000000003e0 <my_quicksort.isra.0+0x240> lea 0x0(,%r12,8),%rdx
00000000000003e8 <my_quicksort.isra.0+0x248> lea (%rbx,%rdx,1),%r14
00000000000003ec <my_quicksort.isra.0+0x24c> add %rdx,%r15
00000000000003ef <my_quicksort.isra.0+0x24f> xchg   %ax,%ax
00000000000003f1 <my_quicksort.isra.0+0x251> data32 data32 data32 data32 data32 nopw %cs:0x0(%rax,%rax,1)
                        def->copy(def, &l[i * size], &left_minus_one[i * size]);
0000000000000400 <my_quicksort.isra.0+0x260> mov %r15,%rdx
0000000000000403 <my_quicksort.isra.0+0x263> mov %r14,%rsi
0000000000000406 <my_quicksort.isra.0+0x266> mov $0x0,%edi
                        407: R_X86_64_32        .rodata+0x20
000000000000040b <my_quicksort.isra.0+0x26b> callq 0000000000000020 <my_copy>
0000000000000410 <my_quicksort.isra.0+0x270> sub $0x8,%r14
0000000000000414 <my_quicksort.isra.0+0x274> sub $0x8,%r15
                ssize_t i;
                char *left_minus_one = l - size;

                def->copy(def, tmp, r);
                /* rep movs-friendly loop */
                for (i = dist; i; --i) {
0000000000000418 <my_quicksort.isra.0+0x278> dec    %r12
000000000000041b <my_quicksort.isra.0+0x27b> jne 0000000000000400 <my_quicksort.isra.0+0x260>
                        def->copy(def, &l[i * size], &left_minus_one[i * size]);
                }

                def->copy(def, left, tmp);
000000000000041d <my_quicksort.isra.0+0x27d> mov -0x450(%rbp),%rdx
0000000000000424 <my_quicksort.isra.0+0x284> mov %rbx,%rsi
0000000000000427 <my_quicksort.isra.0+0x287> mov $0x0,%edi
                        428: R_X86_64_32        .rodata+0x20
000000000000042c <my_quicksort.isra.0+0x28c> callq 0000000000000020 <my_copy>
0000000000000431 <my_quicksort.isra.0+0x291> jmpq 0000000000000378 <my_quicksort.isra.0+0x1d8>



If the optimizer had the body of my_copy above, it should be able to use two pointers (one for l and another for left_minus_one) and a single index, as long as size is 1, 2, 4 or 8. All in all, I need to refine my strategy, but if I can solve this little part, it will help greatly.
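
For illustration, here is roughly what that loop boils down to once the copy body and the constant size are visible (a sketch only: ELEM_SIZE stands in for def->size, and I'm writing the copies as constant-size memcpy calls, which the compiler already expands into single moves; my real code goes through def->copy):

#include <string.h>
#include <sys/types.h>

#define ELEM_SIZE 8     /* stand-in for def->size, known at compile time */

static void rotate_right(char *left, char *right, void *tmp)
{
        const ssize_t dist = (right - left) / ELEM_SIZE;
        char *left_minus_one = left - ELEM_SIZE;
        ssize_t i;

        memcpy(tmp, right, ELEM_SIZE);           /* save the right-most element */
        for (i = dist; i; --i)                   /* shift everything one slot right */
                memcpy(&left[i * ELEM_SIZE],
                       &left_minus_one[i * ELEM_SIZE], ELEM_SIZE);
        memcpy(left, tmp, ELEM_SIZE);            /* re-insert the saved element */
}

With everything reduced to constant-size copies like this, the whole loop is just a backward block move of dist elements, which is the sort of thing I'd hope the compiler can turn into a REP MOVS-style sequence instead of dist separate calls.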

Thanks,
Daniel
