Re: [PATCH v3 1/3] lib: find_*_bit reimplementation

George Spelvin Fri, 13 Feb 2015 13:10:25 -0800

> Ohh... I used to think I know something about optimization. I tried build
> Rasmus' 'find_last_bit' against mine on MIPS, and found that sizes are as
> 268 vs 320 bytes. I think the best version is suggested by George, both
> readable, and effective. What about using it? If no objections, I'll
> gather all fixes and upload new patchset this weekend.


I'll happily ack whichever you prefer.  Tightening the code to the
maximum possible fun exercise, but not essential.  However, I finally
got GCC to generate reasonable code with the following:

size_t find_last_bit3(const unsigned long *addr, size_t size)
{
        if (size) {
                unsigned long val = LAST_WORD_MASK(size);
                size_t idx = (size-1) / BITS_PER_LONG;

                do {
                        val &= addr[idx];
                        if (val)
                                return idx * BITS_PER_LONG + __fls(val);
                        val = ~0ul;
                } while (idx--);
        }
        return size;
}

size_t find_last_bit4(const unsigned long *addr, size_t size)
{
        if (size) {
                unsigned long val = LAST_WORD_MASK(size);
                size_t idx = DIV_ROUND_UP(size, BITS_PER_LONG);

                do {
                        val &= addr[idx];
                        if (val)
                                return idx * BITS_PER_LONG + __fls(val);
                        val = ~0ul;
                } while (--idx);
        }
        return size;
}

Which produced, respectively, 76 and 68-byte functions:

0000000000000000 <find_last_bit3>:
   0:   31 c0                   xor    %eax,%eax
   2:   48 85 f6                test   %rsi,%rsi
   5:   74 44                   je     4b <find_last_bit3+0x4b>
   7:   48 8d 46 ff             lea    -0x1(%rsi),%rax
   b:   89 f1                   mov    %esi,%ecx
   d:   48 c7 c2 ff ff ff ff    mov    $0xffffffffffffffff,%rdx
  14:   f7 d9                   neg    %ecx
  16:   48 c1 e8 06             shr    $0x6,%rax
  1a:   48 d3 ea                shr    %cl,%rdx
  1d:   eb 11                   jmp    30 <find_last_bit3+0x30>
  1f:   90                      nop
  20:   48 83 e8 01             sub    $0x1,%rax
  24:   48 c7 c2 ff ff ff ff    mov    $0xffffffffffffffff,%rdx
  2b:   48 39 d0                cmp    %rdx,%rax
  2e:   74 18                   je     48 <find_last_bit3+0x48>
  30:   48 23 14 c7             and    (%rdi,%rax,8),%rdx
  34:   74 ea                   je     20 <find_last_bit3+0x20>
  36:   48 c1 e0 06             shl    $0x6,%rax
  3a:   48 0f bd d2             bsr    %rdx,%rdx
  3e:   48 01 d0                add    %rdx,%rax
  41:   c3                      retq   
  42:   66 0f 1f 44 00 00       nopw   0x0(%rax,%rax,1)
  48:   48 89 f0                mov    %rsi,%rax
  4b:   c3                      retq   
  4c:   0f 1f 40 00             nopl   0x0(%rax)

0000000000000050 <find_last_bit4>:
  50:   31 c0                   xor    %eax,%eax
  52:   48 85 f6                test   %rsi,%rsi
  55:   74 3c                   je     93 <find_last_bit4+0x43>
  57:   48 8d 46 3f             lea    0x3f(%rsi),%rax
  5b:   89 f1                   mov    %esi,%ecx
  5d:   48 c7 c2 ff ff ff ff    mov    $0xffffffffffffffff,%rdx
  64:   f7 d9                   neg    %ecx
  66:   48 c1 e8 06             shr    $0x6,%rax
  6a:   48 d3 ea                shr    %cl,%rdx
  6d:   eb 0d                   jmp    7c <find_last_bit4+0x2c>
  6f:   90                      nop
  70:   48 c7 c2 ff ff ff ff    mov    $0xffffffffffffffff,%rdx
  77:   48 01 d0                add    %rdx,%rax
  7a:   74 14                   je     90 <find_last_bit4+0x40>
  7c:   48 23 14 c7             and    (%rdi,%rax,8),%rdx
  80:   74 ee                   je     70 <find_last_bit4+0x20>
  82:   48 c1 e0 06             shl    $0x6,%rax
  86:   48 0f bd d2             bsr    %rdx,%rdx
  8a:   48 01 d0                add    %rdx,%rax
  8d:   c3                      retq   
  8e:   66 90                   xchg   %ax,%ax
  90:   48 89 f0                mov    %rsi,%rax
  93:   c3                      retq   

The second one omits all the unwanted tests & compares, and even does
something clever with the -1 mask that I didn't think of.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v3 1/3] lib: find_*_bit reimplementation

Reply via email to