> Ohh... I used to think I know something about optimization. I tried build > Rasmus' 'find_last_bit' against mine on MIPS, and found that sizes are as > 268 vs 320 bytes. I think the best version is suggested by George, both > readable, and effective. What about using it? If no objections, I'll > gather all fixes and upload new patchset this weekend.
I'll happily ack whichever you prefer. Tightening the code to the maximum possible fun exercise, but not essential. However, I finally got GCC to generate reasonable code with the following: size_t find_last_bit3(const unsigned long *addr, size_t size) { if (size) { unsigned long val = LAST_WORD_MASK(size); size_t idx = (size-1) / BITS_PER_LONG; do { val &= addr[idx]; if (val) return idx * BITS_PER_LONG + __fls(val); val = ~0ul; } while (idx--); } return size; } size_t find_last_bit4(const unsigned long *addr, size_t size) { if (size) { unsigned long val = LAST_WORD_MASK(size); size_t idx = DIV_ROUND_UP(size, BITS_PER_LONG); do { val &= addr[idx]; if (val) return idx * BITS_PER_LONG + __fls(val); val = ~0ul; } while (--idx); } return size; } Which produced, respectively, 76 and 68-byte functions: 0000000000000000 <find_last_bit3>: 0: 31 c0 xor %eax,%eax 2: 48 85 f6 test %rsi,%rsi 5: 74 44 je 4b <find_last_bit3+0x4b> 7: 48 8d 46 ff lea -0x1(%rsi),%rax b: 89 f1 mov %esi,%ecx d: 48 c7 c2 ff ff ff ff mov $0xffffffffffffffff,%rdx 14: f7 d9 neg %ecx 16: 48 c1 e8 06 shr $0x6,%rax 1a: 48 d3 ea shr %cl,%rdx 1d: eb 11 jmp 30 <find_last_bit3+0x30> 1f: 90 nop 20: 48 83 e8 01 sub $0x1,%rax 24: 48 c7 c2 ff ff ff ff mov $0xffffffffffffffff,%rdx 2b: 48 39 d0 cmp %rdx,%rax 2e: 74 18 je 48 <find_last_bit3+0x48> 30: 48 23 14 c7 and (%rdi,%rax,8),%rdx 34: 74 ea je 20 <find_last_bit3+0x20> 36: 48 c1 e0 06 shl $0x6,%rax 3a: 48 0f bd d2 bsr %rdx,%rdx 3e: 48 01 d0 add %rdx,%rax 41: c3 retq 42: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1) 48: 48 89 f0 mov %rsi,%rax 4b: c3 retq 4c: 0f 1f 40 00 nopl 0x0(%rax) 0000000000000050 <find_last_bit4>: 50: 31 c0 xor %eax,%eax 52: 48 85 f6 test %rsi,%rsi 55: 74 3c je 93 <find_last_bit4+0x43> 57: 48 8d 46 3f lea 0x3f(%rsi),%rax 5b: 89 f1 mov %esi,%ecx 5d: 48 c7 c2 ff ff ff ff mov $0xffffffffffffffff,%rdx 64: f7 d9 neg %ecx 66: 48 c1 e8 06 shr $0x6,%rax 6a: 48 d3 ea shr %cl,%rdx 6d: eb 0d jmp 7c <find_last_bit4+0x2c> 6f: 90 nop 70: 48 c7 c2 ff ff ff ff mov $0xffffffffffffffff,%rdx 77: 48 01 d0 add %rdx,%rax 7a: 74 14 je 90 <find_last_bit4+0x40> 7c: 48 23 14 c7 and (%rdi,%rax,8),%rdx 80: 74 ee je 70 <find_last_bit4+0x20> 82: 48 c1 e0 06 shl $0x6,%rax 86: 48 0f bd d2 bsr %rdx,%rdx 8a: 48 01 d0 add %rdx,%rax 8d: c3 retq 8e: 66 90 xchg %ax,%ax 90: 48 89 f0 mov %rsi,%rax 93: c3 retq The second one omits all the unwanted tests & compares, and even does something clever with the -1 mask that I didn't think of. -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/