On Fri, Apr 16, 2021 at 10:11 PM Eric Dumazet <eduma...@google.com> wrote: > > On Fri, Apr 16, 2021 at 9:44 PM Al Viro <v...@zeniv.linux.org.uk> wrote: > > > > On Fri, Apr 16, 2021 at 12:24:13PM -0700, Eric Dumazet wrote: > > > From: Eric Dumazet <eduma...@google.com> > > > > > > We have to loop only to copy u64 values. > > > After this first loop, we copy at most one u32, one u16 and one byte. > > > > Does it actually yield a better code? > > > > Yes, my patch gives a better code, on actual kernel use-case > > (net-next tree, look at put_cmsg()) > > 5ca: 48 89 0f mov %rcx,(%rdi) > 5cd: 89 77 08 mov %esi,0x8(%rdi) > 5d0: 89 57 0c mov %edx,0xc(%rdi) > 5d3: 48 83 c7 10 add $0x10,%rdi > 5d7: 48 83 c1 f0 add $0xfffffffffffffff0,%rcx > 5db: 48 83 f9 07 cmp $0x7,%rcx > 5df: 76 40 jbe 621 <put_cmsg+0x111> > 5e1: 66 66 66 66 66 66 2e data16 data16 data16 data16 data16 nopw > %cs:0x0(%rax,%rax,1) > 5e8: 0f 1f 84 00 00 00 00 > 5ef: 00 > 5f0: 49 8b 10 mov (%r8),%rdx > 5f3: 48 89 17 mov %rdx,(%rdi) > 5f6: 48 83 c7 08 add $0x8,%rdi > 5fa: 49 83 c0 08 add $0x8,%r8 > 5fe: 48 83 c1 f8 add $0xfffffffffffffff8,%rcx > 602: 48 83 f9 07 cmp $0x7,%rcx > 606: 77 e8 ja 5f0 <put_cmsg+0xe0> > 608: eb 17 jmp 621 <put_cmsg+0x111> > 60a: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1) > 610: 41 8b 10 mov (%r8),%edx > 613: 89 17 mov %edx,(%rdi) > 615: 48 83 c7 04 add $0x4,%rdi > 619: 49 83 c0 04 add $0x4,%r8 > 61d: 48 83 c1 fc add $0xfffffffffffffffc,%rcx > 621: 48 83 f9 03 cmp $0x3,%rcx > 625: 77 e9 ja 610 <put_cmsg+0x100> > 627: eb 1a jmp 643 <put_cmsg+0x133> > 629: 0f 1f 80 00 00 00 00 nopl 0x0(%rax) > 630: 41 0f b7 10 movzwl (%r8),%edx > 634: 66 89 17 mov %dx,(%rdi) > 637: 48 83 c7 02 add $0x2,%rdi > 63b: 49 83 c0 02 add $0x2,%r8 > 63f: 48 83 c1 fe add $0xfffffffffffffffe,%rcx > 643: 48 83 f9 01 cmp $0x1,%rcx > 647: 77 e7 ja 630 <put_cmsg+0x120> > 649: eb 15 jmp 660 <put_cmsg+0x150> > 64b: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1) > 650: 41 0f b6 08 movzbl (%r8),%ecx > 654: 88 0f mov %cl,(%rdi) > 656: 
48 83 c7 01 add $0x1,%rdi > 65a: 49 83 c0 01 add $0x1,%r8 > 65e: 31 c9 xor %ecx,%ecx > 660: 48 85 c9 test %rcx,%rcx > 663: 75 eb jne 650 <put_cmsg+0x140>
After the change, the code is now what we would expect (no jmp around): 5db: 48 83 f9 08 cmp $0x8,%rcx 5df: 72 27 jb 608 <put_cmsg+0xf8> 5e1: 66 66 66 66 66 66 2e data16 data16 data16 data16 data16 nopw %cs:0x0(%rax,%rax,1) 5e8: 0f 1f 84 00 00 00 00 5ef: 00 5f0: 49 8b 10 mov (%r8),%rdx 5f3: 48 89 17 mov %rdx,(%rdi) 5f6: 48 83 c7 08 add $0x8,%rdi 5fa: 49 83 c0 08 add $0x8,%r8 5fe: 48 83 c1 f8 add $0xfffffffffffffff8,%rcx 602: 48 83 f9 08 cmp $0x8,%rcx 606: 73 e8 jae 5f0 <put_cmsg+0xe0> 608: 48 83 f9 04 cmp $0x4,%rcx 60c: 72 11 jb 61f <put_cmsg+0x10f> 60e: 41 8b 10 mov (%r8),%edx 611: 89 17 mov %edx,(%rdi) 613: 48 83 c7 04 add $0x4,%rdi 617: 49 83 c0 04 add $0x4,%r8 61b: 48 83 c1 fc add $0xfffffffffffffffc,%rcx 61f: 48 83 f9 02 cmp $0x2,%rcx 623: 72 13 jb 638 <put_cmsg+0x128> 625: 41 0f b7 10 movzwl (%r8),%edx 629: 66 89 17 mov %dx,(%rdi) 62c: 48 83 c7 02 add $0x2,%rdi 630: 49 83 c0 02 add $0x2,%r8 634: 48 83 c1 fe add $0xfffffffffffffffe,%rcx 638: 48 85 c9 test %rcx,%rcx 63b: 74 05 je 642 <put_cmsg+0x132> 63d: 41 8a 08 mov (%r8),%cl 640: 88 0f mov %cl,(%rdi) As I said, it's minor; I am sure you can come up with something much better! Thanks!
> > > > FWIW, this > > void bar(unsigned); > > void foo(unsigned n) > > { > > while (n >= 8) { > > bar(n); > > n -= 8; > > } > > while (n >= 4) { > > bar(n); > > n -= 4; > > } > > while (n >= 2) { > > bar(n); > > n -= 2; > > } > > while (n >= 1) { > > bar(n); > > n -= 1; > > } > > } > > > > will compile (with -O2) to > > pushq %rbp > > pushq %rbx > > movl %edi, %ebx > > subq $8, %rsp > > cmpl $7, %edi > > jbe .L2 > > movl %edi, %ebp > > .L3: > > movl %ebp, %edi > > subl $8, %ebp > > call bar@PLT > > cmpl $7, %ebp > > ja .L3 > > andl $7, %ebx > > .L2: > > cmpl $3, %ebx > > jbe .L4 > > movl %ebx, %edi > > andl $3, %ebx > > call bar@PLT > > .L4: > > cmpl $1, %ebx > > jbe .L5 > > movl %ebx, %edi > > andl $1, %ebx > > call bar@PLT > > .L5: > > testl %ebx, %ebx > > je .L1 > > addq $8, %rsp > > movl $1, %edi > > popq %rbx > > popq %rbp > > jmp bar@PLT > > .L1: > > addq $8, %rsp > > popq %rbx > > popq %rbp > > ret > > > > i.e. loop + if + if + if...