Hi,

On 08/05/2020 02:03 PM, Segher Boessenkool wrote:
Hi!

On Wed, Aug 05, 2020 at 07:09:23AM +0000, Christophe Leroy wrote:
+/*
+ * powerpc specific delta calculation.
+ *
+ * This variant removes the masking of the subtraction because the
+ * clocksource mask of all VDSO capable clocksources on powerpc is U64_MAX
+ * which would result in a pointless operation. The compiler cannot
+ * optimize it away as the mask comes from the vdso data and is not compile
+ * time constant.
+ */

It cannot optimise it because it does not know shift < 32.  The code
below is incorrect for shift equal to 32, fwiw.

Is there a way to tell it ?


+static __always_inline u64 vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 
mult)
+{
+       return (cycles - last) * mult;
+}
+#define vdso_calc_delta vdso_calc_delta
+
+#ifndef __powerpc64__
+static __always_inline u64 vdso_shift_ns(u64 ns, unsigned long shift)
+{
+       u32 hi = ns >> 32;
+       u32 lo = ns;
+
+       lo >>= shift;
+       lo |= hi << (32 - shift);
+       hi >>= shift;


+       if (likely(hi == 0))
+               return lo;

Removing these two lines shouldn't change generated object code?  Or not
make it worse, at least.

I remember it made noticeable difference allthough I can't remember the details. See below with GCC 10.1. At least we see that with those two lines, GCC only sets a 16 bytes stack frame. Without those lines it sets a 32 bytes stack frame and seems to save some values for no reason.

With the two lines:

000006ac <__c_kernel_clock_gettime>:
 6ac:   28 03 00 0f     cmplwi  r3,15
 6b0:   41 81 01 04     bgt     7b4 <__c_kernel_clock_gettime+0x108>
 6b4:   39 40 00 01     li      r10,1
 6b8:   7d 4a 18 30     slw     r10,r10,r3
 6bc:   71 47 08 83     andi.   r7,r10,2179
 6c0:   41 82 01 2c     beq     7ec <__c_kernel_clock_gettime+0x140>
 6c4:   94 21 ff f0     stwu    r1,-16(r1)
 6c8:   54 63 20 36     rlwinm  r3,r3,4,0,27
 6cc:   93 e1 00 0c     stw     r31,12(r1)
 6d0:   7d 85 1a 14     add     r12,r5,r3
 6d4:   80 05 00 00     lwz     r0,0(r5)
 6d8:   70 06 00 01     andi.   r6,r0,1
 6dc:   40 82 00 d4     bne     7b0 <__c_kernel_clock_gettime+0x104>
 6e0:   7d 4d 42 e6     mftbu   r10
 6e4:   7d 6c 42 e6     mftb    r11
 6e8:   7c ed 42 e6     mftbu   r7
 6ec:   7c 0a 38 40     cmplw   r10,r7
 6f0:   40 82 ff f0     bne     6e0 <__c_kernel_clock_gettime+0x34>
 6f4:   80 e5 00 0c     lwz     r7,12(r5)
 6f8:   80 65 00 08     lwz     r3,8(r5)
 6fc:   7c e7 58 10     subfc   r7,r7,r11
 700:   81 65 00 18     lwz     r11,24(r5)
 704:   7d 43 51 10     subfe   r10,r3,r10
 708:   7f e7 58 16     mulhwu  r31,r7,r11
 70c:   7d 4a 59 d6     mullw   r10,r10,r11
 710:   7c e7 59 d6     mullw   r7,r7,r11
 714:   80 6c 00 2c     lwz     r3,44(r12)
 718:   81 6c 00 28     lwz     r11,40(r12)
 71c:   7c e7 18 14     addc    r7,r7,r3
 720:   7d 4a fa 14     add     r10,r10,r31
 724:   80 65 00 1c     lwz     r3,28(r5)
 728:   7d 4a 59 14     adde    r10,r10,r11
 72c:   7c e7 1c 30     srw     r7,r7,r3
 730:   21 63 00 20     subfic  r11,r3,32
 734:   7d 43 1c 31     srw.    r3,r10,r3
 738:   7d 4a 58 30     slw     r10,r10,r11
 73c:   7d 49 3b 78     or      r9,r10,r7
 740:   39 00 00 00     li      r8,0
 744:   40 82 00 84     bne     7c8 <__c_kernel_clock_gettime+0x11c>
 748:   80 6c 00 24     lwz     r3,36(r12)
 74c:   81 45 00 00     lwz     r10,0(r5)
 750:   7c 00 50 40     cmplw   r0,r10
 754:   40 a2 ff 80     bne     6d4 <__c_kernel_clock_gettime+0x28>
 758:   2c 08 00 00     cmpwi   r8,0
 75c:   41 82 00 7c     beq     7d8 <__c_kernel_clock_gettime+0x12c>
 760:   3c e0 c4 65     lis     r7,-15259
 764:   3c 00 3b 9a     lis     r0,15258
 768:   60 e7 36 00     ori     r7,r7,13824
 76c:   60 00 c9 ff     ori     r0,r0,51711
 770:   7c a9 38 14     addc    r5,r9,r7
 774:   7d 48 01 d4     addme   r10,r8
 778:   2c 0a 00 00     cmpwi   r10,0
 77c:   7d 48 53 78     mr      r8,r10
 780:   7c a9 2b 78     mr      r9,r5
 784:   38 c6 00 01     addi    r6,r6,1
 788:   40 82 ff e8     bne     770 <__c_kernel_clock_gettime+0xc4>
 78c:   7c 05 00 40     cmplw   r5,r0
 790:   41 81 ff e0     bgt     770 <__c_kernel_clock_gettime+0xc4>
 794:   7c 66 18 14     addc    r3,r6,r3
 798:   90 64 00 00     stw     r3,0(r4)
 79c:   91 24 00 04     stw     r9,4(r4)
 7a0:   38 60 00 00     li      r3,0
 7a4:   83 e1 00 0c     lwz     r31,12(r1)
 7a8:   38 21 00 10     addi    r1,r1,16
 7ac:   4e 80 00 20     blr
 7b0:   4b ff ff 24     b       6d4 <__c_kernel_clock_gettime+0x28>
 7b4:   38 00 00 f6     li      r0,246
 7b8:   44 00 00 02     sc
 7bc:   40 a3 00 08     bns     7c4 <__c_kernel_clock_gettime+0x118>
 7c0:   7c 63 00 d0     neg     r3,r3
 7c4:   4e 80 00 20     blr
 7c8:   7d 2a 4b 78     mr      r10,r9
 7cc:   7c 68 1b 78     mr      r8,r3
 7d0:   7d 49 53 78     mr      r9,r10
 7d4:   4b ff ff 74     b       748 <__c_kernel_clock_gettime+0x9c>
 7d8:   3d 40 3b 9a     lis     r10,15258
 7dc:   61 4a c9 ff     ori     r10,r10,51711
 7e0:   7c 09 50 40     cmplw   r9,r10
 7e4:   41 81 ff 7c     bgt     760 <__c_kernel_clock_gettime+0xb4>
 7e8:   4b ff ff b0     b       798 <__c_kernel_clock_gettime+0xec>
 7ec:   71 47 00 60     andi.   r7,r10,96
 7f0:   54 69 20 36     rlwinm  r9,r3,4,0,27
 7f4:   7d 25 4a 14     add     r9,r5,r9
 7f8:   40 82 00 14     bne     80c <__c_kernel_clock_gettime+0x160>
 7fc:   71 4a 00 10     andi.   r10,r10,16
 800:   41 a2 ff b4     beq     7b4 <__c_kernel_clock_gettime+0x108>
 804:   38 a5 00 f0     addi    r5,r5,240
 808:   4b ff fe bc     b       6c4 <__c_kernel_clock_gettime+0x18>
 80c:   81 05 00 00     lwz     r8,0(r5)
 810:   71 0a 00 01     andi.   r10,r8,1
 814:   40 a2 ff f8     bne     80c <__c_kernel_clock_gettime+0x160>
 818:   80 69 00 24     lwz     r3,36(r9)
 81c:   81 49 00 2c     lwz     r10,44(r9)
 820:   80 e5 00 00     lwz     r7,0(r5)
 824:   7c 08 38 40     cmplw   r8,r7
 828:   40 a2 ff e4     bne     80c <__c_kernel_clock_gettime+0x160>
 82c:   90 64 00 00     stw     r3,0(r4)
 830:   91 44 00 04     stw     r10,4(r4)
 834:   38 60 00 00     li      r3,0
 838:   4e 80 00 20     blr


Without the two lines:

000006ac <__c_kernel_clock_gettime>:
 6ac:   28 03 00 0f     cmplwi  r3,15
 6b0:   41 81 01 14     bgt     7c4 <__c_kernel_clock_gettime+0x118>
 6b4:   39 20 00 01     li      r9,1
 6b8:   7d 29 18 30     slw     r9,r9,r3
 6bc:   71 2a 08 83     andi.   r10,r9,2179
 6c0:   41 82 01 2c     beq     7ec <__c_kernel_clock_gettime+0x140>
 6c4:   94 21 ff e0     stwu    r1,-32(r1)
 6c8:   54 63 20 36     rlwinm  r3,r3,4,0,27
 6cc:   93 81 00 10     stw     r28,16(r1)
 6d0:   93 a1 00 14     stw     r29,20(r1)
 6d4:   93 c1 00 18     stw     r30,24(r1)
 6d8:   93 e1 00 1c     stw     r31,28(r1)
 6dc:   7c 65 1a 14     add     r3,r5,r3
 6e0:   81 85 00 00     lwz     r12,0(r5)
 6e4:   71 87 00 01     andi.   r7,r12,1
 6e8:   40 82 00 d8     bne     7c0 <__c_kernel_clock_gettime+0x114>
 6ec:   7d 2d 42 e6     mftbu   r9
 6f0:   7c cc 42 e6     mftb    r6
 6f4:   7d 4d 42 e6     mftbu   r10
 6f8:   7c 09 50 40     cmplw   r9,r10
 6fc:   40 82 ff f0     bne     6ec <__c_kernel_clock_gettime+0x40>
 700:   83 83 00 28     lwz     r28,40(r3)
 704:   83 a3 00 2c     lwz     r29,44(r3)
 708:   81 65 00 08     lwz     r11,8(r5)
 70c:   81 05 00 0c     lwz     r8,12(r5)
 710:   83 c5 00 18     lwz     r30,24(r5)
 714:   83 e5 00 1c     lwz     r31,28(r5)
 718:   80 03 00 24     lwz     r0,36(r3)
 71c:   81 45 00 00     lwz     r10,0(r5)
 720:   7c 0c 50 40     cmplw   r12,r10
 724:   40 a2 ff bc     bne     6e0 <__c_kernel_clock_gettime+0x34>
 728:   7d 48 30 10     subfc   r10,r8,r6
 72c:   7c cb 49 10     subfe   r6,r11,r9
 730:   7c c6 f1 d6     mullw   r6,r6,r30
 734:   7d 2a f0 16     mulhwu  r9,r10,r30
 738:   7d 4a f1 d6     mullw   r10,r10,r30
 73c:   7c c6 4a 14     add     r6,r6,r9
 740:   7d 4a e8 14     addc    r10,r10,r29
 744:   7c c6 e1 14     adde    r6,r6,r28
 748:   7c c8 fc 30     srw     r8,r6,r31
 74c:   2c 08 00 00     cmpwi   r8,0
 750:   20 bf 00 20     subfic  r5,r31,32
 754:   7d 4a fc 30     srw     r10,r10,r31
 758:   7c c5 28 30     slw     r5,r6,r5
 75c:   7c a9 53 78     or      r9,r5,r10
 760:   41 82 00 78     beq     7d8 <__c_kernel_clock_gettime+0x12c>
 764:   3c c0 c4 65     lis     r6,-15259
 768:   3c 60 3b 9a     lis     r3,15258
 76c:   60 c6 36 00     ori     r6,r6,13824
 770:   60 63 c9 ff     ori     r3,r3,51711
 774:   7c a9 30 14     addc    r5,r9,r6
 778:   7d 48 01 d4     addme   r10,r8
 77c:   2c 0a 00 00     cmpwi   r10,0
 780:   7d 48 53 78     mr      r8,r10
 784:   7c a9 2b 78     mr      r9,r5
 788:   38 e7 00 01     addi    r7,r7,1
 78c:   40 82 ff e8     bne     774 <__c_kernel_clock_gettime+0xc8>
 790:   7c 05 18 40     cmplw   r5,r3
 794:   41 81 ff e0     bgt     774 <__c_kernel_clock_gettime+0xc8>
 798:   7c 07 00 14     addc    r0,r7,r0
 79c:   90 04 00 00     stw     r0,0(r4)
 7a0:   91 24 00 04     stw     r9,4(r4)
 7a4:   38 60 00 00     li      r3,0
 7a8:   83 81 00 10     lwz     r28,16(r1)
 7ac:   83 a1 00 14     lwz     r29,20(r1)
 7b0:   83 c1 00 18     lwz     r30,24(r1)
 7b4:   83 e1 00 1c     lwz     r31,28(r1)
 7b8:   38 21 00 20     addi    r1,r1,32
 7bc:   4e 80 00 20     blr
 7c0:   4b ff ff 20     b       6e0 <__c_kernel_clock_gettime+0x34>
 7c4:   38 00 00 f6     li      r0,246
 7c8:   44 00 00 02     sc
 7cc:   40 a3 00 08     bns     7d4 <__c_kernel_clock_gettime+0x128>
 7d0:   7c 63 00 d0     neg     r3,r3
 7d4:   4e 80 00 20     blr
 7d8:   3d 40 3b 9a     lis     r10,15258
 7dc:   61 4a c9 ff     ori     r10,r10,51711
 7e0:   7c 09 50 40     cmplw   r9,r10
 7e4:   41 81 ff 80     bgt     764 <__c_kernel_clock_gettime+0xb8>
 7e8:   4b ff ff b4     b       79c <__c_kernel_clock_gettime+0xf0>
 7ec:   71 2a 00 60     andi.   r10,r9,96
 7f0:   40 82 00 14     bne     804 <__c_kernel_clock_gettime+0x158>
 7f4:   71 29 00 10     andi.   r9,r9,16
 7f8:   41 a2 ff cc     beq     7c4 <__c_kernel_clock_gettime+0x118>
 7fc:   38 a5 00 f0     addi    r5,r5,240
 800:   4b ff fe c4     b       6c4 <__c_kernel_clock_gettime+0x18>
 804:   54 69 20 36     rlwinm  r9,r3,4,0,27
 808:   7d 25 4a 14     add     r9,r5,r9
 80c:   81 05 00 00     lwz     r8,0(r5)
 810:   71 0a 00 01     andi.   r10,r8,1
 814:   40 82 00 28     bne     83c <__c_kernel_clock_gettime+0x190>
 818:   80 09 00 24     lwz     r0,36(r9)
 81c:   81 49 00 2c     lwz     r10,44(r9)
 820:   80 e5 00 00     lwz     r7,0(r5)
 824:   7c 08 38 40     cmplw   r8,r7
 828:   40 a2 ff e4     bne     80c <__c_kernel_clock_gettime+0x160>
 82c:   90 04 00 00     stw     r0,0(r4)
 830:   91 44 00 04     stw     r10,4(r4)
 834:   38 60 00 00     li      r3,0
 838:   4e 80 00 20     blr
 83c:   4b ff ff d0     b       80c <__c_kernel_clock_gettime+0x160>



+       return ((u64)hi << 32) | lo;
+}


What does the compiler do for just

static __always_inline u64 vdso_shift_ns(u64 ns, unsigned long shift)
        return ns >> (shift & 31);
}


Worse:

000006ac <__c_kernel_clock_gettime>:
 6ac:   28 03 00 0f     cmplwi  r3,15
 6b0:   41 81 01 30     bgt     7e0 <__c_kernel_clock_gettime+0x134>
 6b4:   39 20 00 01     li      r9,1
 6b8:   7d 29 18 30     slw     r9,r9,r3
 6bc:   71 2a 08 83     andi.   r10,r9,2179
 6c0:   41 82 01 48     beq     808 <__c_kernel_clock_gettime+0x15c>
 6c4:   94 21 ff e0     stwu    r1,-32(r1)
 6c8:   54 63 20 36     rlwinm  r3,r3,4,0,27
 6cc:   93 81 00 10     stw     r28,16(r1)
 6d0:   93 a1 00 14     stw     r29,20(r1)
 6d4:   93 c1 00 18     stw     r30,24(r1)
 6d8:   93 e1 00 1c     stw     r31,28(r1)
 6dc:   7c 65 1a 14     add     r3,r5,r3
 6e0:   80 c5 00 00     lwz     r6,0(r5)
 6e4:   70 c7 00 01     andi.   r7,r6,1
 6e8:   40 82 00 f4     bne     7dc <__c_kernel_clock_gettime+0x130>
 6ec:   7d 2d 42 e6     mftbu   r9
 6f0:   7d 0c 42 e6     mftb    r8
 6f4:   7d 4d 42 e6     mftbu   r10
 6f8:   7c 09 50 40     cmplw   r9,r10
 6fc:   40 82 ff f0     bne     6ec <__c_kernel_clock_gettime+0x40>
 700:   83 83 00 28     lwz     r28,40(r3)
 704:   83 c3 00 2c     lwz     r30,44(r3)
 708:   81 65 00 08     lwz     r11,8(r5)
 70c:   81 45 00 0c     lwz     r10,12(r5)
 710:   83 e5 00 18     lwz     r31,24(r5)
 714:   81 85 00 1c     lwz     r12,28(r5)
 718:   80 03 00 24     lwz     r0,36(r3)
 71c:   83 a5 00 00     lwz     r29,0(r5)
 720:   7c 06 e8 40     cmplw   r6,r29
 724:   40 a2 ff bc     bne     6e0 <__c_kernel_clock_gettime+0x34>
 728:   7d 0a 40 10     subfc   r8,r10,r8
 72c:   7c cb 49 10     subfe   r6,r11,r9
 730:   7c c6 f9 d6     mullw   r6,r6,r31
 734:   7d 28 f8 16     mulhwu  r9,r8,r31
 738:   7d 08 f9 d6     mullw   r8,r8,r31
 73c:   55 8c 06 fe     clrlwi  r12,r12,27
 740:   7f c8 f0 14     addc    r30,r8,r30
 744:   7c c6 4a 14     add     r6,r6,r9
 748:   7c c6 e1 14     adde    r6,r6,r28
 74c:   34 6c ff e0     addic.  r3,r12,-32
 750:   41 80 00 70     blt     7c0 <__c_kernel_clock_gettime+0x114>
 754:   7c c9 1c 30     srw     r9,r6,r3
 758:   39 00 00 00     li      r8,0
 75c:   2c 08 00 00     cmpwi   r8,0
 760:   41 82 00 94     beq     7f4 <__c_kernel_clock_gettime+0x148>
 764:   3c c0 c4 65     lis     r6,-15259
 768:   3c 60 3b 9a     lis     r3,15258
 76c:   60 c6 36 00     ori     r6,r6,13824
 770:   60 63 c9 ff     ori     r3,r3,51711
 774:   7c a9 30 14     addc    r5,r9,r6
 778:   7d 48 01 d4     addme   r10,r8
 77c:   2c 0a 00 00     cmpwi   r10,0
 780:   7d 48 53 78     mr      r8,r10
 784:   7c a9 2b 78     mr      r9,r5
 788:   38 e7 00 01     addi    r7,r7,1
 78c:   40 82 ff e8     bne     774 <__c_kernel_clock_gettime+0xc8>
 790:   7c 05 18 40     cmplw   r5,r3
 794:   41 81 ff e0     bgt     774 <__c_kernel_clock_gettime+0xc8>
 798:   7c 07 00 14     addc    r0,r7,r0
 79c:   90 04 00 00     stw     r0,0(r4)
 7a0:   91 24 00 04     stw     r9,4(r4)
 7a4:   38 60 00 00     li      r3,0
 7a8:   83 81 00 10     lwz     r28,16(r1)
 7ac:   83 a1 00 14     lwz     r29,20(r1)
 7b0:   83 c1 00 18     lwz     r30,24(r1)
 7b4:   83 e1 00 1c     lwz     r31,28(r1)
 7b8:   38 21 00 20     addi    r1,r1,32
 7bc:   4e 80 00 20     blr
 7c0:   54 c3 08 3c     rlwinm  r3,r6,1,0,30
 7c4:   21 6c 00 1f     subfic  r11,r12,31
 7c8:   7c 63 58 30     slw     r3,r3,r11
 7cc:   7f c9 64 30     srw     r9,r30,r12
 7d0:   7c 69 4b 78     or      r9,r3,r9
 7d4:   7c c8 64 30     srw     r8,r6,r12
 7d8:   4b ff ff 84     b       75c <__c_kernel_clock_gettime+0xb0>
 7dc:   4b ff ff 04     b       6e0 <__c_kernel_clock_gettime+0x34>
 7e0:   38 00 00 f6     li      r0,246
 7e4:   44 00 00 02     sc
 7e8:   40 a3 00 08     bns     7f0 <__c_kernel_clock_gettime+0x144>
 7ec:   7c 63 00 d0     neg     r3,r3
 7f0:   4e 80 00 20     blr
 7f4:   3d 40 3b 9a     lis     r10,15258
 7f8:   61 4a c9 ff     ori     r10,r10,51711
 7fc:   7c 09 50 40     cmplw   r9,r10
 800:   41 81 ff 64     bgt     764 <__c_kernel_clock_gettime+0xb8>
 804:   4b ff ff 98     b       79c <__c_kernel_clock_gettime+0xf0>
 808:   71 2a 00 60     andi.   r10,r9,96
 80c:   40 82 00 14     bne     820 <__c_kernel_clock_gettime+0x174>
 810:   71 29 00 10     andi.   r9,r9,16
 814:   41 a2 ff cc     beq     7e0 <__c_kernel_clock_gettime+0x134>
 818:   38 a5 00 f0     addi    r5,r5,240
 81c:   4b ff fe a8     b       6c4 <__c_kernel_clock_gettime+0x18>
 820:   54 69 20 36     rlwinm  r9,r3,4,0,27
 824:   7d 25 4a 14     add     r9,r5,r9
 828:   81 05 00 00     lwz     r8,0(r5)
 82c:   71 0a 00 01     andi.   r10,r8,1
 830:   40 82 00 28     bne     858 <__c_kernel_clock_gettime+0x1ac>
 834:   80 09 00 24     lwz     r0,36(r9)
 838:   81 49 00 2c     lwz     r10,44(r9)
 83c:   80 e5 00 00     lwz     r7,0(r5)
 840:   7c 08 38 40     cmplw   r8,r7
 844:   40 a2 ff e4     bne     828 <__c_kernel_clock_gettime+0x17c>
 848:   90 04 00 00     stw     r0,0(r4)
 84c:   91 44 00 04     stw     r10,4(r4)
 850:   38 60 00 00     li      r3,0
 854:   4e 80 00 20     blr
 858:   4b ff ff d0     b       828 <__c_kernel_clock_gettime+0x17c>

Christophe

Reply via email to