Le 10/03/2023 à 19:26, Christophe Leroy a écrit :
> 
> 
> Le 09/03/2023 à 19:02, Hari Bathini a écrit :
>> patch_instruction() entails setting up pte, patching the instruction,
>> clearing the pte and flushing the tlb. If multiple instructions need
>> to be patched, every instruction would have to go through the above
>> drill unnecessarily. Instead, introduce function patch_instructions()
>> that patches multiple instructions at one go while setting up the pte,
>> clearing the pte and flushing the tlb only once per page range of
>> instructions. Observed ~5X improvement in speed of execution using
>> patch_instructions() over patch_instruction(), when more instructions
>> are to be patched.
> 
> I get a 13% degradation in the time needed to activate ftrace on a 
> powerpc 8xx.
> 
> Before your patch, activating ftrace takes 550k timebase ticks. After 
> your patch it takes 620k timebase ticks.
> 

More details about the problem:

Before your patch, patch_instruction() is a simple, stackless function 
(note that the first branch is nop'ed out after startup).

00000254 <patch_instruction>:
  254:  48 00 00 6c     b       2c0 <patch_instruction+0x6c>
  258:  7c e0 00 a6     mfmsr   r7
  25c:  7c 51 13 a6     mtspr   81,r2
  260:  3d 40 00 00     lis     r10,0
                        262: R_PPC_ADDR16_HA    .data
  264:  39 4a 00 00     addi    r10,r10,0
                        266: R_PPC_ADDR16_LO    .data
  268:  7c 69 1b 78     mr      r9,r3
  26c:  3d 29 40 00     addis   r9,r9,16384
  270:  81 0a 00 08     lwz     r8,8(r10)
  274:  55 29 00 26     rlwinm  r9,r9,0,0,19
  278:  81 4a 00 04     lwz     r10,4(r10)
  27c:  61 29 01 25     ori     r9,r9,293
  280:  91 28 00 00     stw     r9,0(r8)
  284:  55 49 00 26     rlwinm  r9,r10,0,0,19
  288:  50 6a 05 3e     rlwimi  r10,r3,0,20,31
  28c:  90 8a 00 00     stw     r4,0(r10)
  290:  7c 00 50 6c     dcbst   0,r10
  294:  7c 00 04 ac     hwsync
  298:  7c 00 1f ac     icbi    0,r3
  29c:  7c 00 04 ac     hwsync
  2a0:  4c 00 01 2c     isync
  2a4:  38 60 00 00     li      r3,0
  2a8:  39 40 00 00     li      r10,0
  2ac:  91 48 00 00     stw     r10,0(r8)
  2b0:  7c 00 4a 64     tlbie   r9,r0
  2b4:  7c 00 04 ac     hwsync
  2b8:  7c e0 01 24     mtmsr   r7
  2bc:  4e 80 00 20     blr

  2c0:  90 83 00 00     stw     r4,0(r3)
  2c4:  7c 00 18 6c     dcbst   0,r3
  2c8:  7c 00 04 ac     hwsync
  2cc:  7c 00 1f ac     icbi    0,r3
  2d0:  7c 00 04 ac     hwsync
  2d4:  4c 00 01 2c     isync
  2d8:  38 60 00 00     li      r3,0
  2dc:  4e 80 00 20     blr
  2e0:  38 60 ff ff     li      r3,-1
  2e4:  4b ff ff c4     b       2a8 <patch_instruction+0x54>
  2e8:  38 60 ff ff     li      r3,-1
  2ec:  4e 80 00 20     blr


Once your patch is applied, patch_instruction() becomes a function that 
has to set up a stack frame in order to call __do_patch_instructions(),
and __do_patch_instructions() is quite a big function.

0000022c <__do_patch_instructions>:
  22c:  3d 20 00 00     lis     r9,0
                        22e: R_PPC_ADDR16_HA    .data
  230:  39 29 00 00     addi    r9,r9,0
                        232: R_PPC_ADDR16_LO    .data
  234:  81 69 00 04     lwz     r11,4(r9)
  238:  2c 05 00 00     cmpwi   r5,0
  23c:  81 89 00 08     lwz     r12,8(r9)
  240:  7c c3 32 14     add     r6,r3,r6
  244:  55 6b 00 26     rlwinm  r11,r11,0,0,19
  248:  38 00 00 00     li      r0,0
  24c:  54 6a 05 3e     clrlwi  r10,r3,20
  250:  21 0a 10 00     subfic  r8,r10,4096
  254:  7d 03 42 14     add     r8,r3,r8
  258:  7c 69 1b 78     mr      r9,r3
  25c:  7f 88 30 40     cmplw   cr7,r8,r6
  260:  3d 29 40 00     addis   r9,r9,16384
  264:  55 29 00 26     rlwinm  r9,r9,0,0,19
  268:  61 29 01 25     ori     r9,r9,293
  26c:  91 2c 00 00     stw     r9,0(r12)
  270:  7d 4a 5b 78     or      r10,r10,r11
  274:  40 9d 00 08     ble     cr7,27c <__do_patch_instructions+0x50>
  278:  7c c8 33 78     mr      r8,r6
  27c:  7f 83 40 40     cmplw   cr7,r3,r8
  280:  40 9c 01 2c     bge     cr7,3ac <__do_patch_instructions+0x180>
  284:  7c 69 18 f8     not     r9,r3
  288:  7d 28 4a 14     add     r9,r8,r9
  28c:  55 29 f7 fe     rlwinm  r9,r9,30,31,31
  290:  7c e3 50 50     subf    r7,r3,r10
  294:  80 a4 00 00     lwz     r5,0(r4)
  298:  90 aa 00 00     stw     r5,0(r10)
  29c:  7c 00 50 6c     dcbst   0,r10
  2a0:  7c 00 04 ac     hwsync
  2a4:  7c 00 1f ac     icbi    0,r3
  2a8:  7c 00 04 ac     hwsync
  2ac:  4c 00 01 2c     isync
  2b0:  38 63 00 04     addi    r3,r3,4
  2b4:  40 82 00 08     bne     2bc <__do_patch_instructions+0x90>
  2b8:  38 84 00 04     addi    r4,r4,4
  2bc:  7f 83 40 40     cmplw   cr7,r3,r8
  2c0:  40 9c 00 a4     bge     cr7,364 <__do_patch_instructions+0x138>
  2c4:  2f 89 00 00     cmpwi   cr7,r9,0
  2c8:  41 9e 00 38     beq     cr7,300 <__do_patch_instructions+0xd4>
  2cc:  7d 23 3a 14     add     r9,r3,r7
  2d0:  81 44 00 00     lwz     r10,0(r4)
  2d4:  91 49 00 00     stw     r10,0(r9)
  2d8:  7c 00 48 6c     dcbst   0,r9
  2dc:  7c 00 04 ac     hwsync
  2e0:  7c 00 1f ac     icbi    0,r3
  2e4:  7c 00 04 ac     hwsync
  2e8:  4c 00 01 2c     isync
  2ec:  38 63 00 04     addi    r3,r3,4
  2f0:  40 82 00 08     bne     2f8 <__do_patch_instructions+0xcc>
  2f4:  38 84 00 04     addi    r4,r4,4
  2f8:  7f 83 40 40     cmplw   cr7,r3,r8
  2fc:  40 9c 00 68     bge     cr7,364 <__do_patch_instructions+0x138>
  300:  7d 23 3a 14     add     r9,r3,r7
  304:  81 44 00 00     lwz     r10,0(r4)
  308:  91 49 00 00     stw     r10,0(r9)
  30c:  7c 00 48 6c     dcbst   0,r9
  310:  7c 00 04 ac     hwsync
  314:  7c 00 1f ac     icbi    0,r3
  318:  7c 00 04 ac     hwsync
  31c:  4c 00 01 2c     isync
  320:  38 63 00 04     addi    r3,r3,4
  324:  7c 69 1b 78     mr      r9,r3
  328:  40 82 00 08     bne     330 <__do_patch_instructions+0x104>
  32c:  38 84 00 04     addi    r4,r4,4
  330:  7d 49 3a 14     add     r10,r9,r7
  334:  80 a4 00 00     lwz     r5,0(r4)
  338:  90 aa 00 00     stw     r5,0(r10)
  33c:  7c 00 50 6c     dcbst   0,r10
  340:  7c 00 04 ac     hwsync
  344:  7c 00 4f ac     icbi    0,r9
  348:  7c 00 04 ac     hwsync
  34c:  4c 00 01 2c     isync
  350:  38 69 00 04     addi    r3,r9,4
  354:  7f 83 40 40     cmplw   cr7,r3,r8
  358:  40 82 00 08     bne     360 <__do_patch_instructions+0x134>
  35c:  38 84 00 04     addi    r4,r4,4
  360:  41 9c ff a0     blt     cr7,300 <__do_patch_instructions+0xd4>
  364:  90 0c 00 00     stw     r0,0(r12)
  368:  39 20 00 00     li      r9,0
  36c:  7c 00 5a 64     tlbie   r11,r0
  370:  7c 00 04 ac     hwsync
  374:  2f 89 00 00     cmpwi   cr7,r9,0
  378:  40 9e 00 2c     bne     cr7,3a4 <__do_patch_instructions+0x178>
  37c:  7f 86 18 40     cmplw   cr7,r6,r3
  380:  41 9d fe cc     bgt     cr7,24c <__do_patch_instructions+0x20>
  384:  38 60 00 00     li      r3,0
  388:  4e 80 00 20     blr
  38c:  90 0c 00 00     stw     r0,0(r12)
  390:  39 20 ff ff     li      r9,-1
  394:  7c 00 5a 64     tlbie   r11,r0
  398:  7c 00 04 ac     hwsync
  39c:  2f 89 00 00     cmpwi   cr7,r9,0
  3a0:  41 9e ff dc     beq     cr7,37c <__do_patch_instructions+0x150>
  3a4:  38 60 ff ff     li      r3,-1
  3a8:  4e 80 00 20     blr
  3ac:  39 20 00 00     li      r9,0
  3b0:  91 2c 00 00     stw     r9,0(r12)
  3b4:  7c 00 5a 64     tlbie   r11,r0
  3b8:  7c 00 04 ac     hwsync
  3bc:  4b ff ff c0     b       37c <__do_patch_instructions+0x150>

000003e8 <patch_instruction>:
  3e8:  94 21 ff e0     stwu    r1,-32(r1)
  3ec:  90 81 00 08     stw     r4,8(r1)
  3f0:  48 00 00 40     b       430 <patch_instruction+0x48>
  3f4:  7c 08 02 a6     mflr    r0
  3f8:  90 01 00 24     stw     r0,36(r1)
  3fc:  93 e1 00 1c     stw     r31,28(r1)
  400:  7f e0 00 a6     mfmsr   r31
  404:  7c 51 13 a6     mtspr   81,r2
  408:  38 c0 00 04     li      r6,4
  40c:  38 81 00 08     addi    r4,r1,8
  410:  38 a0 00 00     li      r5,0
  414:  4b ff fe 19     bl      22c <__do_patch_instructions>
  418:  7f e0 01 24     mtmsr   r31
  41c:  80 01 00 24     lwz     r0,36(r1)
  420:  83 e1 00 1c     lwz     r31,28(r1)
  424:  7c 08 03 a6     mtlr    r0
  428:  38 21 00 20     addi    r1,r1,32
  42c:  4e 80 00 20     blr

  430:  81 21 00 08     lwz     r9,8(r1)
  434:  91 23 00 00     stw     r9,0(r3)
  438:  7c 00 18 6c     dcbst   0,r3
  43c:  7c 00 04 ac     hwsync
  440:  7c 00 1f ac     icbi    0,r3
  444:  7c 00 04 ac     hwsync
  448:  4c 00 01 2c     isync
  44c:  38 60 00 00     li      r3,0
  450:  4b ff ff d8     b       428 <patch_instruction+0x40>
  454:  38 60 ff ff     li      r3,-1
  458:  4b ff ff d0     b       428 <patch_instruction+0x40>

Christophe

Reply via email to