[PATCH v2 1/4] powerpc/code-patching: introduce patch_instructions()

Christophe Leroy christophe.leroy at csgroup.eu
Sat Mar 11 18:42:03 AEDT 2023



Le 10/03/2023 à 19:26, Christophe Leroy a écrit :
> 
> 
> Le 09/03/2023 à 19:02, Hari Bathini a écrit :
>> patch_instruction() entails setting up pte, patching the instruction,
>> clearing the pte and flushing the tlb. If multiple instructions need
>> to be patched, every instruction would have to go through the above
>> drill unnecessarily. Instead, introduce function patch_instructions()
>> that patches multiple instructions at one go while setting up the pte,
>> clearing the pte and flushing the tlb only once per page range of
>> instructions. Observed ~5X improvement in speed of execution using
>> patch_instructions() over patch_instruction(), when more instructions
>> are to be patched.
> 
> I get a 13% degradation on the time needed to activate ftrace on a 
> powerpc 8xx.
> 
> Before your patch, activating ftrace takes 550k timebase ticks. After 
> your patch it takes 620k timebase ticks.
> 

More details about the problem:

Before your patch, patch_instruction() is a simple, stackless function 
(Note that the first branch is nop'ed out after startup).

00000254 <patch_instruction>:
  254:	48 00 00 6c 	b       2c0 <patch_instruction+0x6c>
  258:	7c e0 00 a6 	mfmsr   r7
  25c:	7c 51 13 a6 	mtspr   81,r2
  260:	3d 40 00 00 	lis     r10,0
			262: R_PPC_ADDR16_HA	.data
  264:	39 4a 00 00 	addi    r10,r10,0
			266: R_PPC_ADDR16_LO	.data
  268:	7c 69 1b 78 	mr      r9,r3
  26c:	3d 29 40 00 	addis   r9,r9,16384
  270:	81 0a 00 08 	lwz     r8,8(r10)
  274:	55 29 00 26 	rlwinm  r9,r9,0,0,19
  278:	81 4a 00 04 	lwz     r10,4(r10)
  27c:	61 29 01 25 	ori     r9,r9,293
  280:	91 28 00 00 	stw     r9,0(r8)
  284:	55 49 00 26 	rlwinm  r9,r10,0,0,19
  288:	50 6a 05 3e 	rlwimi  r10,r3,0,20,31
  28c:	90 8a 00 00 	stw     r4,0(r10)
  290:	7c 00 50 6c 	dcbst   0,r10
  294:	7c 00 04 ac 	hwsync
  298:	7c 00 1f ac 	icbi    0,r3
  29c:	7c 00 04 ac 	hwsync
  2a0:	4c 00 01 2c 	isync
  2a4:	38 60 00 00 	li      r3,0
  2a8:	39 40 00 00 	li      r10,0
  2ac:	91 48 00 00 	stw     r10,0(r8)
  2b0:	7c 00 4a 64 	tlbie   r9,r0
  2b4:	7c 00 04 ac 	hwsync
  2b8:	7c e0 01 24 	mtmsr   r7
  2bc:	4e 80 00 20 	blr

  2c0:	90 83 00 00 	stw     r4,0(r3)
  2c4:	7c 00 18 6c 	dcbst   0,r3
  2c8:	7c 00 04 ac 	hwsync
  2cc:	7c 00 1f ac 	icbi    0,r3
  2d0:	7c 00 04 ac 	hwsync
  2d4:	4c 00 01 2c 	isync
  2d8:	38 60 00 00 	li      r3,0
  2dc:	4e 80 00 20 	blr
  2e0:	38 60 ff ff 	li      r3,-1
  2e4:	4b ff ff c4 	b       2a8 <patch_instruction+0x54>
  2e8:	38 60 ff ff 	li      r3,-1
  2ec:	4e 80 00 20 	blr


Once your patch is there, patch_instruction() becomes a function that 
has to set up a stack frame in order to call __do_patch_instructions().
And __do_patch_instructions() is quite a big function.

0000022c <__do_patch_instructions>:
  22c:	3d 20 00 00 	lis     r9,0
			22e: R_PPC_ADDR16_HA	.data
  230:	39 29 00 00 	addi    r9,r9,0
			232: R_PPC_ADDR16_LO	.data
  234:	81 69 00 04 	lwz     r11,4(r9)
  238:	2c 05 00 00 	cmpwi   r5,0
  23c:	81 89 00 08 	lwz     r12,8(r9)
  240:	7c c3 32 14 	add     r6,r3,r6
  244:	55 6b 00 26 	rlwinm  r11,r11,0,0,19
  248:	38 00 00 00 	li      r0,0
  24c:	54 6a 05 3e 	clrlwi  r10,r3,20
  250:	21 0a 10 00 	subfic  r8,r10,4096
  254:	7d 03 42 14 	add     r8,r3,r8
  258:	7c 69 1b 78 	mr      r9,r3
  25c:	7f 88 30 40 	cmplw   cr7,r8,r6
  260:	3d 29 40 00 	addis   r9,r9,16384
  264:	55 29 00 26 	rlwinm  r9,r9,0,0,19
  268:	61 29 01 25 	ori     r9,r9,293
  26c:	91 2c 00 00 	stw     r9,0(r12)
  270:	7d 4a 5b 78 	or      r10,r10,r11
  274:	40 9d 00 08 	ble     cr7,27c <__do_patch_instructions+0x50>
  278:	7c c8 33 78 	mr      r8,r6
  27c:	7f 83 40 40 	cmplw   cr7,r3,r8
  280:	40 9c 01 2c 	bge     cr7,3ac <__do_patch_instructions+0x180>
  284:	7c 69 18 f8 	not     r9,r3
  288:	7d 28 4a 14 	add     r9,r8,r9
  28c:	55 29 f7 fe 	rlwinm  r9,r9,30,31,31
  290:	7c e3 50 50 	subf    r7,r3,r10
  294:	80 a4 00 00 	lwz     r5,0(r4)
  298:	90 aa 00 00 	stw     r5,0(r10)
  29c:	7c 00 50 6c 	dcbst   0,r10
  2a0:	7c 00 04 ac 	hwsync
  2a4:	7c 00 1f ac 	icbi    0,r3
  2a8:	7c 00 04 ac 	hwsync
  2ac:	4c 00 01 2c 	isync
  2b0:	38 63 00 04 	addi    r3,r3,4
  2b4:	40 82 00 08 	bne     2bc <__do_patch_instructions+0x90>
  2b8:	38 84 00 04 	addi    r4,r4,4
  2bc:	7f 83 40 40 	cmplw   cr7,r3,r8
  2c0:	40 9c 00 a4 	bge     cr7,364 <__do_patch_instructions+0x138>
  2c4:	2f 89 00 00 	cmpwi   cr7,r9,0
  2c8:	41 9e 00 38 	beq     cr7,300 <__do_patch_instructions+0xd4>
  2cc:	7d 23 3a 14 	add     r9,r3,r7
  2d0:	81 44 00 00 	lwz     r10,0(r4)
  2d4:	91 49 00 00 	stw     r10,0(r9)
  2d8:	7c 00 48 6c 	dcbst   0,r9
  2dc:	7c 00 04 ac 	hwsync
  2e0:	7c 00 1f ac 	icbi    0,r3
  2e4:	7c 00 04 ac 	hwsync
  2e8:	4c 00 01 2c 	isync
  2ec:	38 63 00 04 	addi    r3,r3,4
  2f0:	40 82 00 08 	bne     2f8 <__do_patch_instructions+0xcc>
  2f4:	38 84 00 04 	addi    r4,r4,4
  2f8:	7f 83 40 40 	cmplw   cr7,r3,r8
  2fc:	40 9c 00 68 	bge     cr7,364 <__do_patch_instructions+0x138>
  300:	7d 23 3a 14 	add     r9,r3,r7
  304:	81 44 00 00 	lwz     r10,0(r4)
  308:	91 49 00 00 	stw     r10,0(r9)
  30c:	7c 00 48 6c 	dcbst   0,r9
  310:	7c 00 04 ac 	hwsync
  314:	7c 00 1f ac 	icbi    0,r3
  318:	7c 00 04 ac 	hwsync
  31c:	4c 00 01 2c 	isync
  320:	38 63 00 04 	addi    r3,r3,4
  324:	7c 69 1b 78 	mr      r9,r3
  328:	40 82 00 08 	bne     330 <__do_patch_instructions+0x104>
  32c:	38 84 00 04 	addi    r4,r4,4
  330:	7d 49 3a 14 	add     r10,r9,r7
  334:	80 a4 00 00 	lwz     r5,0(r4)
  338:	90 aa 00 00 	stw     r5,0(r10)
  33c:	7c 00 50 6c 	dcbst   0,r10
  340:	7c 00 04 ac 	hwsync
  344:	7c 00 4f ac 	icbi    0,r9
  348:	7c 00 04 ac 	hwsync
  34c:	4c 00 01 2c 	isync
  350:	38 69 00 04 	addi    r3,r9,4
  354:	7f 83 40 40 	cmplw   cr7,r3,r8
  358:	40 82 00 08 	bne     360 <__do_patch_instructions+0x134>
  35c:	38 84 00 04 	addi    r4,r4,4
  360:	41 9c ff a0 	blt     cr7,300 <__do_patch_instructions+0xd4>
  364:	90 0c 00 00 	stw     r0,0(r12)
  368:	39 20 00 00 	li      r9,0
  36c:	7c 00 5a 64 	tlbie   r11,r0
  370:	7c 00 04 ac 	hwsync
  374:	2f 89 00 00 	cmpwi   cr7,r9,0
  378:	40 9e 00 2c 	bne     cr7,3a4 <__do_patch_instructions+0x178>
  37c:	7f 86 18 40 	cmplw   cr7,r6,r3
  380:	41 9d fe cc 	bgt     cr7,24c <__do_patch_instructions+0x20>
  384:	38 60 00 00 	li      r3,0
  388:	4e 80 00 20 	blr
  38c:	90 0c 00 00 	stw     r0,0(r12)
  390:	39 20 ff ff 	li      r9,-1
  394:	7c 00 5a 64 	tlbie   r11,r0
  398:	7c 00 04 ac 	hwsync
  39c:	2f 89 00 00 	cmpwi   cr7,r9,0
  3a0:	41 9e ff dc 	beq     cr7,37c <__do_patch_instructions+0x150>
  3a4:	38 60 ff ff 	li      r3,-1
  3a8:	4e 80 00 20 	blr
  3ac:	39 20 00 00 	li      r9,0
  3b0:	91 2c 00 00 	stw     r9,0(r12)
  3b4:	7c 00 5a 64 	tlbie   r11,r0
  3b8:	7c 00 04 ac 	hwsync
  3bc:	4b ff ff c0 	b       37c <__do_patch_instructions+0x150>

000003e8 <patch_instruction>:
  3e8:	94 21 ff e0 	stwu    r1,-32(r1)
  3ec:	90 81 00 08 	stw     r4,8(r1)
  3f0:	48 00 00 40 	b       430 <patch_instruction+0x48>
  3f4:	7c 08 02 a6 	mflr    r0
  3f8:	90 01 00 24 	stw     r0,36(r1)
  3fc:	93 e1 00 1c 	stw     r31,28(r1)
  400:	7f e0 00 a6 	mfmsr   r31
  404:	7c 51 13 a6 	mtspr   81,r2
  408:	38 c0 00 04 	li      r6,4
  40c:	38 81 00 08 	addi    r4,r1,8
  410:	38 a0 00 00 	li      r5,0
  414:	4b ff fe 19 	bl      22c <__do_patch_instructions>
  418:	7f e0 01 24 	mtmsr   r31
  41c:	80 01 00 24 	lwz     r0,36(r1)
  420:	83 e1 00 1c 	lwz     r31,28(r1)
  424:	7c 08 03 a6 	mtlr    r0
  428:	38 21 00 20 	addi    r1,r1,32
  42c:	4e 80 00 20 	blr

  430:	81 21 00 08 	lwz     r9,8(r1)
  434:	91 23 00 00 	stw     r9,0(r3)
  438:	7c 00 18 6c 	dcbst   0,r3
  43c:	7c 00 04 ac 	hwsync
  440:	7c 00 1f ac 	icbi    0,r3
  444:	7c 00 04 ac 	hwsync
  448:	4c 00 01 2c 	isync
  44c:	38 60 00 00 	li      r3,0
  450:	4b ff ff d8 	b       428 <patch_instruction+0x40>
  454:	38 60 ff ff 	li      r3,-1
  458:	4b ff ff d0 	b       428 <patch_instruction+0x40>

Christophe


More information about the Linuxppc-dev mailing list