[PATCH] net: Remove branch in csum_shift()

Tue Mar 1 22:14:46 AEDT 2022

Le 01/03/2022 à 11:47, David Laight a écrit :
> From: Christophe Leroy
>> Sent: 01 March 2022 10:20
>>
>> Le 13/02/2022 à 18:47, David Laight a écrit :
>>> From: Segher Boessenkool
>>>> Sent: 13 February 2022 09:16
>>> ....
>>>>
>>>>> What happens on x86-64?
>>>>>
>>>>> Trying to do the same in the x86 ipcsum code tended to make the code worse.
>>>>> (Although that test is for an odd length fragment and can just be removed.)
>>>>
>>>> In an ideal world the compiler could choose the optimal code sequences
>>>> everywhere.  But that won't ever happen, the search space is way too
>>>> big.  So compilers just use heuristics, not exhaustive search like
>>>> superopt does.  There is a middle way of course, something with directed
>>>> searches, and maybe in a few decades systems will be fast enough.  Until
>>>> then we will very often see code that is 10% slower and 30% bigger than
>>>> necessary.  A single insn more than needed isn't so bad :-)
>>>
>>> But it can be a lot more than that.
>>>
>>>> Making things branch-free is very much worth it here though!
>>>
>>> I tried to find out where 'here' is.
>>>
>>> I can't get godbolt to generate anything like that object code
>>> for a call to csum_shift().
>>>
>>> I can't actually get it to issue a rotate (x86 of ppc).
>>>
>>> I think it is only a single instruction because the compiler
>>> has saved 'offset & 1' much earlier instead of doing testing
>>> 'offset & 1' just prior to the conditional.
>>> It certainly has a nasty habit of doing that pessimisation.
>>>
>>> So while it helps a specific call site it may be much
>>> worse in general.
>>>
>>
>> The main user of csum_shift() is csum_and_copy_to_iter().
>>
>> You clearly see the difference in one of the instances below extracted
>> from output of objdump -S lib/iov_iter.o:
>>
>>
>> Without the patch:
>>
>> 	sum = csum_shift(csstate->csum, csstate->off);
>>       21a8:	92 e1 00 4c 	stw     r23,76(r1)
>>       21ac:	7c 77 1b 78 	mr      r23,r3
>>       21b0:	93 01 00 50 	stw     r24,80(r1)
>>       21b4:	7c b8 2b 78 	mr      r24,r5
>>       21b8:	93 61 00 5c 	stw     r27,92(r1)
>>       21bc:	7c db 33 78 	mr      r27,r6
>>       21c0:	93 81 00 60 	stw     r28,96(r1)
>>       21c4:	81 05 00 04 	lwz     r8,4(r5)
>>       21c8:	83 85 00 00 	lwz     r28,0(r5)
>> }
>>
>> static __always_inline __wsum csum_shift(__wsum sum, int offset)
>> {
>> 	/* rotate sum to align it with a 16b boundary */
>> 	if (offset & 1)
>>       21cc:	71 09 00 01 	andi.   r9,r8,1		<== test oddity
>>       21d0:	41 a2 00 08 	beq     21d8		<== branch
>>    * @word: value to rotate
>>    * @shift: bits to roll
>>    */
>> static inline __u32 ror32(__u32 word, unsigned int shift)
>> {
>> 	return (word >> (shift & 31)) | (word << ((-shift) & 31));
>>       21d4:	57 9c c0 3e 	rotlwi  r28,r28,24	<== rotate
>>       21d8:	2b 8a 00 03 	cmplwi  cr7,r10,3
>>       21dc:	41 9e 01 ec 	beq     cr7,23c8 <csum_and_copy_to_iter+0x234>
>>
>>
>>
>>
>> With the patch:
>>
>> 	sum = csum_shift(csstate->csum, csstate->off);
>>       21a8:	92 c1 00 48 	stw     r22,72(r1)
>> 	if (unlikely(iov_iter_is_pipe(i)))
>>       21ac:	28 08 00 03 	cmplwi  r8,3
>>       21b0:	92 e1 00 4c 	stw     r23,76(r1)
>>       21b4:	7c 76 1b 78 	mr      r22,r3
>>       21b8:	93 41 00 58 	stw     r26,88(r1)
>>       21bc:	7c b7 2b 78 	mr      r23,r5
>>       21c0:	93 81 00 60 	stw     r28,96(r1)
>>       21c4:	7c da 33 78 	mr      r26,r6
>> 	sum = csum_shift(csstate->csum, csstate->off);
>>       21c8:	80 e5 00 04 	lwz     r7,4(r5)
>>    * @word: value to rotate
>>    * @shift: bits to roll
>>    */
>> static inline __u32 rol32(__u32 word, unsigned int shift)
>> {
>> 	return (word << (shift & 31)) | (word >> ((-shift) & 31));
>>       21cc:	81 25 00 00 	lwz     r9,0(r5)
>> }
>>
>> static __always_inline __wsum csum_shift(__wsum sum, int offset)
>> {
>> 	/* rotate sum to align it with a 16b boundary */
>> 	return (__force __wsum)rol32((__force u32)sum, (offset & 1) << 3);
>>       21d0:	54 ea 1f 38 	rlwinm  r10,r7,3,28,28
> 
> Right, this all depends on the rlwinm instruction.
> I had to look it up, that one shifts r7 (count) left 3 bits
> and then masks it with all the bits from 28 to 28 (in some counting scheme).

Yes it does (offset << 3) & 8 which is exactly the same as (offset & 1) << 3

> 
> Try the same code on x86.
> The mask and shift have to be separate instructions and it probably
> needs a register move (which might be a rename and free).

I see

> Whereas the current code generates a conditional move.
> (At least, the only rotates in that function are followed by a cmovne.)

ppc32 doesn't have that.
ppc64 have isel, but gcc doesn't seem to use it either here.

> 
> So x86 is almost certainly better with the current code.
> No idea about arm or anything else people might care about.

Looks like ARM also does better code with the generic implementation as 
it seems to have some looking like conditional instructions 'rorne' and 
'strne'.

static __always_inline __wsum csum_shift(__wsum sum, int offset)
{
	/* rotate sum to align it with a 16b boundary */
	if (offset & 1)
     1d28:	e2102001 	ands	r2, r0, #1
     1d2c:	e58d3004 	str	r3, [sp, #4]
  * @word: value to rotate
  * @shift: bits to roll
  */
static inline __u32 ror32(__u32 word, unsigned int shift)
{
	return (word >> (shift & 31)) | (word << ((-shift) & 31));
     1d30:	11a03463 	rorne	r3, r3, #8
     1d34:	158d3004 	strne	r3, [sp, #4]
	if (unlikely(iov_iter_is_pipe(i)))

Whereas with my patch we get

static inline __u32 rol32(__u32 word, unsigned int shift)
{
	return (word << (shift & 31)) | (word >> ((-shift) & 31));
     1984:	e8930006 	ldm	r3, {r1, r2}
}

static __always_inline __wsum csum_shift(__wsum sum, int offset)
{
	/* rotate sum to align it with a 16b boundary */
	return (__force __wsum)rol32((__force u32)sum, (offset & 1) << 3);
     1988:	e1a03182 	lsl	r3, r2, #3
     198c:	e2033008 	and	r3, r3, #8
     1990:	e2633020 	rsb	r3, r3, #32
     1994:	e1a01371 	ror	r1, r1, r3
     1998:	e58d1008 	str	r1, [sp, #8]

We get slightly better with a ror32 instead of rol32:

	return (word >> (shift & 31)) | (word << ((-shift) & 31));
     1984:	e8930006 	ldm	r3, {r1, r2}
}

static __always_inline __wsum csum_shift(__wsum sum, int offset)
{
	/* rotate sum to align it with a 16b boundary */
	return (__force __wsum)ror32((__force u32)sum, (offset & 1) << 3);
     1988:	e1a03182 	lsl	r3, r2, #3
     198c:	e2033008 	and	r3, r3, #8
     1990:	e1a01371 	ror	r1, r1, r3
     1994:	e58d1008 	str	r1, [sp, #8]

> 
> All the world isn't ppc.

Ok, so the solution would be to have an arch specific version of 
csum_shift() in the same principle as csum_add().

Christophe