[PATCH] crypto: vmx: Improve AES/XTS performance with 6-way unrolling for ppc.
Danny Tsen
dtsen at linux.ibm.com
Wed Aug 30 23:50:47 AEST 2023
Hi Michael,
I just submitted the v2 patch.
Thanks.
-Danny
On 8/29/23 11:37 PM, Michael Ellerman wrote:
> Danny Tsen <dtsen at linux.ibm.com> writes:
>> Improve AES/XTS performance of the 6-way unrolled code path for
>> PowerPC by up to 17%, as measured with tcrypt. This is done by using
>> a single instruction, vpermxor, to replace the vxor and vsldoi pair
>> in the tweak computation.
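>>
>> For reference, the operation being computed here is the XTS tweak
>> update, i.e. multiplication by x in GF(2^128). A rough C sketch
>> (purely illustrative, not part of this patch; the function name is
>> made up for the example):
>>
>>   /* Multiply the 16-byte little-endian XTS tweak by x in GF(2^128).
>>    * This is what the vsrab/vand/vaddubm sequence, followed by either
>>    * vsldoi+vxor (old) or vpermxor (new), computes per block. */
>>   static void xts_mul_x(unsigned char t[16])
>>   {
>>           int carry = t[15] >> 7;         /* bit shifted out of the top */
>>           for (int i = 15; i > 0; i--)
>>                   t[i] = (unsigned char)((t[i] << 1) | (t[i - 1] >> 7));
>>           t[0] = (unsigned char)(t[0] << 1);
>>           if (carry)
>>                   t[0] ^= 0x87;           /* XTS reduction polynomial */
>>   }
>>
>> vpermxor folds the trailing vsldoi (the one-byte rotate of the carry
>> mask) and the vxor into a single instruction: for each byte of the
>> new Lconsts mask, the high nibble selects a byte of the doubled
>> tweak, the low nibble selects a byte of the carry mask, and the two
>> are xor'ed together.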
>>
>> This patch has been tested with the kernel crypto module tcrypt.ko
>> and passes the selftests. It was also tested with
>> CONFIG_CRYPTO_MANAGER_EXTRA_TESTS enabled.
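>>
>> (For reproducing the speed numbers: tcrypt's AES speed tests,
>> including xts(aes), can be run with something like
>> "modprobe tcrypt mode=200 sec=1"; exact mode numbers may vary
>> between kernel versions.)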
>>
>> Signed-off-by: Danny Tsen <dtsen at linux.ibm.com>
>> ---
>> drivers/crypto/vmx/aesp8-ppc.pl | 141 +++++++++++++++++++++-----------
>> 1 file changed, 92 insertions(+), 49 deletions(-)
> That's CRYPTOGAMS code, and is so far largely unchanged from the
> original. I see you've sent the same change to OpenSSL, but it's not
> merged yet. Please document that in the change log; we want to keep
> the code in sync as much as possible, and document any divergences.
>
> cheers
>
>> diff --git a/drivers/crypto/vmx/aesp8-ppc.pl b/drivers/crypto/vmx/aesp8-ppc.pl
>> index 50a0a18f35da..f729589d792e 100644
>> --- a/drivers/crypto/vmx/aesp8-ppc.pl
>> +++ b/drivers/crypto/vmx/aesp8-ppc.pl
>> @@ -132,11 +132,12 @@ rcon:
>> .long 0x1b000000, 0x1b000000, 0x1b000000, 0x1b000000 ?rev
>> .long 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c ?rev
>> .long 0,0,0,0 ?asis
>> +.long 0x0f102132, 0x43546576, 0x8798a9ba, 0xcbdcedfe
>> Lconsts:
>> mflr r0
>> bcl 20,31,\$+4
>> mflr $ptr #vvvvv "distance between . and rcon
>> - addi $ptr,$ptr,-0x48
>> + addi $ptr,$ptr,-0x58
>> mtlr r0
>> blr
>> .long 0
>> @@ -2495,6 +2496,17 @@ _aesp8_xts_encrypt6x:
>> li $x70,0x70
>> mtspr 256,r0
>>
>> + xxlor 2, 32+$eighty7, 32+$eighty7
>> + vsldoi $eighty7,$tmp,$eighty7,1 # 0x010101..87
>> + xxlor 1, 32+$eighty7, 32+$eighty7
>> +
>> + # Load XOR Lconsts.
>> + mr $x70, r6
>> + bl Lconsts
>> + lxvw4x 0, $x40, r6 # load XOR contents
>> + mr r6, $x70
>> + li $x70,0x70
>> +
>> subi $rounds,$rounds,3 # -4 in total
>>
>> lvx $rndkey0,$x00,$key1 # load key schedule
>> @@ -2537,69 +2549,77 @@ Load_xts_enc_key:
>> ?vperm v31,v31,$twk5,$keyperm
>> lvx v25,$x10,$key_ # pre-load round[2]
>>
>> + # Switch to the following code, using 0x010101..87, to generate the tweak.
>> + # eighty7 = 0x010101..87
>> + # vsrab tmp, tweak, seven # next tweak value, right shift 7 bits
>> + # vand tmp, tmp, eighty7 # last byte with carry
>> + # vaddubm tweak, tweak, tweak # left shift 1 bit (x2)
>> + # xxlor vsx, 0, 0
>> + # vpermxor tweak, tweak, tmp, vsx
>> +
>> vperm $in0,$inout,$inptail,$inpperm
>> subi $inp,$inp,31 # undo "caller"
>> vxor $twk0,$tweak,$rndkey0
>> vsrab $tmp,$tweak,$seven # next tweak value
>> vaddubm $tweak,$tweak,$tweak
>> - vsldoi $tmp,$tmp,$tmp,15
>> vand $tmp,$tmp,$eighty7
>> vxor $out0,$in0,$twk0
>> - vxor $tweak,$tweak,$tmp
>> + xxlor 32+$in1, 0, 0
>> + vpermxor $tweak, $tweak, $tmp, $in1
>>
>> lvx_u $in1,$x10,$inp
>> vxor $twk1,$tweak,$rndkey0
>> vsrab $tmp,$tweak,$seven # next tweak value
>> vaddubm $tweak,$tweak,$tweak
>> - vsldoi $tmp,$tmp,$tmp,15
>> le?vperm $in1,$in1,$in1,$leperm
>> vand $tmp,$tmp,$eighty7
>> vxor $out1,$in1,$twk1
>> - vxor $tweak,$tweak,$tmp
>> + xxlor 32+$in2, 0, 0
>> + vpermxor $tweak, $tweak, $tmp, $in2
>>
>> lvx_u $in2,$x20,$inp
>> andi. $taillen,$len,15
>> vxor $twk2,$tweak,$rndkey0
>> vsrab $tmp,$tweak,$seven # next tweak value
>> vaddubm $tweak,$tweak,$tweak
>> - vsldoi $tmp,$tmp,$tmp,15
>> le?vperm $in2,$in2,$in2,$leperm
>> vand $tmp,$tmp,$eighty7
>> vxor $out2,$in2,$twk2
>> - vxor $tweak,$tweak,$tmp
>> + xxlor 32+$in3, 0, 0
>> + vpermxor $tweak, $tweak, $tmp, $in3
>>
>> lvx_u $in3,$x30,$inp
>> sub $len,$len,$taillen
>> vxor $twk3,$tweak,$rndkey0
>> vsrab $tmp,$tweak,$seven # next tweak value
>> vaddubm $tweak,$tweak,$tweak
>> - vsldoi $tmp,$tmp,$tmp,15
>> le?vperm $in3,$in3,$in3,$leperm
>> vand $tmp,$tmp,$eighty7
>> vxor $out3,$in3,$twk3
>> - vxor $tweak,$tweak,$tmp
>> + xxlor 32+$in4, 0, 0
>> + vpermxor $tweak, $tweak, $tmp, $in4
>>
>> lvx_u $in4,$x40,$inp
>> subi $len,$len,0x60
>> vxor $twk4,$tweak,$rndkey0
>> vsrab $tmp,$tweak,$seven # next tweak value
>> vaddubm $tweak,$tweak,$tweak
>> - vsldoi $tmp,$tmp,$tmp,15
>> le?vperm $in4,$in4,$in4,$leperm
>> vand $tmp,$tmp,$eighty7
>> vxor $out4,$in4,$twk4
>> - vxor $tweak,$tweak,$tmp
>> + xxlor 32+$in5, 0, 0
>> + vpermxor $tweak, $tweak, $tmp, $in5
>>
>> lvx_u $in5,$x50,$inp
>> addi $inp,$inp,0x60
>> vxor $twk5,$tweak,$rndkey0
>> vsrab $tmp,$tweak,$seven # next tweak value
>> vaddubm $tweak,$tweak,$tweak
>> - vsldoi $tmp,$tmp,$tmp,15
>> le?vperm $in5,$in5,$in5,$leperm
>> vand $tmp,$tmp,$eighty7
>> vxor $out5,$in5,$twk5
>> - vxor $tweak,$tweak,$tmp
>> + xxlor 32+$in0, 0, 0
>> + vpermxor $tweak, $tweak, $tmp, $in0
>>
>> vxor v31,v31,$rndkey0
>> mtctr $rounds
>> @@ -2625,6 +2645,8 @@ Loop_xts_enc6x:
>> lvx v25,$x10,$key_ # round[4]
>> bdnz Loop_xts_enc6x
>>
>> + xxlor 32+$eighty7, 1, 1 # 0x010101..87
>> +
>> subic $len,$len,96 # $len-=96
>> vxor $in0,$twk0,v31 # xor with last round key
>> vcipher $out0,$out0,v24
>> @@ -2634,7 +2656,6 @@ Loop_xts_enc6x:
>> vaddubm $tweak,$tweak,$tweak
>> vcipher $out2,$out2,v24
>> vcipher $out3,$out3,v24
>> - vsldoi $tmp,$tmp,$tmp,15
>> vcipher $out4,$out4,v24
>> vcipher $out5,$out5,v24
>>
>> @@ -2642,7 +2663,8 @@ Loop_xts_enc6x:
>> vand $tmp,$tmp,$eighty7
>> vcipher $out0,$out0,v25
>> vcipher $out1,$out1,v25
>> - vxor $tweak,$tweak,$tmp
>> + xxlor 32+$in1, 0, 0
>> + vpermxor $tweak, $tweak, $tmp, $in1
>> vcipher $out2,$out2,v25
>> vcipher $out3,$out3,v25
>> vxor $in1,$twk1,v31
>> @@ -2653,13 +2675,13 @@ Loop_xts_enc6x:
>>
>> and r0,r0,$len
>> vaddubm $tweak,$tweak,$tweak
>> - vsldoi $tmp,$tmp,$tmp,15
>> vcipher $out0,$out0,v26
>> vcipher $out1,$out1,v26
>> vand $tmp,$tmp,$eighty7
>> vcipher $out2,$out2,v26
>> vcipher $out3,$out3,v26
>> - vxor $tweak,$tweak,$tmp
>> + xxlor 32+$in2, 0, 0
>> + vpermxor $tweak, $tweak, $tmp, $in2
>> vcipher $out4,$out4,v26
>> vcipher $out5,$out5,v26
>>
>> @@ -2673,7 +2695,6 @@ Loop_xts_enc6x:
>> vaddubm $tweak,$tweak,$tweak
>> vcipher $out0,$out0,v27
>> vcipher $out1,$out1,v27
>> - vsldoi $tmp,$tmp,$tmp,15
>> vcipher $out2,$out2,v27
>> vcipher $out3,$out3,v27
>> vand $tmp,$tmp,$eighty7
>> @@ -2681,7 +2702,8 @@ Loop_xts_enc6x:
>> vcipher $out5,$out5,v27
>>
>> addi $key_,$sp,$FRAME+15 # rewind $key_
>> - vxor $tweak,$tweak,$tmp
>> + xxlor 32+$in3, 0, 0
>> + vpermxor $tweak, $tweak, $tmp, $in3
>> vcipher $out0,$out0,v28
>> vcipher $out1,$out1,v28
>> vxor $in3,$twk3,v31
>> @@ -2690,7 +2712,6 @@ Loop_xts_enc6x:
>> vcipher $out2,$out2,v28
>> vcipher $out3,$out3,v28
>> vaddubm $tweak,$tweak,$tweak
>> - vsldoi $tmp,$tmp,$tmp,15
>> vcipher $out4,$out4,v28
>> vcipher $out5,$out5,v28
>> lvx v24,$x00,$key_ # re-pre-load round[1]
>> @@ -2698,7 +2719,8 @@ Loop_xts_enc6x:
>>
>> vcipher $out0,$out0,v29
>> vcipher $out1,$out1,v29
>> - vxor $tweak,$tweak,$tmp
>> + xxlor 32+$in4, 0, 0
>> + vpermxor $tweak, $tweak, $tmp, $in4
>> vcipher $out2,$out2,v29
>> vcipher $out3,$out3,v29
>> vxor $in4,$twk4,v31
>> @@ -2708,14 +2730,14 @@ Loop_xts_enc6x:
>> vcipher $out5,$out5,v29
>> lvx v25,$x10,$key_ # re-pre-load round[2]
>> vaddubm $tweak,$tweak,$tweak
>> - vsldoi $tmp,$tmp,$tmp,15
>>
>> vcipher $out0,$out0,v30
>> vcipher $out1,$out1,v30
>> vand $tmp,$tmp,$eighty7
>> vcipher $out2,$out2,v30
>> vcipher $out3,$out3,v30
>> - vxor $tweak,$tweak,$tmp
>> + xxlor 32+$in5, 0, 0
>> + vpermxor $tweak, $tweak, $tmp, $in5
>> vcipher $out4,$out4,v30
>> vcipher $out5,$out5,v30
>> vxor $in5,$twk5,v31
>> @@ -2725,7 +2747,6 @@ Loop_xts_enc6x:
>> vcipherlast $out0,$out0,$in0
>> lvx_u $in0,$x00,$inp # load next input block
>> vaddubm $tweak,$tweak,$tweak
>> - vsldoi $tmp,$tmp,$tmp,15
>> vcipherlast $out1,$out1,$in1
>> lvx_u $in1,$x10,$inp
>> vcipherlast $out2,$out2,$in2
>> @@ -2738,7 +2759,10 @@ Loop_xts_enc6x:
>> vcipherlast $out4,$out4,$in4
>> le?vperm $in2,$in2,$in2,$leperm
>> lvx_u $in4,$x40,$inp
>> - vxor $tweak,$tweak,$tmp
>> + xxlor 10, 32+$in0, 32+$in0
>> + xxlor 32+$in0, 0, 0
>> + vpermxor $tweak, $tweak, $tmp, $in0
>> + xxlor 32+$in0, 10, 10
>> vcipherlast $tmp,$out5,$in5 # last block might be needed
>> # in stealing mode
>> le?vperm $in3,$in3,$in3,$leperm
>> @@ -2771,6 +2795,8 @@ Loop_xts_enc6x:
>> mtctr $rounds
>> beq Loop_xts_enc6x # did $len-=96 borrow?
>>
>> + xxlor 32+$eighty7, 2, 2 # 0x010101..87
>> +
>> addic. $len,$len,0x60
>> beq Lxts_enc6x_zero
>> cmpwi $len,0x20
>> @@ -3147,6 +3173,17 @@ _aesp8_xts_decrypt6x:
>> li $x70,0x70
>> mtspr 256,r0
>>
>> + xxlor 2, 32+$eighty7, 32+$eighty7
>> + vsldoi $eighty7,$tmp,$eighty7,1 # 0x010101..87
>> + xxlor 1, 32+$eighty7, 32+$eighty7
>> +
>> + # Load XOR Lconsts.
>> + mr $x70, r6
>> + bl Lconsts
>> + lxvw4x 0, $x40, r6 # load XOR contents
>> + mr r6, $x70
>> + li $x70,0x70
>> +
>> subi $rounds,$rounds,3 # -4 in total
>>
>> lvx $rndkey0,$x00,$key1 # load key schedule
>> @@ -3194,64 +3231,64 @@ Load_xts_dec_key:
>> vxor $twk0,$tweak,$rndkey0
>> vsrab $tmp,$tweak,$seven # next tweak value
>> vaddubm $tweak,$tweak,$tweak
>> - vsldoi $tmp,$tmp,$tmp,15
>> vand $tmp,$tmp,$eighty7
>> vxor $out0,$in0,$twk0
>> - vxor $tweak,$tweak,$tmp
>> + xxlor 32+$in1, 0, 0
>> + vpermxor $tweak, $tweak, $tmp, $in1
>>
>> lvx_u $in1,$x10,$inp
>> vxor $twk1,$tweak,$rndkey0
>> vsrab $tmp,$tweak,$seven # next tweak value
>> vaddubm $tweak,$tweak,$tweak
>> - vsldoi $tmp,$tmp,$tmp,15
>> le?vperm $in1,$in1,$in1,$leperm
>> vand $tmp,$tmp,$eighty7
>> vxor $out1,$in1,$twk1
>> - vxor $tweak,$tweak,$tmp
>> + xxlor 32+$in2, 0, 0
>> + vpermxor $tweak, $tweak, $tmp, $in2
>>
>> lvx_u $in2,$x20,$inp
>> andi. $taillen,$len,15
>> vxor $twk2,$tweak,$rndkey0
>> vsrab $tmp,$tweak,$seven # next tweak value
>> vaddubm $tweak,$tweak,$tweak
>> - vsldoi $tmp,$tmp,$tmp,15
>> le?vperm $in2,$in2,$in2,$leperm
>> vand $tmp,$tmp,$eighty7
>> vxor $out2,$in2,$twk2
>> - vxor $tweak,$tweak,$tmp
>> + xxlor 32+$in3, 0, 0
>> + vpermxor $tweak, $tweak, $tmp, $in3
>>
>> lvx_u $in3,$x30,$inp
>> sub $len,$len,$taillen
>> vxor $twk3,$tweak,$rndkey0
>> vsrab $tmp,$tweak,$seven # next tweak value
>> vaddubm $tweak,$tweak,$tweak
>> - vsldoi $tmp,$tmp,$tmp,15
>> le?vperm $in3,$in3,$in3,$leperm
>> vand $tmp,$tmp,$eighty7
>> vxor $out3,$in3,$twk3
>> - vxor $tweak,$tweak,$tmp
>> + xxlor 32+$in4, 0, 0
>> + vpermxor $tweak, $tweak, $tmp, $in4
>>
>> lvx_u $in4,$x40,$inp
>> subi $len,$len,0x60
>> vxor $twk4,$tweak,$rndkey0
>> vsrab $tmp,$tweak,$seven # next tweak value
>> vaddubm $tweak,$tweak,$tweak
>> - vsldoi $tmp,$tmp,$tmp,15
>> le?vperm $in4,$in4,$in4,$leperm
>> vand $tmp,$tmp,$eighty7
>> vxor $out4,$in4,$twk4
>> - vxor $tweak,$tweak,$tmp
>> + xxlor 32+$in5, 0, 0
>> + vpermxor $tweak, $tweak, $tmp, $in5
>>
>> lvx_u $in5,$x50,$inp
>> addi $inp,$inp,0x60
>> vxor $twk5,$tweak,$rndkey0
>> vsrab $tmp,$tweak,$seven # next tweak value
>> vaddubm $tweak,$tweak,$tweak
>> - vsldoi $tmp,$tmp,$tmp,15
>> le?vperm $in5,$in5,$in5,$leperm
>> vand $tmp,$tmp,$eighty7
>> vxor $out5,$in5,$twk5
>> - vxor $tweak,$tweak,$tmp
>> + xxlor 32+$in0, 0, 0
>> + vpermxor $tweak, $tweak, $tmp, $in0
>>
>> vxor v31,v31,$rndkey0
>> mtctr $rounds
>> @@ -3277,6 +3314,8 @@ Loop_xts_dec6x:
>> lvx v25,$x10,$key_ # round[4]
>> bdnz Loop_xts_dec6x
>>
>> + xxlor 32+$eighty7, 1, 1 # 0x010101..87
>> +
>> subic $len,$len,96 # $len-=96
>> vxor $in0,$twk0,v31 # xor with last round key
>> vncipher $out0,$out0,v24
>> @@ -3286,7 +3325,6 @@ Loop_xts_dec6x:
>> vaddubm $tweak,$tweak,$tweak
>> vncipher $out2,$out2,v24
>> vncipher $out3,$out3,v24
>> - vsldoi $tmp,$tmp,$tmp,15
>> vncipher $out4,$out4,v24
>> vncipher $out5,$out5,v24
>>
>> @@ -3294,7 +3332,8 @@ Loop_xts_dec6x:
>> vand $tmp,$tmp,$eighty7
>> vncipher $out0,$out0,v25
>> vncipher $out1,$out1,v25
>> - vxor $tweak,$tweak,$tmp
>> + xxlor 32+$in1, 0, 0
>> + vpermxor $tweak, $tweak, $tmp, $in1
>> vncipher $out2,$out2,v25
>> vncipher $out3,$out3,v25
>> vxor $in1,$twk1,v31
>> @@ -3305,13 +3344,13 @@ Loop_xts_dec6x:
>>
>> and r0,r0,$len
>> vaddubm $tweak,$tweak,$tweak
>> - vsldoi $tmp,$tmp,$tmp,15
>> vncipher $out0,$out0,v26
>> vncipher $out1,$out1,v26
>> vand $tmp,$tmp,$eighty7
>> vncipher $out2,$out2,v26
>> vncipher $out3,$out3,v26
>> - vxor $tweak,$tweak,$tmp
>> + xxlor 32+$in2, 0, 0
>> + vpermxor $tweak, $tweak, $tmp, $in2
>> vncipher $out4,$out4,v26
>> vncipher $out5,$out5,v26
>>
>> @@ -3325,7 +3364,6 @@ Loop_xts_dec6x:
>> vaddubm $tweak,$tweak,$tweak
>> vncipher $out0,$out0,v27
>> vncipher $out1,$out1,v27
>> - vsldoi $tmp,$tmp,$tmp,15
>> vncipher $out2,$out2,v27
>> vncipher $out3,$out3,v27
>> vand $tmp,$tmp,$eighty7
>> @@ -3333,7 +3371,8 @@ Loop_xts_dec6x:
>> vncipher $out5,$out5,v27
>>
>> addi $key_,$sp,$FRAME+15 # rewind $key_
>> - vxor $tweak,$tweak,$tmp
>> + xxlor 32+$in3, 0, 0
>> + vpermxor $tweak, $tweak, $tmp, $in3
>> vncipher $out0,$out0,v28
>> vncipher $out1,$out1,v28
>> vxor $in3,$twk3,v31
>> @@ -3342,7 +3381,6 @@ Loop_xts_dec6x:
>> vncipher $out2,$out2,v28
>> vncipher $out3,$out3,v28
>> vaddubm $tweak,$tweak,$tweak
>> - vsldoi $tmp,$tmp,$tmp,15
>> vncipher $out4,$out4,v28
>> vncipher $out5,$out5,v28
>> lvx v24,$x00,$key_ # re-pre-load round[1]
>> @@ -3350,7 +3388,8 @@ Loop_xts_dec6x:
>>
>> vncipher $out0,$out0,v29
>> vncipher $out1,$out1,v29
>> - vxor $tweak,$tweak,$tmp
>> + xxlor 32+$in4, 0, 0
>> + vpermxor $tweak, $tweak, $tmp, $in4
>> vncipher $out2,$out2,v29
>> vncipher $out3,$out3,v29
>> vxor $in4,$twk4,v31
>> @@ -3360,14 +3399,14 @@ Loop_xts_dec6x:
>> vncipher $out5,$out5,v29
>> lvx v25,$x10,$key_ # re-pre-load round[2]
>> vaddubm $tweak,$tweak,$tweak
>> - vsldoi $tmp,$tmp,$tmp,15
>>
>> vncipher $out0,$out0,v30
>> vncipher $out1,$out1,v30
>> vand $tmp,$tmp,$eighty7
>> vncipher $out2,$out2,v30
>> vncipher $out3,$out3,v30
>> - vxor $tweak,$tweak,$tmp
>> + xxlor 32+$in5, 0, 0
>> + vpermxor $tweak, $tweak, $tmp, $in5
>> vncipher $out4,$out4,v30
>> vncipher $out5,$out5,v30
>> vxor $in5,$twk5,v31
>> @@ -3377,7 +3416,6 @@ Loop_xts_dec6x:
>> vncipherlast $out0,$out0,$in0
>> lvx_u $in0,$x00,$inp # load next input block
>> vaddubm $tweak,$tweak,$tweak
>> - vsldoi $tmp,$tmp,$tmp,15
>> vncipherlast $out1,$out1,$in1
>> lvx_u $in1,$x10,$inp
>> vncipherlast $out2,$out2,$in2
>> @@ -3390,7 +3428,10 @@ Loop_xts_dec6x:
>> vncipherlast $out4,$out4,$in4
>> le?vperm $in2,$in2,$in2,$leperm
>> lvx_u $in4,$x40,$inp
>> - vxor $tweak,$tweak,$tmp
>> + xxlor 10, 32+$in0, 32+$in0
>> + xxlor 32+$in0, 0, 0
>> + vpermxor $tweak, $tweak, $tmp, $in0
>> + xxlor 32+$in0, 10, 10
>> vncipherlast $out5,$out5,$in5
>> le?vperm $in3,$in3,$in3,$leperm
>> lvx_u $in5,$x50,$inp
>> @@ -3421,6 +3462,8 @@ Loop_xts_dec6x:
>> mtctr $rounds
>> beq Loop_xts_dec6x # did $len-=96 borrow?
>>
>> + xxlor 32+$eighty7, 2, 2 # 0x010101..87
>> +
>> addic. $len,$len,0x60
>> beq Lxts_dec6x_zero
>> cmpwi $len,0x20
>> --
>> 2.31.1