Danny Tsen <dt...@linux.ibm.com> writes:
> Improve AES/XTS performance of the 6-way unrolling for PowerPC by up to
> 17% with tcrypt.  This is done by using one instruction, vpermxor, to
> replace xor and vsldoi.
>
> This patch has been tested with the kernel crypto module tcrypt.ko and
> has passed the selftest.  The patch is also tested with
> CONFIG_CRYPTO_MANAGER_EXTRA_TESTS enabled.
>
> Signed-off-by: Danny Tsen <dt...@linux.ibm.com>
> ---
>  drivers/crypto/vmx/aesp8-ppc.pl | 141 +++++++++++++++++++++-----------
>  1 file changed, 92 insertions(+), 49 deletions(-)

That's CRYPTOGAMS code, and is so far largely unchanged from the
original. I see you've sent the same change to OpenSSL, but it's not
merged yet. Please document that in the change log; we want to keep the
code in sync as much as possible, and to document any divergences.
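
As an aside for other reviewers: the vsrab/vand/vaddubm/vpermxor sequence
described in the new comment block is just the usual XTS tweak doubling
(multiply by x in GF(2^128) with the 0x87 reduction), folded into fewer
vector instructions. A rough scalar C sketch of that step, purely for
illustration; it assumes the conventional XTS little-endian byte order,
not the exact vector lane layout the assembly uses:

  #include <stdint.h>

  /*
   * Illustrative only: scalar equivalent of one XTS tweak update
   * (multiply by x in GF(2^128), 0x87 feedback), which the
   * vsrab/vand/vaddubm/vpermxor sequence computes on a whole
   * 16-byte vector per step.
   */
  static void xts_tweak_double(uint8_t t[16])
  {
  	uint8_t carry = t[15] >> 7;	/* top bit that falls off */
  	int i;

  	for (i = 15; i > 0; i--)
  		t[i] = (uint8_t)((t[i] << 1) | (t[i - 1] >> 7));
  	t[0] = (uint8_t)(t[0] << 1);
  	if (carry)
  		t[0] ^= 0x87;		/* reduction polynomial */
  }

Not something to merge anywhere, just context for why the 0x87 constant
and the carry handling show up in the tweak schedule.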

cheers

> diff --git a/drivers/crypto/vmx/aesp8-ppc.pl b/drivers/crypto/vmx/aesp8-ppc.pl
> index 50a0a18f35da..f729589d792e 100644
> --- a/drivers/crypto/vmx/aesp8-ppc.pl
> +++ b/drivers/crypto/vmx/aesp8-ppc.pl
> @@ -132,11 +132,12 @@ rcon:
>  .long        0x1b000000, 0x1b000000, 0x1b000000, 0x1b000000  ?rev
>  .long        0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c  ?rev
>  .long        0,0,0,0                                         ?asis
> +.long        0x0f102132, 0x43546576, 0x8798a9ba, 0xcbdcedfe
>  Lconsts:
>       mflr    r0
>       bcl     20,31,\$+4
>       mflr    $ptr     #vvvvv "distance between . and rcon
> -     addi    $ptr,$ptr,-0x48
> +     addi    $ptr,$ptr,-0x58
>       mtlr    r0
>       blr
>       .long   0
> @@ -2495,6 +2496,17 @@ _aesp8_xts_encrypt6x:
>       li              $x70,0x70
>       mtspr           256,r0
>  
> +     xxlor           2, 32+$eighty7, 32+$eighty7
> +     vsldoi          $eighty7,$tmp,$eighty7,1        # 0x010101..87
> +     xxlor           1, 32+$eighty7, 32+$eighty7
> +
> +     # Load XOR Lconsts.
> +     mr              $x70, r6
> +     bl              Lconsts
> +     lxvw4x          0, $x40, r6             # load XOR contents
> +     mr              r6, $x70
> +     li              $x70,0x70
> +
>       subi            $rounds,$rounds,3       # -4 in total
>  
>       lvx             $rndkey0,$x00,$key1     # load key schedule
> @@ -2537,69 +2549,77 @@ Load_xts_enc_key:
>       ?vperm          v31,v31,$twk5,$keyperm
>       lvx             v25,$x10,$key_          # pre-load round[2]
>  
> +     # Switch to the following code sequence, using 0x010101..87 to generate the tweak.
> +     #     eighty7 = 0x010101..87
> +     # vsrab         tmp, tweak, seven       # next tweak value, right shift 7 bits
> +     # vand          tmp, tmp, eighty7       # last byte with carry
> +     # vaddubm       tweak, tweak, tweak     # left shift 1 bit (x2)
> +     # xxlor         vsx, 0, 0
> +     # vpermxor      tweak, tweak, tmp, vsx
> +
>        vperm          $in0,$inout,$inptail,$inpperm
>        subi           $inp,$inp,31            # undo "caller"
>       vxor            $twk0,$tweak,$rndkey0
>       vsrab           $tmp,$tweak,$seven      # next tweak value
>       vaddubm         $tweak,$tweak,$tweak
> -     vsldoi          $tmp,$tmp,$tmp,15
>       vand            $tmp,$tmp,$eighty7
>        vxor           $out0,$in0,$twk0
> -     vxor            $tweak,$tweak,$tmp
> +     xxlor           32+$in1, 0, 0
> +     vpermxor        $tweak, $tweak, $tmp, $in1
>  
>        lvx_u          $in1,$x10,$inp
>       vxor            $twk1,$tweak,$rndkey0
>       vsrab           $tmp,$tweak,$seven      # next tweak value
>       vaddubm         $tweak,$tweak,$tweak
> -     vsldoi          $tmp,$tmp,$tmp,15
>        le?vperm       $in1,$in1,$in1,$leperm
>       vand            $tmp,$tmp,$eighty7
>        vxor           $out1,$in1,$twk1
> -     vxor            $tweak,$tweak,$tmp
> +     xxlor           32+$in2, 0, 0
> +     vpermxor        $tweak, $tweak, $tmp, $in2
>  
>        lvx_u          $in2,$x20,$inp
>        andi.          $taillen,$len,15
>       vxor            $twk2,$tweak,$rndkey0
>       vsrab           $tmp,$tweak,$seven      # next tweak value
>       vaddubm         $tweak,$tweak,$tweak
> -     vsldoi          $tmp,$tmp,$tmp,15
>        le?vperm       $in2,$in2,$in2,$leperm
>       vand            $tmp,$tmp,$eighty7
>        vxor           $out2,$in2,$twk2
> -     vxor            $tweak,$tweak,$tmp
> +     xxlor           32+$in3, 0, 0
> +     vpermxor        $tweak, $tweak, $tmp, $in3
>  
>        lvx_u          $in3,$x30,$inp
>        sub            $len,$len,$taillen
>       vxor            $twk3,$tweak,$rndkey0
>       vsrab           $tmp,$tweak,$seven      # next tweak value
>       vaddubm         $tweak,$tweak,$tweak
> -     vsldoi          $tmp,$tmp,$tmp,15
>        le?vperm       $in3,$in3,$in3,$leperm
>       vand            $tmp,$tmp,$eighty7
>        vxor           $out3,$in3,$twk3
> -     vxor            $tweak,$tweak,$tmp
> +     xxlor           32+$in4, 0, 0
> +     vpermxor        $tweak, $tweak, $tmp, $in4
>  
>        lvx_u          $in4,$x40,$inp
>        subi           $len,$len,0x60
>       vxor            $twk4,$tweak,$rndkey0
>       vsrab           $tmp,$tweak,$seven      # next tweak value
>       vaddubm         $tweak,$tweak,$tweak
> -     vsldoi          $tmp,$tmp,$tmp,15
>        le?vperm       $in4,$in4,$in4,$leperm
>       vand            $tmp,$tmp,$eighty7
>        vxor           $out4,$in4,$twk4
> -     vxor            $tweak,$tweak,$tmp
> +     xxlor           32+$in5, 0, 0
> +     vpermxor        $tweak, $tweak, $tmp, $in5
>  
>        lvx_u          $in5,$x50,$inp
>        addi           $inp,$inp,0x60
>       vxor            $twk5,$tweak,$rndkey0
>       vsrab           $tmp,$tweak,$seven      # next tweak value
>       vaddubm         $tweak,$tweak,$tweak
> -     vsldoi          $tmp,$tmp,$tmp,15
>        le?vperm       $in5,$in5,$in5,$leperm
>       vand            $tmp,$tmp,$eighty7
>        vxor           $out5,$in5,$twk5
> -     vxor            $tweak,$tweak,$tmp
> +     xxlor           32+$in0, 0, 0
> +     vpermxor        $tweak, $tweak, $tmp, $in0
>  
>       vxor            v31,v31,$rndkey0
>       mtctr           $rounds
> @@ -2625,6 +2645,8 @@ Loop_xts_enc6x:
>       lvx             v25,$x10,$key_          # round[4]
>       bdnz            Loop_xts_enc6x
>  
> +     xxlor           32+$eighty7, 1, 1       # 0x010101..87
> +
>       subic           $len,$len,96            # $len-=96
>        vxor           $in0,$twk0,v31          # xor with last round key
>       vcipher         $out0,$out0,v24
> @@ -2634,7 +2656,6 @@ Loop_xts_enc6x:
>        vaddubm        $tweak,$tweak,$tweak
>       vcipher         $out2,$out2,v24
>       vcipher         $out3,$out3,v24
> -      vsldoi         $tmp,$tmp,$tmp,15
>       vcipher         $out4,$out4,v24
>       vcipher         $out5,$out5,v24
>  
> @@ -2642,7 +2663,8 @@ Loop_xts_enc6x:
>        vand           $tmp,$tmp,$eighty7
>       vcipher         $out0,$out0,v25
>       vcipher         $out1,$out1,v25
> -      vxor           $tweak,$tweak,$tmp
> +      xxlor          32+$in1, 0, 0
> +      vpermxor       $tweak, $tweak, $tmp, $in1
>       vcipher         $out2,$out2,v25
>       vcipher         $out3,$out3,v25
>        vxor           $in1,$twk1,v31
> @@ -2653,13 +2675,13 @@ Loop_xts_enc6x:
>  
>       and             r0,r0,$len
>        vaddubm        $tweak,$tweak,$tweak
> -      vsldoi         $tmp,$tmp,$tmp,15
>       vcipher         $out0,$out0,v26
>       vcipher         $out1,$out1,v26
>        vand           $tmp,$tmp,$eighty7
>       vcipher         $out2,$out2,v26
>       vcipher         $out3,$out3,v26
> -      vxor           $tweak,$tweak,$tmp
> +      xxlor          32+$in2, 0, 0
> +      vpermxor       $tweak, $tweak, $tmp, $in2
>       vcipher         $out4,$out4,v26
>       vcipher         $out5,$out5,v26
>  
> @@ -2673,7 +2695,6 @@ Loop_xts_enc6x:
>        vaddubm        $tweak,$tweak,$tweak
>       vcipher         $out0,$out0,v27
>       vcipher         $out1,$out1,v27
> -      vsldoi         $tmp,$tmp,$tmp,15
>       vcipher         $out2,$out2,v27
>       vcipher         $out3,$out3,v27
>        vand           $tmp,$tmp,$eighty7
> @@ -2681,7 +2702,8 @@ Loop_xts_enc6x:
>       vcipher         $out5,$out5,v27
>  
>       addi            $key_,$sp,$FRAME+15     # rewind $key_
> -      vxor           $tweak,$tweak,$tmp
> +      xxlor          32+$in3, 0, 0
> +      vpermxor       $tweak, $tweak, $tmp, $in3
>       vcipher         $out0,$out0,v28
>       vcipher         $out1,$out1,v28
>        vxor           $in3,$twk3,v31
> @@ -2690,7 +2712,6 @@ Loop_xts_enc6x:
>       vcipher         $out2,$out2,v28
>       vcipher         $out3,$out3,v28
>        vaddubm        $tweak,$tweak,$tweak
> -      vsldoi         $tmp,$tmp,$tmp,15
>       vcipher         $out4,$out4,v28
>       vcipher         $out5,$out5,v28
>       lvx             v24,$x00,$key_          # re-pre-load round[1]
> @@ -2698,7 +2719,8 @@ Loop_xts_enc6x:
>  
>       vcipher         $out0,$out0,v29
>       vcipher         $out1,$out1,v29
> -      vxor           $tweak,$tweak,$tmp
> +      xxlor          32+$in4, 0, 0
> +      vpermxor       $tweak, $tweak, $tmp, $in4
>       vcipher         $out2,$out2,v29
>       vcipher         $out3,$out3,v29
>        vxor           $in4,$twk4,v31
> @@ -2708,14 +2730,14 @@ Loop_xts_enc6x:
>       vcipher         $out5,$out5,v29
>       lvx             v25,$x10,$key_          # re-pre-load round[2]
>        vaddubm        $tweak,$tweak,$tweak
> -      vsldoi         $tmp,$tmp,$tmp,15
>  
>       vcipher         $out0,$out0,v30
>       vcipher         $out1,$out1,v30
>        vand           $tmp,$tmp,$eighty7
>       vcipher         $out2,$out2,v30
>       vcipher         $out3,$out3,v30
> -      vxor           $tweak,$tweak,$tmp
> +      xxlor          32+$in5, 0, 0
> +      vpermxor       $tweak, $tweak, $tmp, $in5
>       vcipher         $out4,$out4,v30
>       vcipher         $out5,$out5,v30
>        vxor           $in5,$twk5,v31
> @@ -2725,7 +2747,6 @@ Loop_xts_enc6x:
>       vcipherlast     $out0,$out0,$in0
>        lvx_u          $in0,$x00,$inp          # load next input block
>        vaddubm        $tweak,$tweak,$tweak
> -      vsldoi         $tmp,$tmp,$tmp,15
>       vcipherlast     $out1,$out1,$in1
>        lvx_u          $in1,$x10,$inp
>       vcipherlast     $out2,$out2,$in2
> @@ -2738,7 +2759,10 @@ Loop_xts_enc6x:
>       vcipherlast     $out4,$out4,$in4
>        le?vperm       $in2,$in2,$in2,$leperm
>        lvx_u          $in4,$x40,$inp
> -      vxor           $tweak,$tweak,$tmp
> +      xxlor          10, 32+$in0, 32+$in0
> +      xxlor          32+$in0, 0, 0
> +      vpermxor       $tweak, $tweak, $tmp, $in0
> +      xxlor          32+$in0, 10, 10
>       vcipherlast     $tmp,$out5,$in5         # last block might be needed
>                                               # in stealing mode
>        le?vperm       $in3,$in3,$in3,$leperm
> @@ -2771,6 +2795,8 @@ Loop_xts_enc6x:
>       mtctr           $rounds
>       beq             Loop_xts_enc6x          # did $len-=96 borrow?
>  
> +     xxlor           32+$eighty7, 2, 2       # 0x010101..87
> +
>       addic.          $len,$len,0x60
>       beq             Lxts_enc6x_zero
>       cmpwi           $len,0x20
> @@ -3147,6 +3173,17 @@ _aesp8_xts_decrypt6x:
>       li              $x70,0x70
>       mtspr           256,r0
>  
> +     xxlor           2, 32+$eighty7, 32+$eighty7
> +     vsldoi          $eighty7,$tmp,$eighty7,1        # 0x010101..87
> +     xxlor           1, 32+$eighty7, 32+$eighty7
> +
> +     # Load XOR Lconsts.
> +     mr              $x70, r6
> +     bl              Lconsts
> +     lxvw4x          0, $x40, r6             # load XOR contents
> +     mr              r6, $x70
> +     li              $x70,0x70
> +
>       subi            $rounds,$rounds,3       # -4 in total
>  
>       lvx             $rndkey0,$x00,$key1     # load key schedule
> @@ -3194,64 +3231,64 @@ Load_xts_dec_key:
>       vxor            $twk0,$tweak,$rndkey0
>       vsrab           $tmp,$tweak,$seven      # next tweak value
>       vaddubm         $tweak,$tweak,$tweak
> -     vsldoi          $tmp,$tmp,$tmp,15
>       vand            $tmp,$tmp,$eighty7
>        vxor           $out0,$in0,$twk0
> -     vxor            $tweak,$tweak,$tmp
> +     xxlor           32+$in1, 0, 0
> +     vpermxor        $tweak, $tweak, $tmp, $in1
>  
>        lvx_u          $in1,$x10,$inp
>       vxor            $twk1,$tweak,$rndkey0
>       vsrab           $tmp,$tweak,$seven      # next tweak value
>       vaddubm         $tweak,$tweak,$tweak
> -     vsldoi          $tmp,$tmp,$tmp,15
>        le?vperm       $in1,$in1,$in1,$leperm
>       vand            $tmp,$tmp,$eighty7
>        vxor           $out1,$in1,$twk1
> -     vxor            $tweak,$tweak,$tmp
> +     xxlor           32+$in2, 0, 0
> +     vpermxor        $tweak, $tweak, $tmp, $in2
>  
>        lvx_u          $in2,$x20,$inp
>        andi.          $taillen,$len,15
>       vxor            $twk2,$tweak,$rndkey0
>       vsrab           $tmp,$tweak,$seven      # next tweak value
>       vaddubm         $tweak,$tweak,$tweak
> -     vsldoi          $tmp,$tmp,$tmp,15
>        le?vperm       $in2,$in2,$in2,$leperm
>       vand            $tmp,$tmp,$eighty7
>        vxor           $out2,$in2,$twk2
> -     vxor            $tweak,$tweak,$tmp
> +     xxlor           32+$in3, 0, 0
> +     vpermxor        $tweak, $tweak, $tmp, $in3
>  
>        lvx_u          $in3,$x30,$inp
>        sub            $len,$len,$taillen
>       vxor            $twk3,$tweak,$rndkey0
>       vsrab           $tmp,$tweak,$seven      # next tweak value
>       vaddubm         $tweak,$tweak,$tweak
> -     vsldoi          $tmp,$tmp,$tmp,15
>        le?vperm       $in3,$in3,$in3,$leperm
>       vand            $tmp,$tmp,$eighty7
>        vxor           $out3,$in3,$twk3
> -     vxor            $tweak,$tweak,$tmp
> +     xxlor           32+$in4, 0, 0
> +     vpermxor        $tweak, $tweak, $tmp, $in4
>  
>        lvx_u          $in4,$x40,$inp
>        subi           $len,$len,0x60
>       vxor            $twk4,$tweak,$rndkey0
>       vsrab           $tmp,$tweak,$seven      # next tweak value
>       vaddubm         $tweak,$tweak,$tweak
> -     vsldoi          $tmp,$tmp,$tmp,15
>        le?vperm       $in4,$in4,$in4,$leperm
>       vand            $tmp,$tmp,$eighty7
>        vxor           $out4,$in4,$twk4
> -     vxor            $tweak,$tweak,$tmp
> +     xxlor           32+$in5, 0, 0
> +     vpermxor        $tweak, $tweak, $tmp, $in5
>  
>        lvx_u          $in5,$x50,$inp
>        addi           $inp,$inp,0x60
>       vxor            $twk5,$tweak,$rndkey0
>       vsrab           $tmp,$tweak,$seven      # next tweak value
>       vaddubm         $tweak,$tweak,$tweak
> -     vsldoi          $tmp,$tmp,$tmp,15
>        le?vperm       $in5,$in5,$in5,$leperm
>       vand            $tmp,$tmp,$eighty7
>        vxor           $out5,$in5,$twk5
> -     vxor            $tweak,$tweak,$tmp
> +     xxlor           32+$in0, 0, 0
> +     vpermxor        $tweak, $tweak, $tmp, $in0
>  
>       vxor            v31,v31,$rndkey0
>       mtctr           $rounds
> @@ -3277,6 +3314,8 @@ Loop_xts_dec6x:
>       lvx             v25,$x10,$key_          # round[4]
>       bdnz            Loop_xts_dec6x
>  
> +     xxlor           32+$eighty7, 1, 1       # 0x010101..87
> +
>       subic           $len,$len,96            # $len-=96
>        vxor           $in0,$twk0,v31          # xor with last round key
>       vncipher        $out0,$out0,v24
> @@ -3286,7 +3325,6 @@ Loop_xts_dec6x:
>        vaddubm        $tweak,$tweak,$tweak
>       vncipher        $out2,$out2,v24
>       vncipher        $out3,$out3,v24
> -      vsldoi         $tmp,$tmp,$tmp,15
>       vncipher        $out4,$out4,v24
>       vncipher        $out5,$out5,v24
>  
> @@ -3294,7 +3332,8 @@ Loop_xts_dec6x:
>        vand           $tmp,$tmp,$eighty7
>       vncipher        $out0,$out0,v25
>       vncipher        $out1,$out1,v25
> -      vxor           $tweak,$tweak,$tmp
> +      xxlor          32+$in1, 0, 0
> +      vpermxor       $tweak, $tweak, $tmp, $in1
>       vncipher        $out2,$out2,v25
>       vncipher        $out3,$out3,v25
>        vxor           $in1,$twk1,v31
> @@ -3305,13 +3344,13 @@ Loop_xts_dec6x:
>  
>       and             r0,r0,$len
>        vaddubm        $tweak,$tweak,$tweak
> -      vsldoi         $tmp,$tmp,$tmp,15
>       vncipher        $out0,$out0,v26
>       vncipher        $out1,$out1,v26
>        vand           $tmp,$tmp,$eighty7
>       vncipher        $out2,$out2,v26
>       vncipher        $out3,$out3,v26
> -      vxor           $tweak,$tweak,$tmp
> +      xxlor          32+$in2, 0, 0
> +      vpermxor       $tweak, $tweak, $tmp, $in2
>       vncipher        $out4,$out4,v26
>       vncipher        $out5,$out5,v26
>  
> @@ -3325,7 +3364,6 @@ Loop_xts_dec6x:
>        vaddubm        $tweak,$tweak,$tweak
>       vncipher        $out0,$out0,v27
>       vncipher        $out1,$out1,v27
> -      vsldoi         $tmp,$tmp,$tmp,15
>       vncipher        $out2,$out2,v27
>       vncipher        $out3,$out3,v27
>        vand           $tmp,$tmp,$eighty7
> @@ -3333,7 +3371,8 @@ Loop_xts_dec6x:
>       vncipher        $out5,$out5,v27
>  
>       addi            $key_,$sp,$FRAME+15     # rewind $key_
> -      vxor           $tweak,$tweak,$tmp
> +      xxlor          32+$in3, 0, 0
> +      vpermxor       $tweak, $tweak, $tmp, $in3
>       vncipher        $out0,$out0,v28
>       vncipher        $out1,$out1,v28
>        vxor           $in3,$twk3,v31
> @@ -3342,7 +3381,6 @@ Loop_xts_dec6x:
>       vncipher        $out2,$out2,v28
>       vncipher        $out3,$out3,v28
>        vaddubm        $tweak,$tweak,$tweak
> -      vsldoi         $tmp,$tmp,$tmp,15
>       vncipher        $out4,$out4,v28
>       vncipher        $out5,$out5,v28
>       lvx             v24,$x00,$key_          # re-pre-load round[1]
> @@ -3350,7 +3388,8 @@ Loop_xts_dec6x:
>  
>       vncipher        $out0,$out0,v29
>       vncipher        $out1,$out1,v29
> -      vxor           $tweak,$tweak,$tmp
> +      xxlor          32+$in4, 0, 0
> +      vpermxor       $tweak, $tweak, $tmp, $in4
>       vncipher        $out2,$out2,v29
>       vncipher        $out3,$out3,v29
>        vxor           $in4,$twk4,v31
> @@ -3360,14 +3399,14 @@ Loop_xts_dec6x:
>       vncipher        $out5,$out5,v29
>       lvx             v25,$x10,$key_          # re-pre-load round[2]
>        vaddubm        $tweak,$tweak,$tweak
> -      vsldoi         $tmp,$tmp,$tmp,15
>  
>       vncipher        $out0,$out0,v30
>       vncipher        $out1,$out1,v30
>        vand           $tmp,$tmp,$eighty7
>       vncipher        $out2,$out2,v30
>       vncipher        $out3,$out3,v30
> -      vxor           $tweak,$tweak,$tmp
> +      xxlor          32+$in5, 0, 0
> +      vpermxor       $tweak, $tweak, $tmp, $in5
>       vncipher        $out4,$out4,v30
>       vncipher        $out5,$out5,v30
>        vxor           $in5,$twk5,v31
> @@ -3377,7 +3416,6 @@ Loop_xts_dec6x:
>       vncipherlast    $out0,$out0,$in0
>        lvx_u          $in0,$x00,$inp          # load next input block
>        vaddubm        $tweak,$tweak,$tweak
> -      vsldoi         $tmp,$tmp,$tmp,15
>       vncipherlast    $out1,$out1,$in1
>        lvx_u          $in1,$x10,$inp
>       vncipherlast    $out2,$out2,$in2
> @@ -3390,7 +3428,10 @@ Loop_xts_dec6x:
>       vncipherlast    $out4,$out4,$in4
>        le?vperm       $in2,$in2,$in2,$leperm
>        lvx_u          $in4,$x40,$inp
> -      vxor           $tweak,$tweak,$tmp
> +      xxlor          10, 32+$in0, 32+$in0
> +      xxlor          32+$in0, 0, 0
> +      vpermxor       $tweak, $tweak, $tmp, $in0
> +      xxlor          32+$in0, 10, 10
>       vncipherlast    $out5,$out5,$in5
>        le?vperm       $in3,$in3,$in3,$leperm
>        lvx_u          $in5,$x50,$inp
> @@ -3421,6 +3462,8 @@ Loop_xts_dec6x:
>       mtctr           $rounds
>       beq             Loop_xts_dec6x          # did $len-=96 borrow?
>  
> +     xxlor           32+$eighty7, 2, 2       # 0x010101..87
> +
>       addic.          $len,$len,0x60
>       beq             Lxts_dec6x_zero
>       cmpwi           $len,0x20
> -- 
> 2.31.1
