dule aes[enc|dec] every cycle optimal interleave factor
# equals to corresponding instructions latency. 8x is optimal for
# * Bridge and "super-optimal" for other Intel CPUs...

# aesni_generate2($dir)
#
# Appends the assembly text of the private routine _aesni_${dir}rypt2 to
# $code. $dir is spliced into the symbol name, into the local loop label
# and into the aes${dir} / aes${dir}last mnemonics; the call site at the
# end of this section passes "enc", which yields _aesni_encrypt2.
#
# Shape of the emitted routine (the wider variants below follow the same
# scheme):
#   * the first round key is loaded from ($key) and xor-ed into every
#     block; the next two round keys come from 16($key) and 32($key);
#   * $rounds is multiplied by 16 (shl 4), $key is moved to
#     32($key,$rounds) and %rax is negated, so round keys are addressed
#     with a negative index that is stepped by 32 per loop pass;
#   * one loop pass applies two rounds to every block, first with
#     $rndkey1, then with $rndkey0;
#   * `jnz` closes the loop on the outcome of the earlier `add \$32,%rax`,
#     so the instructions scheduled in between must leave the flags alone;
#   * after the loop one more aes${dir} round and the closing
#     aes${dir}last are issued.
#
# NOTE(review): the index arithmetic is done on %rax while the shift is
# done on $rounds; the in-line `# $rounds` remark suggests %rax is the
# 64-bit view of $rounds -- confirm against the register assignments
# earlier in the file. $movkey is defined outside this section;
# presumably a 16-byte load mnemonic -- confirm.
sub aesni_generate2 {
    my $dir=shift;
    # As already mentioned it takes in $key and $rounds, which are *not*
    # preserved. $inout[0-1] is cipher/clear text...
    $code.=<<___;
.type	_aesni_${dir}rypt2,\@abi-omnipotent
.align	16
_aesni_${dir}rypt2:
.cfi_startproc
	$movkey	($key),$rndkey0
	shl	\$4,$rounds
	$movkey	16($key),$rndkey1
	xorps	$rndkey0,$inout0
	xorps	$rndkey0,$inout1
	$movkey	32($key),$rndkey0
	lea	32($key,$rounds),$key
	neg	%rax				# $rounds
	add	\$16,%rax
.L${dir}_loop2:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	$movkey	-16($key,%rax),$rndkey0
	jnz	.L${dir}_loop2
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	ret
.cfi_endproc
.size	_aesni_${dir}rypt2,.-_aesni_${dir}rypt2
___
}

# aesni_generate3($dir)
#
# Same construction as the 2x generator, widened to three blocks: appends
# _aesni_${dir}rypt3 to $code, operating on $inout0..$inout2. Every
# per-round step is simply repeated for the third register; key loading,
# index setup and loop structure are unchanged.
sub aesni_generate3 {
    my $dir=shift;
    # As already mentioned it takes in $key and $rounds, which are *not*
    # preserved. $inout[0-2] is cipher/clear text...
    $code.=<<___;
.type	_aesni_${dir}rypt3,\@abi-omnipotent
.align	16
_aesni_${dir}rypt3:
.cfi_startproc
	$movkey	($key),$rndkey0
	shl	\$4,$rounds
	$movkey	16($key),$rndkey1
	xorps	$rndkey0,$inout0
	xorps	$rndkey0,$inout1
	xorps	$rndkey0,$inout2
	$movkey	32($key),$rndkey0
	lea	32($key,$rounds),$key
	neg	%rax				# $rounds
	add	\$16,%rax
.L${dir}_loop3:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	aes${dir}	$rndkey0,$inout2
	$movkey	-16($key,%rax),$rndkey0
	jnz	.L${dir}_loop3
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	ret
.cfi_endproc
.size	_aesni_${dir}rypt3,.-_aesni_${dir}rypt3
___
}

# 4x interleave is implemented to improve small block performance,
# most notably [and naturally] 4 block by ~30%. One can argue that one
# should have implemented 5x as well, but improvement would be <20%,
# so it's not worth it...
#
# aesni_generate4($dir)
#
# Appends _aesni_${dir}rypt4 to $code, operating on $inout0..$inout3.
# Structure is the same as in the 2x/3x generators. The only extra
# ingredient is the `.byte 0x0f,0x1f,0x00` line in the prologue: those
# three bytes encode a 3-byte NOP, presumably inserted as padding ahead
# of the loop label -- TODO confirm the intent.
sub aesni_generate4 {
    my $dir=shift;
    # As already mentioned it takes in $key and $rounds, which are *not*
    # preserved. $inout[0-3] is cipher/clear text...
    $code.=<<___;
.type	_aesni_${dir}rypt4,\@abi-omnipotent
.align	16
_aesni_${dir}rypt4:
.cfi_startproc
	$movkey	($key),$rndkey0
	shl	\$4,$rounds
	$movkey	16($key),$rndkey1
	xorps	$rndkey0,$inout0
	xorps	$rndkey0,$inout1
	xorps	$rndkey0,$inout2
	xorps	$rndkey0,$inout3
	$movkey	32($key),$rndkey0
	lea	32($key,$rounds),$key
	neg	%rax				# $rounds
	.byte	0x0f,0x1f,0x00
	add	\$16,%rax
.L${dir}_loop4:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	aes${dir}	$rndkey0,$inout2
	aes${dir}	$rndkey0,$inout3
	$movkey	-16($key,%rax),$rndkey0
	jnz	.L${dir}_loop4
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	aes${dir}last	$rndkey0,$inout3
	ret
.cfi_endproc
.size	_aesni_${dir}rypt4,.-_aesni_${dir}rypt4
___
}

# aesni_generate6($dir)
#
# Appends _aesni_${dir}rypt6 to $code, operating on $inout0..$inout5.
# Differences from the narrower generators:
#   * the prologue interleaves the initial xor of the first round key with
#     the first aes${dir} round: $inout0..$inout2 already receive the
#     $rndkey1 round while $inout3..$inout5 are still being xor-ed;
#   * the third round key is fetched with the indexed form
#     ($key,%rax) after $key and %rax have been set up, instead of the
#     fixed 32($key) displacement used by the 2x/3x/4x variants;
#   * control then jumps into the middle of the loop body at
#     .L${dir}_loop6_enter, so the first pass applies $rndkey1 only to
#     $inout3..$inout5, i.e. to the blocks that have not had it yet.
# From the second half of the loop body onwards the code matches the
# generic two-rounds-per-pass scheme.
sub aesni_generate6 {
    my $dir=shift;
    # As already mentioned it takes in $key and $rounds, which are *not*
    # preserved. $inout[0-5] is cipher/clear text...
    $code.=<<___;
.type	_aesni_${dir}rypt6,\@abi-omnipotent
.align	16
_aesni_${dir}rypt6:
.cfi_startproc
	$movkey	($key),$rndkey0
	shl	\$4,$rounds
	$movkey	16($key),$rndkey1
	xorps	$rndkey0,$inout0
	pxor	$rndkey0,$inout1
	pxor	$rndkey0,$inout2
	aes${dir}	$rndkey1,$inout0
	lea	32($key,$rounds),$key
	neg	%rax				# $rounds
	aes${dir}	$rndkey1,$inout1
	pxor	$rndkey0,$inout3
	pxor	$rndkey0,$inout4
	aes${dir}	$rndkey1,$inout2
	pxor	$rndkey0,$inout5
	$movkey	($key,%rax),$rndkey0
	add	\$16,%rax
	jmp	.L${dir}_loop6_enter
.align	16
.L${dir}_loop6:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
.L${dir}_loop6_enter:
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	aes${dir}	$rndkey0,$inout2
	aes${dir}	$rndkey0,$inout3
	aes${dir}	$rndkey0,$inout4
	aes${dir}	$rndkey0,$inout5
	$movkey	-16($key,%rax),$rndkey0
	jnz	.L${dir}_loop6
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	aes${dir}last	$rndkey0,$inout3
	aes${dir}last	$rndkey0,$inout4
	aes${dir}last	$rndkey0,$inout5
	ret
.cfi_endproc
.size	_aesni_${dir}rypt6,.-_aesni_${dir}rypt6
___
}

# aesni_generate8($dir)
#
# Appends _aesni_${dir}rypt8 to $code, operating on $inout0..$inout7.
# Built like the 6x variant, with a different split in the prologue:
# only $inout0 and $inout1 receive the first $rndkey1 round while the
# remaining blocks are being xor-ed with the first round key, and the
# routine then jumps to .L${dir}_loop8_inner, where $inout2..$inout7
# catch up.
#
# The label .L${dir}_loop8_enter sits between the two half-passes of the
# loop but is not referenced inside this routine; it may serve as an
# entry point for code elsewhere in the file -- TODO confirm.
sub aesni_generate8 {
    my $dir=shift;
    # As already mentioned it takes in $key and $rounds, which are *not*
    # preserved. $inout[0-7] is cipher/clear text...
    $code.=<<___;
.type	_aesni_${dir}rypt8,\@abi-omnipotent
.align	16
_aesni_${dir}rypt8:
.cfi_startproc
	$movkey	($key),$rndkey0
	shl	\$4,$rounds
	$movkey	16($key),$rndkey1
	xorps	$rndkey0,$inout0
	xorps	$rndkey0,$inout1
	pxor	$rndkey0,$inout2
	pxor	$rndkey0,$inout3
	pxor	$rndkey0,$inout4
	lea	32($key,$rounds),$key
	neg	%rax				# $rounds
	aes${dir}	$rndkey1,$inout0
	pxor	$rndkey0,$inout5
	pxor	$rndkey0,$inout6
	aes${dir}	$rndkey1,$inout1
	pxor	$rndkey0,$inout7
	$movkey	($key,%rax),$rndkey0
	add	\$16,%rax
	jmp	.L${dir}_loop8_inner
.align	16
.L${dir}_loop8:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
.L${dir}_loop8_inner:
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
	aes${dir}	$rndkey1,$inout6
	aes${dir}	$rndkey1,$inout7
.L${dir}_loop8_enter:
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	aes${dir}	$rndkey0,$inout2
	aes${dir}	$rndkey0,$inout3
	aes${dir}	$rndkey0,$inout4
	aes${dir}	$rndkey0,$inout5
	aes${dir}	$rndkey0,$inout6
	aes${dir}	$rndkey0,$inout7
	$movkey	-16($key,%rax),$rndkey0
	jnz	.L${dir}_loop8
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
	aes${dir}	$rndkey1,$inout6
	aes${dir}	$rndkey1,$inout7
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	aes${dir}last	$rndkey0,$inout3
	aes${dir}last	$rndkey0,$inout4
	aes${dir}last	$rndkey0,$inout5
	aes${dir}last	$rndkey0,$inout6
	aes${dir}last	$rndkey0,$inout7
	ret
.cfi_endproc
.size	_aesni_${dir}rypt8,.-_aesni_${dir}rypt8
___
}

# Instantiate the "enc" flavour of every interleave factor. Each call is
# guarded separately: the routine is appended to $code only when $PREFIX
# is "aes_hw"; for any other prefix these statements emit nothing.
&aesni_generate2("enc") if ($PREFIX eq "aes_hw");
&aesni_generate3("enc") if ($PREFIX eq "aes_hw");
&aesni_generate4("enc") if ($PREFIX eq "aes_hw");
&aesni_generate6("enc") if ($PREFIX eq "aes_hw");
&aesni_generate8("enc") if ($PREFIX eq "aes_hw");