^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1) #!/usr/bin/env perl
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2) # SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4) # ====================================================================
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5) # Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6) # project.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7) # ====================================================================
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9) # This module implements Poly1305 hash for ARMv8.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 10) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 11) # June 2015
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 12) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 13) # Numbers are cycles per processed byte with poly1305_blocks alone.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 14) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 15) # IALU/gcc-4.9 NEON
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 16) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 17) # Apple A7 1.86/+5% 0.72
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 18) # Cortex-A53 2.69/+58% 1.47
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 19) # Cortex-A57 2.70/+7% 1.14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 20) # Denver 1.64/+50% 1.18(*)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 21) # X-Gene 2.13/+68% 2.27
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 22) # Mongoose 1.77/+75% 1.12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 23) # Kryo 2.70/+55% 1.13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 24) # ThunderX2 1.17/+95% 1.36
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 25) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 26) # (*) estimate based on resources availability is less than 1.0,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 27) # i.e. measured result is worse than expected, presumably binary
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 28) # translator is not almighty;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 29)
# Command line: <script> [flavour [output]]
#
#   flavour - perlasm flavour handed to arm-xlate.pl.  When it is empty or
#             "void" the generated text is written out untranslated.
#   output  - name of the file to write (optional).
#
# Both values are stored in package globals, exactly as before.
$flavour=shift;
$output=shift;

if ($flavour && $flavour ne "void") {
    # Look for the translator next to this script first, then in
    # ../../perlasm relative to it.
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    # Everything printed from here on is piped through the translator.
    # The command string is unchanged; the only addition is that a failed
    # pipe open now aborts instead of being ignored.
    open STDOUT,"| \"$^X\" $xlate $flavour $output"
        or die "can't call $xlate: $!";
} elsif (defined($output) && length($output)) {
    # The two-argument form is kept on purpose so the file name is
    # interpreted exactly as before; a destination that cannot be opened
    # is now a hard error rather than a silent fall-through.
    open STDOUT,">$output" or die "can't open $output: $!";
}
# Without an output name in the untranslated case STDOUT is left as
# inherited, so running the script with no arguments still prints the
# generated text to standard output.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 43)
# Symbolic names for the AArch64 general-purpose registers used by the
# integer code emitted below.  x0-x3 carry the incoming arguments
# (ctx, inp, len, padbit); poly1305_emit reads x1/x2 as mac/nonce,
# hence the two aliases.
my ($ctx, $inp, $len, $padbit) = qw(x0 x1 x2 x3);
my ($mac, $nonce) = ($inp, $len);

# x4-x14, handed out in order: three hash words, the two key words plus
# the r1 + (r1 >> 2) helper, two temporaries and three product words.
my ($h0, $h1, $h2, $r0, $r1, $s1, $t0, $t1, $d0, $d1, $d2)
    = map { sprintf "x%d", $_ } 4 .. 14;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 48)
# Integer-only code path: poly1305_init, poly1305_blocks and poly1305_emit.
# The heredoc is interpolated, so every $name below is replaced by the
# register string it was assigned earlier in this file.  The "w#$name"
# spelling expands to e.g. "w#x9"; presumably a later post-processing step
# (not visible in this part of the file) rewrites that to "w9" -- confirm.
#
# Context layout as used by these three routines:
#   ctx+0  .. ctx+23  hash value: three 64-bit words (base 2^64), or five
#                     32-bit words at ctx+0..ctx+19 holding 26-bit limbs
#   ctx+24            is_base2_26 (64-bit), non-zero when the hash is held
#                     in the five-limb form
#   ctx+32 .. ctx+47  clamped key words r0, r1
#   ctx+48            32-bit word; init stores -1 here as the "impossible
#                     key power value" marker
#
# poly1305_init(x0 = ctx, x1 = key, x2 = function table)
#   Zeroes ctx+0..ctx+31.  If key is NULL, returns 0 in x0 without touching
#   anything else.  Otherwise loads the 16-byte key, clamps it
#   (r0 &= 0x0ffffffc0fffffff, r1 &= 0x0ffffffc0ffffffc), stores it at
#   ctx+32, writes the -1 marker at ctx+48 and returns 1.
#   Unless __KERNEL__ is defined it also stores two code addresses at [x2]:
#   .Lpoly1305_blocks or .Lpoly1305_blocks_neon (chosen by testing
#   OPENSSL_armcap_P against ARMV7_NEON) followed by .Lpoly1305_emit;
#   under __ILP32__ they are stored as 32-bit values.
#
# poly1305_blocks(x0 = ctx, x1 = inp, x2 = len, x3 = padbit)
#   len is rounded down to a multiple of 16; nothing is done if that gives
#   0.  The hash is loaded and, when is_base2_26 is non-zero, converted
#   from 26-bit limbs to base 2^64.  For each 16-byte block:
#   h += block + (padbit << 128), then h = h * r with the bits from 2^130
#   upwards folded back in multiplied by 5 (partial reduction modulo
#   2^130-5).  On exit the hash is stored in base 2^64 and is_base2_26 is
#   cleared.
#
# poly1305_emit(x0 = ctx, x1 = mac, x2 = nonce)
#   Loads the hash (converting from 26-bit limbs if is_base2_26 is set),
#   computes h + 5 and, if that reaches bit 130, uses its low 128 bits in
#   place of h.  The 128-bit nonce is then added and the low 128 bits are
#   written to mac.  The context is not modified.
#
# The __AARCH64EB__ sections byte-swap loaded key/input words and the
# stored tag, and pick the 32-bit halves of the hash words in the opposite
# order when converting from 26-bit limbs.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 49) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 50) #ifndef __KERNEL__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 51) # include "arm_arch.h"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 52) .extern OPENSSL_armcap_P
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 53) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 54)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 55) .text
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 56)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 57) // forward "declarations" are required for Apple
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 58) .globl poly1305_blocks
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 59) .globl poly1305_emit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 60)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 61) .globl poly1305_init
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 62) .type poly1305_init,%function
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 63) .align 5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 64) poly1305_init:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 65) cmp $inp,xzr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 66) stp xzr,xzr,[$ctx] // zero hash value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 67) stp xzr,xzr,[$ctx,#16] // [along with is_base2_26]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 68)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 69) csel x0,xzr,x0,eq
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 70) b.eq .Lno_key
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 71)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 72) #ifndef __KERNEL__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 73) adrp x17,OPENSSL_armcap_P
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 74) ldr w17,[x17,#:lo12:OPENSSL_armcap_P]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 75) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 76)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 77) ldp $r0,$r1,[$inp] // load key
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 78) mov $s1,#0xfffffffc0fffffff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 79) movk $s1,#0x0fff,lsl#48
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 80) #ifdef __AARCH64EB__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 81) rev $r0,$r0 // flip bytes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 82) rev $r1,$r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 83) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 84) and $r0,$r0,$s1 // &=0ffffffc0fffffff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 85) and $s1,$s1,#-4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 86) and $r1,$r1,$s1 // &=0ffffffc0ffffffc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 87) mov w#$s1,#-1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 88) stp $r0,$r1,[$ctx,#32] // save key value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 89) str w#$s1,[$ctx,#48] // impossible key power value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 90)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 91) #ifndef __KERNEL__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 92) tst w17,#ARMV7_NEON
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 93)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 94) adr $d0,.Lpoly1305_blocks
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 95) adr $r0,.Lpoly1305_blocks_neon
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 96) adr $d1,.Lpoly1305_emit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 97)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 98) csel $d0,$d0,$r0,eq
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 99)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 100) # ifdef __ILP32__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 101) stp w#$d0,w#$d1,[$len]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 102) # else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 103) stp $d0,$d1,[$len]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 104) # endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 105) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 106) mov x0,#1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 107) .Lno_key:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 108) ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 109) .size poly1305_init,.-poly1305_init
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 110)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 111) .type poly1305_blocks,%function
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 112) .align 5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 113) poly1305_blocks:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 114) .Lpoly1305_blocks:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 115) ands $len,$len,#-16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 116) b.eq .Lno_data
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 117)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 118) ldp $h0,$h1,[$ctx] // load hash value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 119) ldp $h2,x17,[$ctx,#16] // [along with is_base2_26]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 120) ldp $r0,$r1,[$ctx,#32] // load key value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 121)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 122) #ifdef __AARCH64EB__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 123) lsr $d0,$h0,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 124) mov w#$d1,w#$h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 125) lsr $d2,$h1,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 126) mov w15,w#$h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 127) lsr x16,$h2,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 128) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 129) mov w#$d0,w#$h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 130) lsr $d1,$h0,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 131) mov w#$d2,w#$h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 132) lsr x15,$h1,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 133) mov w16,w#$h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 134) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 135)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 136) add $d0,$d0,$d1,lsl#26 // base 2^26 -> base 2^64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 137) lsr $d1,$d2,#12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 138) adds $d0,$d0,$d2,lsl#52
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 139) add $d1,$d1,x15,lsl#14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 140) adc $d1,$d1,xzr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 141) lsr $d2,x16,#24
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 142) adds $d1,$d1,x16,lsl#40
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 143) adc $d2,$d2,xzr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 144)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 145) cmp x17,#0 // is_base2_26?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 146) add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 147) csel $h0,$h0,$d0,eq // choose between radixes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 148) csel $h1,$h1,$d1,eq
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 149) csel $h2,$h2,$d2,eq
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 150)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 151) .Loop:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 152) ldp $t0,$t1,[$inp],#16 // load input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 153) sub $len,$len,#16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 154) #ifdef __AARCH64EB__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 155) rev $t0,$t0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 156) rev $t1,$t1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 157) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 158) adds $h0,$h0,$t0 // accumulate input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 159) adcs $h1,$h1,$t1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 160)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 161) mul $d0,$h0,$r0 // h0*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 162) adc $h2,$h2,$padbit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 163) umulh $d1,$h0,$r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 164)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 165) mul $t0,$h1,$s1 // h1*5*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 166) umulh $t1,$h1,$s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 167)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 168) adds $d0,$d0,$t0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 169) mul $t0,$h0,$r1 // h0*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 170) adc $d1,$d1,$t1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 171) umulh $d2,$h0,$r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 172)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 173) adds $d1,$d1,$t0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 174) mul $t0,$h1,$r0 // h1*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 175) adc $d2,$d2,xzr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 176) umulh $t1,$h1,$r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 177)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 178) adds $d1,$d1,$t0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 179) mul $t0,$h2,$s1 // h2*5*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 180) adc $d2,$d2,$t1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 181) mul $t1,$h2,$r0 // h2*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 182)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 183) adds $d1,$d1,$t0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 184) adc $d2,$d2,$t1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 185)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 186) and $t0,$d2,#-4 // final reduction
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 187) and $h2,$d2,#3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 188) add $t0,$t0,$d2,lsr#2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 189) adds $h0,$d0,$t0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 190) adcs $h1,$d1,xzr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 191) adc $h2,$h2,xzr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 192)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 193) cbnz $len,.Loop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 194)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 195) stp $h0,$h1,[$ctx] // store hash value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 196) stp $h2,xzr,[$ctx,#16] // [and clear is_base2_26]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 197)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 198) .Lno_data:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 199) ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 200) .size poly1305_blocks,.-poly1305_blocks
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 201)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 202) .type poly1305_emit,%function
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 203) .align 5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 204) poly1305_emit:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 205) .Lpoly1305_emit:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 206) ldp $h0,$h1,[$ctx] // load hash base 2^64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 207) ldp $h2,$r0,[$ctx,#16] // [along with is_base2_26]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 208) ldp $t0,$t1,[$nonce] // load nonce
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 209)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 210) #ifdef __AARCH64EB__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 211) lsr $d0,$h0,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 212) mov w#$d1,w#$h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 213) lsr $d2,$h1,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 214) mov w15,w#$h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 215) lsr x16,$h2,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 216) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 217) mov w#$d0,w#$h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 218) lsr $d1,$h0,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 219) mov w#$d2,w#$h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 220) lsr x15,$h1,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 221) mov w16,w#$h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 222) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 223)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 224) add $d0,$d0,$d1,lsl#26 // base 2^26 -> base 2^64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 225) lsr $d1,$d2,#12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 226) adds $d0,$d0,$d2,lsl#52
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 227) add $d1,$d1,x15,lsl#14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 228) adc $d1,$d1,xzr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 229) lsr $d2,x16,#24
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 230) adds $d1,$d1,x16,lsl#40
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 231) adc $d2,$d2,xzr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 232)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 233) cmp $r0,#0 // is_base2_26?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 234) csel $h0,$h0,$d0,eq // choose between radixes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 235) csel $h1,$h1,$d1,eq
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 236) csel $h2,$h2,$d2,eq
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 237)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 238) adds $d0,$h0,#5 // compare to modulus
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 239) adcs $d1,$h1,xzr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 240) adc $d2,$h2,xzr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 241)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 242) tst $d2,#-4 // see if it's carried/borrowed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 243)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 244) csel $h0,$h0,$d0,eq
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 245) csel $h1,$h1,$d1,eq
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 246)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 247) #ifdef __AARCH64EB__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 248) ror $t0,$t0,#32 // flip nonce words
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 249) ror $t1,$t1,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 250) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 251) adds $h0,$h0,$t0 // accumulate nonce
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 252) adc $h1,$h1,$t1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 253) #ifdef __AARCH64EB__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 254) rev $h0,$h0 // flip output bytes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 255) rev $h1,$h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 256) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 257) stp $h0,$h1,[$mac] // write result
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 258)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 259) ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 260) .size poly1305_emit,.-poly1305_emit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 261) ___
# Symbolic names for the vector registers used by the NEON code path.
# Each name already carries the arrangement suffix it is normally used
# with.
#
#   v0-v8   (.4s)  key table loaded from ctx+48: r0,r1,s1,r2,s2,r3,s3,r4,s4
#   v9-v13  (.2s)  input limbs, first block pair -- presumably inp[0:1];
#                  their use is outside the visible part of the file
#   v14-v18 (.2s)  input limbs of inp[2:3]
#   v19-v23 (.2d)  accumulators (use not visible here -- confirm)
#   v24-v28 (.2s)  the five 26-bit hash limbs
#   v29-v31        two temporaries and a mask, without a fixed arrangement
my ($R0, $R1, $S1, $R2, $S2, $R3, $S3, $R4, $S4)
    = map { sprintf "v%d.4s", $_ } 0 .. 8;
my ($IN01_0, $IN01_1, $IN01_2, $IN01_3, $IN01_4)
    = map { sprintf "v%d.2s", $_ } 9 .. 13;
my ($IN23_0, $IN23_1, $IN23_2, $IN23_3, $IN23_4)
    = map { sprintf "v%d.2s", $_ } 14 .. 18;
my ($ACC0, $ACC1, $ACC2, $ACC3, $ACC4)
    = map { sprintf "v%d.2d", $_ } 19 .. 23;
my ($H0, $H1, $H2, $H3, $H4)
    = map { sprintf "v%d.2s", $_ } 24 .. 28;
my ($T0, $T1, $MASK) = qw(v29 v30 v31);

# x16 is a second input pointer and x17 holds the address of .Lzeros.
# Before x17 is needed for that, the same register carries the
# is_base2_26 flag loaded from the context.
my $in2   = "x16";
my $zeros = "x17";
my $is_base2_26 = $zeros;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 271)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 272) $code.=<<___;
// poly1305_mult: h = h * r, partially reduced modulo 2^130-5.
//   in:      h0,h1,h2 = x4,x5,x6; r0,r1 = x7,x8; s1 = x9, which the
//            callers set to r1 + (r1 >> 2)
//   out:     h0,h1,h2 = x4,x5,x6
//   scratch: x10-x14 plus the condition flags
// The three product words are accumulated in x12-x14; everything in the
// top word from bit 2 upwards (2^130 and above) is folded back into the
// low word multiplied by 5.
// Not declared .globl here; the visible callers reach it with bl.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 273) .type poly1305_mult,%function
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 274) .align 5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 275) poly1305_mult:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 276) mul $d0,$h0,$r0 // h0*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 277) umulh $d1,$h0,$r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 278)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 279) mul $t0,$h1,$s1 // h1*5*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 280) umulh $t1,$h1,$s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 281)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 282) adds $d0,$d0,$t0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 283) mul $t0,$h0,$r1 // h0*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 284) adc $d1,$d1,$t1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 285) umulh $d2,$h0,$r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 286)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 287) adds $d1,$d1,$t0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 288) mul $t0,$h1,$r0 // h1*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 289) adc $d2,$d2,xzr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 290) umulh $t1,$h1,$r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 291)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 292) adds $d1,$d1,$t0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 293) mul $t0,$h2,$s1 // h2*5*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 294) adc $d2,$d2,$t1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 295) mul $t1,$h2,$r0 // h2*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 296)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 297) adds $d1,$d1,$t0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 298) adc $d2,$d2,$t1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 299)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 300) and $t0,$d2,#-4 // final reduction
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 301) and $h2,$d2,#3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 302) add $t0,$t0,$d2,lsr#2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 303) adds $h0,$d0,$t0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 304) adcs $h1,$d1,xzr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 305) adc $h2,$h2,xzr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 306)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 307) ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 308) .size poly1305_mult,.-poly1305_mult
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 309)
// poly1305_splat: split the value in x4,x5,x6 (base 2^64) into five
// base 2^26 limbs and store them, together with 5x multiples of limbs
// 1 to 4, as 32-bit words spaced 16 bytes apart from the address in x0:
//   +0 r0, +16 r1, +32 s1, +48 r2, +64 s2, +80 r3, +96 s3, +112 r4, +128 s4
// The top limb (x16) is produced by extr alone and is not masked to
// 26 bits.
//   scratch: x12-x16; x0 and x4-x6 are left unchanged
// The visible caller moves x0 by 4 bytes between calls, so successive
// calls fill successive 32-bit lanes of each 16-byte slot.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 310) .type poly1305_splat,%function
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 311) .align 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 312) poly1305_splat:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 313) and x12,$h0,#0x03ffffff // base 2^64 -> base 2^26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 314) ubfx x13,$h0,#26,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 315) extr x14,$h1,$h0,#52
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 316) and x14,x14,#0x03ffffff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 317) ubfx x15,$h1,#14,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 318) extr x16,$h2,$h1,#40
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 319)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 320) str w12,[$ctx,#16*0] // r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 321) add w12,w13,w13,lsl#2 // r1*5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 322) str w13,[$ctx,#16*1] // r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 323) add w13,w14,w14,lsl#2 // r2*5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 324) str w12,[$ctx,#16*2] // s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 325) str w14,[$ctx,#16*3] // r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 326) add w14,w15,w15,lsl#2 // r3*5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 327) str w13,[$ctx,#16*4] // s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 328) str w15,[$ctx,#16*5] // r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 329) add w15,w16,w16,lsl#2 // r4*5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 330) str w14,[$ctx,#16*6] // s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 331) str w16,[$ctx,#16*7] // r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 332) str w15,[$ctx,#16*8] // s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 333)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 334) ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 335) .size poly1305_splat,.-poly1305_splat
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 336)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 337) #ifdef __KERNEL__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 338) .globl poly1305_blocks_neon
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 339) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 340) .type poly1305_blocks_neon,%function
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 341) .align 5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 342) poly1305_blocks_neon:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 343) .Lpoly1305_blocks_neon:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 344) ldr $is_base2_26,[$ctx,#24]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 345) cmp $len,#128
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 346) b.lo .Lpoly1305_blocks
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 347)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 348) .inst 0xd503233f // paciasp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 349) stp x29,x30,[sp,#-80]!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 350) add x29,sp,#0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 351)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 352) stp d8,d9,[sp,#16] // meet ABI requirements
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 353) stp d10,d11,[sp,#32]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 354) stp d12,d13,[sp,#48]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 355) stp d14,d15,[sp,#64]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 356)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 357) cbz $is_base2_26,.Lbase2_64_neon
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 358)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 359) ldp w10,w11,[$ctx] // load hash value base 2^26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 360) ldp w12,w13,[$ctx,#8]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 361) ldr w14,[$ctx,#16]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 362)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 363) tst $len,#31
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 364) b.eq .Leven_neon
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 365)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 366) ldp $r0,$r1,[$ctx,#32] // load key value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 367)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 368) add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 369) lsr $h1,x12,#12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 370) adds $h0,$h0,x12,lsl#52
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 371) add $h1,$h1,x13,lsl#14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 372) adc $h1,$h1,xzr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 373) lsr $h2,x14,#24
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 374) adds $h1,$h1,x14,lsl#40
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 375) adc $d2,$h2,xzr // can be partially reduced...
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 376)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 377) ldp $d0,$d1,[$inp],#16 // load input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 378) sub $len,$len,#16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 379) add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 380)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 381) #ifdef __AARCH64EB__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 382) rev $d0,$d0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 383) rev $d1,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 384) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 385) adds $h0,$h0,$d0 // accumulate input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 386) adcs $h1,$h1,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 387) adc $h2,$h2,$padbit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 388)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 389) bl poly1305_mult
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 390)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 391) and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 392) ubfx x11,$h0,#26,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 393) extr x12,$h1,$h0,#52
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 394) and x12,x12,#0x03ffffff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 395) ubfx x13,$h1,#14,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 396) extr x14,$h2,$h1,#40
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 397)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 398) b .Leven_neon
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 399)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 400) .align 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 401) .Lbase2_64_neon:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 402) ldp $r0,$r1,[$ctx,#32] // load key value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 403)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 404) ldp $h0,$h1,[$ctx] // load hash value base 2^64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 405) ldr $h2,[$ctx,#16]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 406)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 407) tst $len,#31
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 408) b.eq .Linit_neon
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 409)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 410) ldp $d0,$d1,[$inp],#16 // load input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 411) sub $len,$len,#16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 412) add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 413) #ifdef __AARCH64EB__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 414) rev $d0,$d0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 415) rev $d1,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 416) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 417) adds $h0,$h0,$d0 // accumulate input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 418) adcs $h1,$h1,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 419) adc $h2,$h2,$padbit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 420)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 421) bl poly1305_mult
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 422)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 423) .Linit_neon:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 424) ldr w17,[$ctx,#48] // first table element
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 425) and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 426) ubfx x11,$h0,#26,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 427) extr x12,$h1,$h0,#52
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 428) and x12,x12,#0x03ffffff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 429) ubfx x13,$h1,#14,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 430) extr x14,$h2,$h1,#40
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 431)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 432) cmp w17,#-1 // is value impossible?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 433) b.ne .Leven_neon
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 434)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 435) fmov ${H0},x10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 436) fmov ${H1},x11
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 437) fmov ${H2},x12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 438) fmov ${H3},x13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 439) fmov ${H4},x14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 440)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 441) ////////////////////////////////// initialize r^n table
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 442) mov $h0,$r0 // r^1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 443) add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 444) mov $h1,$r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 445) mov $h2,xzr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 446) add $ctx,$ctx,#48+12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 447) bl poly1305_splat
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 448)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 449) bl poly1305_mult // r^2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 450) sub $ctx,$ctx,#4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 451) bl poly1305_splat
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 452)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 453) bl poly1305_mult // r^3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 454) sub $ctx,$ctx,#4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 455) bl poly1305_splat
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 456)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 457) bl poly1305_mult // r^4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 458) sub $ctx,$ctx,#4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 459) bl poly1305_splat
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 460) sub $ctx,$ctx,#48 // restore original $ctx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 461) b .Ldo_neon
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 462)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 463) .align 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 464) .Leven_neon:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 465) fmov ${H0},x10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 466) fmov ${H1},x11
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 467) fmov ${H2},x12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 468) fmov ${H3},x13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 469) fmov ${H4},x14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 470)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 471) .Ldo_neon:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 472) ldp x8,x12,[$inp,#32] // inp[2:3]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 473) subs $len,$len,#64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 474) ldp x9,x13,[$inp,#48]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 475) add $in2,$inp,#96
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 476) adr $zeros,.Lzeros
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 477)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 478) lsl $padbit,$padbit,#24
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 479) add x15,$ctx,#48
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 480)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 481) #ifdef __AARCH64EB__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 482) rev x8,x8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 483) rev x12,x12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 484) rev x9,x9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 485) rev x13,x13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 486) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 487) and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 488) and x5,x9,#0x03ffffff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 489) ubfx x6,x8,#26,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 490) ubfx x7,x9,#26,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 491) add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 492) extr x8,x12,x8,#52
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 493) extr x9,x13,x9,#52
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 494) add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 495) fmov $IN23_0,x4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 496) and x8,x8,#0x03ffffff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 497) and x9,x9,#0x03ffffff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 498) ubfx x10,x12,#14,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 499) ubfx x11,x13,#14,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 500) add x12,$padbit,x12,lsr#40
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 501) add x13,$padbit,x13,lsr#40
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 502) add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 503) fmov $IN23_1,x6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 504) add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 505) add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 506) fmov $IN23_2,x8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 507) fmov $IN23_3,x10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 508) fmov $IN23_4,x12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 509)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 510) ldp x8,x12,[$inp],#16 // inp[0:1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 511) ldp x9,x13,[$inp],#48
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 512)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 513) ld1 {$R0,$R1,$S1,$R2},[x15],#64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 514) ld1 {$S2,$R3,$S3,$R4},[x15],#64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 515) ld1 {$S4},[x15]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 516)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 517) #ifdef __AARCH64EB__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 518) rev x8,x8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 519) rev x12,x12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 520) rev x9,x9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 521) rev x13,x13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 522) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 523) and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 524) and x5,x9,#0x03ffffff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 525) ubfx x6,x8,#26,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 526) ubfx x7,x9,#26,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 527) add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 528) extr x8,x12,x8,#52
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 529) extr x9,x13,x9,#52
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 530) add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 531) fmov $IN01_0,x4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 532) and x8,x8,#0x03ffffff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 533) and x9,x9,#0x03ffffff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 534) ubfx x10,x12,#14,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 535) ubfx x11,x13,#14,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 536) add x12,$padbit,x12,lsr#40
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 537) add x13,$padbit,x13,lsr#40
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 538) add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 539) fmov $IN01_1,x6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 540) add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 541) add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 542) movi $MASK.2d,#-1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 543) fmov $IN01_2,x8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 544) fmov $IN01_3,x10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 545) fmov $IN01_4,x12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 546) ushr $MASK.2d,$MASK.2d,#38
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 547)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 548) b.ls .Lskip_loop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 549)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 550) .align 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 551) .Loop_neon:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 552) ////////////////////////////////////////////////////////////////
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 553) // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 554) // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 555) // \___________________/
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 556) // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 557) // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 558) // \___________________/ \____________________/
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 559) //
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 560) // Note that we start with inp[2:3]*r^2. This is because it
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 561) // doesn't depend on reduction in previous iteration.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 562) ////////////////////////////////////////////////////////////////
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 563) // d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 564) // d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 565) // d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 566) // d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 567) // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 568)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 569) subs $len,$len,#64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 570) umull $ACC4,$IN23_0,${R4}[2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 571) csel $in2,$zeros,$in2,lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 572) umull $ACC3,$IN23_0,${R3}[2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 573) umull $ACC2,$IN23_0,${R2}[2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 574) ldp x8,x12,[$in2],#16 // inp[2:3] (or zero)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 575) umull $ACC1,$IN23_0,${R1}[2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 576) ldp x9,x13,[$in2],#48
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 577) umull $ACC0,$IN23_0,${R0}[2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 578) #ifdef __AARCH64EB__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 579) rev x8,x8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 580) rev x12,x12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 581) rev x9,x9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 582) rev x13,x13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 583) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 584)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 585) umlal $ACC4,$IN23_1,${R3}[2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 586) and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 587) umlal $ACC3,$IN23_1,${R2}[2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 588) and x5,x9,#0x03ffffff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 589) umlal $ACC2,$IN23_1,${R1}[2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 590) ubfx x6,x8,#26,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 591) umlal $ACC1,$IN23_1,${R0}[2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 592) ubfx x7,x9,#26,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 593) umlal $ACC0,$IN23_1,${S4}[2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 594) add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 595)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 596) umlal $ACC4,$IN23_2,${R2}[2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 597) extr x8,x12,x8,#52
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 598) umlal $ACC3,$IN23_2,${R1}[2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 599) extr x9,x13,x9,#52
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 600) umlal $ACC2,$IN23_2,${R0}[2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 601) add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 602) umlal $ACC1,$IN23_2,${S4}[2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 603) fmov $IN23_0,x4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 604) umlal $ACC0,$IN23_2,${S3}[2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 605) and x8,x8,#0x03ffffff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 606)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 607) umlal $ACC4,$IN23_3,${R1}[2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 608) and x9,x9,#0x03ffffff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 609) umlal $ACC3,$IN23_3,${R0}[2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 610) ubfx x10,x12,#14,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 611) umlal $ACC2,$IN23_3,${S4}[2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 612) ubfx x11,x13,#14,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 613) umlal $ACC1,$IN23_3,${S3}[2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 614) add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 615) umlal $ACC0,$IN23_3,${S2}[2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 616) fmov $IN23_1,x6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 617)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 618) add $IN01_2,$IN01_2,$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 619) add x12,$padbit,x12,lsr#40
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 620) umlal $ACC4,$IN23_4,${R0}[2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 621) add x13,$padbit,x13,lsr#40
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 622) umlal $ACC3,$IN23_4,${S4}[2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 623) add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 624) umlal $ACC2,$IN23_4,${S3}[2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 625) add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 626) umlal $ACC1,$IN23_4,${S2}[2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 627) fmov $IN23_2,x8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 628) umlal $ACC0,$IN23_4,${S1}[2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 629) fmov $IN23_3,x10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 630)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 631) ////////////////////////////////////////////////////////////////
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 632) // (hash+inp[0:1])*r^4 and accumulate
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 633)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 634) add $IN01_0,$IN01_0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 635) fmov $IN23_4,x12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 636) umlal $ACC3,$IN01_2,${R1}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 637) ldp x8,x12,[$inp],#16 // inp[0:1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 638) umlal $ACC0,$IN01_2,${S3}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 639) ldp x9,x13,[$inp],#48
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 640) umlal $ACC4,$IN01_2,${R2}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 641) umlal $ACC1,$IN01_2,${S4}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 642) umlal $ACC2,$IN01_2,${R0}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 643) #ifdef __AARCH64EB__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 644) rev x8,x8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 645) rev x12,x12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 646) rev x9,x9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 647) rev x13,x13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 648) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 649)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 650) add $IN01_1,$IN01_1,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 651) umlal $ACC3,$IN01_0,${R3}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 652) umlal $ACC4,$IN01_0,${R4}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 653) and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 654) umlal $ACC2,$IN01_0,${R2}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 655) and x5,x9,#0x03ffffff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 656) umlal $ACC0,$IN01_0,${R0}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 657) ubfx x6,x8,#26,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 658) umlal $ACC1,$IN01_0,${R1}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 659) ubfx x7,x9,#26,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 660)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 661) add $IN01_3,$IN01_3,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 662) add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 663) umlal $ACC3,$IN01_1,${R2}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 664) extr x8,x12,x8,#52
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 665) umlal $ACC4,$IN01_1,${R3}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 666) extr x9,x13,x9,#52
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 667) umlal $ACC0,$IN01_1,${S4}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 668) add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 669) umlal $ACC2,$IN01_1,${R1}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 670) fmov $IN01_0,x4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 671) umlal $ACC1,$IN01_1,${R0}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 672) and x8,x8,#0x03ffffff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 673)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 674) add $IN01_4,$IN01_4,$H4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 675) and x9,x9,#0x03ffffff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 676) umlal $ACC3,$IN01_3,${R0}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 677) ubfx x10,x12,#14,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 678) umlal $ACC0,$IN01_3,${S2}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 679) ubfx x11,x13,#14,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 680) umlal $ACC4,$IN01_3,${R1}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 681) add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 682) umlal $ACC1,$IN01_3,${S3}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 683) fmov $IN01_1,x6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 684) umlal $ACC2,$IN01_3,${S4}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 685) add x12,$padbit,x12,lsr#40
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 686)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 687) umlal $ACC3,$IN01_4,${S4}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 688) add x13,$padbit,x13,lsr#40
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 689) umlal $ACC0,$IN01_4,${S1}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 690) add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 691) umlal $ACC4,$IN01_4,${R0}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 692) add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 693) umlal $ACC1,$IN01_4,${S2}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 694) fmov $IN01_2,x8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 695) umlal $ACC2,$IN01_4,${S3}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 696) fmov $IN01_3,x10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 697) fmov $IN01_4,x12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 698)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 699) /////////////////////////////////////////////////////////////////
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 700) // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 701) // and P. Schwabe
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 702) //
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 703) // [see discussion in poly1305-armv4 module]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 704)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 705) ushr $T0.2d,$ACC3,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 706) xtn $H3,$ACC3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 707) ushr $T1.2d,$ACC0,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 708) and $ACC0,$ACC0,$MASK.2d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 709) add $ACC4,$ACC4,$T0.2d // h3 -> h4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 710) bic $H3,#0xfc,lsl#24 // &=0x03ffffff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 711) add $ACC1,$ACC1,$T1.2d // h0 -> h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 712)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 713) ushr $T0.2d,$ACC4,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 714) xtn $H4,$ACC4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 715) ushr $T1.2d,$ACC1,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 716) xtn $H1,$ACC1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 717) bic $H4,#0xfc,lsl#24
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 718) add $ACC2,$ACC2,$T1.2d // h1 -> h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 719)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 720) add $ACC0,$ACC0,$T0.2d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 721) shl $T0.2d,$T0.2d,#2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 722) shrn $T1.2s,$ACC2,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 723) xtn $H2,$ACC2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 724) add $ACC0,$ACC0,$T0.2d // h4 -> h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 725) bic $H1,#0xfc,lsl#24
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 726) add $H3,$H3,$T1.2s // h2 -> h3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 727) bic $H2,#0xfc,lsl#24
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 728)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 729) shrn $T0.2s,$ACC0,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 730) xtn $H0,$ACC0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 731) ushr $T1.2s,$H3,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 732) bic $H3,#0xfc,lsl#24
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 733) bic $H0,#0xfc,lsl#24
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 734) add $H1,$H1,$T0.2s // h0 -> h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 735) add $H4,$H4,$T1.2s // h3 -> h4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 736)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 737) b.hi .Loop_neon
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 738)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 739) .Lskip_loop:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 740) dup $IN23_2,${IN23_2}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 741) add $IN01_2,$IN01_2,$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 742)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 743) ////////////////////////////////////////////////////////////////
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 744) // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 745)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 746) adds $len,$len,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 747) b.ne .Long_tail
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 748)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 749) dup $IN23_2,${IN01_2}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 750) add $IN23_0,$IN01_0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 751) add $IN23_3,$IN01_3,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 752) add $IN23_1,$IN01_1,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 753) add $IN23_4,$IN01_4,$H4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 754)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 755) .Long_tail:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 756) dup $IN23_0,${IN23_0}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 757) umull2 $ACC0,$IN23_2,${S3}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 758) umull2 $ACC3,$IN23_2,${R1}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 759) umull2 $ACC4,$IN23_2,${R2}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 760) umull2 $ACC2,$IN23_2,${R0}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 761) umull2 $ACC1,$IN23_2,${S4}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 762)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 763) dup $IN23_1,${IN23_1}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 764) umlal2 $ACC0,$IN23_0,${R0}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 765) umlal2 $ACC2,$IN23_0,${R2}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 766) umlal2 $ACC3,$IN23_0,${R3}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 767) umlal2 $ACC4,$IN23_0,${R4}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 768) umlal2 $ACC1,$IN23_0,${R1}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 769)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 770) dup $IN23_3,${IN23_3}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 771) umlal2 $ACC0,$IN23_1,${S4}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 772) umlal2 $ACC3,$IN23_1,${R2}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 773) umlal2 $ACC2,$IN23_1,${R1}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 774) umlal2 $ACC4,$IN23_1,${R3}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 775) umlal2 $ACC1,$IN23_1,${R0}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 776)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 777) dup $IN23_4,${IN23_4}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 778) umlal2 $ACC3,$IN23_3,${R0}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 779) umlal2 $ACC4,$IN23_3,${R1}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 780) umlal2 $ACC0,$IN23_3,${S2}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 781) umlal2 $ACC1,$IN23_3,${S3}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 782) umlal2 $ACC2,$IN23_3,${S4}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 783)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 784) umlal2 $ACC3,$IN23_4,${S4}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 785) umlal2 $ACC0,$IN23_4,${S1}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 786) umlal2 $ACC4,$IN23_4,${R0}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 787) umlal2 $ACC1,$IN23_4,${S2}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 788) umlal2 $ACC2,$IN23_4,${S3}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 789)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 790) b.eq .Lshort_tail
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 791)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 792) ////////////////////////////////////////////////////////////////
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 793) // (hash+inp[0:1])*r^4:r^3 and accumulate
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 794)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 795) add $IN01_0,$IN01_0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 796) umlal $ACC3,$IN01_2,${R1}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 797) umlal $ACC0,$IN01_2,${S3}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 798) umlal $ACC4,$IN01_2,${R2}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 799) umlal $ACC1,$IN01_2,${S4}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 800) umlal $ACC2,$IN01_2,${R0}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 801)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 802) add $IN01_1,$IN01_1,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 803) umlal $ACC3,$IN01_0,${R3}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 804) umlal $ACC0,$IN01_0,${R0}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 805) umlal $ACC4,$IN01_0,${R4}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 806) umlal $ACC1,$IN01_0,${R1}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 807) umlal $ACC2,$IN01_0,${R2}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 808)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 809) add $IN01_3,$IN01_3,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 810) umlal $ACC3,$IN01_1,${R2}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 811) umlal $ACC0,$IN01_1,${S4}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 812) umlal $ACC4,$IN01_1,${R3}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 813) umlal $ACC1,$IN01_1,${R0}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 814) umlal $ACC2,$IN01_1,${R1}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 815)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 816) add $IN01_4,$IN01_4,$H4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 817) umlal $ACC3,$IN01_3,${R0}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 818) umlal $ACC0,$IN01_3,${S2}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 819) umlal $ACC4,$IN01_3,${R1}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 820) umlal $ACC1,$IN01_3,${S3}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 821) umlal $ACC2,$IN01_3,${S4}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 822)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 823) umlal $ACC3,$IN01_4,${S4}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 824) umlal $ACC0,$IN01_4,${S1}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 825) umlal $ACC4,$IN01_4,${R0}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 826) umlal $ACC1,$IN01_4,${S2}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 827) umlal $ACC2,$IN01_4,${S3}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 828)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 829) .Lshort_tail:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 830) ////////////////////////////////////////////////////////////////
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 831) // horizontal add
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 832)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 833) addp $ACC3,$ACC3,$ACC3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 834) ldp d8,d9,[sp,#16] // meet ABI requirements
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 835) addp $ACC0,$ACC0,$ACC0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 836) ldp d10,d11,[sp,#32]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 837) addp $ACC4,$ACC4,$ACC4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 838) ldp d12,d13,[sp,#48]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 839) addp $ACC1,$ACC1,$ACC1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 840) ldp d14,d15,[sp,#64]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 841) addp $ACC2,$ACC2,$ACC2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 842) ldr x30,[sp,#8]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 843)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 844) ////////////////////////////////////////////////////////////////
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 845) // lazy reduction, but without narrowing
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 846)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 847) ushr $T0.2d,$ACC3,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 848) and $ACC3,$ACC3,$MASK.2d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 849) ushr $T1.2d,$ACC0,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 850) and $ACC0,$ACC0,$MASK.2d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 851)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 852) add $ACC4,$ACC4,$T0.2d // h3 -> h4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 853) add $ACC1,$ACC1,$T1.2d // h0 -> h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 854)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 855) ushr $T0.2d,$ACC4,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 856) and $ACC4,$ACC4,$MASK.2d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 857) ushr $T1.2d,$ACC1,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 858) and $ACC1,$ACC1,$MASK.2d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 859) add $ACC2,$ACC2,$T1.2d // h1 -> h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 860)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 861) add $ACC0,$ACC0,$T0.2d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 862) shl $T0.2d,$T0.2d,#2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 863) ushr $T1.2d,$ACC2,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 864) and $ACC2,$ACC2,$MASK.2d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 865) add $ACC0,$ACC0,$T0.2d // h4 -> h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 866) add $ACC3,$ACC3,$T1.2d // h2 -> h3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 867)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 868) ushr $T0.2d,$ACC0,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 869) and $ACC0,$ACC0,$MASK.2d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 870) ushr $T1.2d,$ACC3,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 871) and $ACC3,$ACC3,$MASK.2d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 872) add $ACC1,$ACC1,$T0.2d // h0 -> h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 873) add $ACC4,$ACC4,$T1.2d // h3 -> h4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 874)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 875) ////////////////////////////////////////////////////////////////
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 876) // write the result, can be partially reduced
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 877)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 878) st4 {$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 879) mov x4,#1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 880) st1 {$ACC4}[0],[$ctx]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 881) str x4,[$ctx,#8] // set is_base2_26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 882)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 883) ldr x29,[sp],#80
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 884) .inst 0xd50323bf // autiasp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 885) ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 886) .size poly1305_blocks_neon,.-poly1305_blocks_neon
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 887)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 888) .align 5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 889) .Lzeros:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 890) .long 0,0,0,0,0,0,0,0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 891) .asciz "Poly1305 for ARMv8, CRYPTOGAMS by \@dot-asm"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 892) .align 2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 893) #if !defined(__KERNEL__) && !defined(_WIN64)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 894) .comm OPENSSL_armcap_P,4,4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 895) .hidden OPENSSL_armcap_P
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 896) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 897) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 898)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 899) foreach (split("\n",$code)) {  # rewrite NEON operand suffixes in each generated line, then print it
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 900) s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/ or  # shrn: .2d/.4d on the first operand -> .2s
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 901) s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/ or  # fmov vN<suffix>,xM -> fmov dN,xM
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 902) (m/\bdup\b/ and (s/\.[24]s/.2d/g or 1)) or  # dup: .2s/.4s -> .2d; "or 1" ends the chain even when nothing was replaced
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 903) (m/\b(eor|and)/ and (s/\.[248][sdh]/.16b/g or 1)) or  # eor/and: any .[248][sdh] arrangement -> .16b
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 904) (m/\bum(ul|la)l\b/ and (s/\.4s/.2s/g or 1)) or  # umull/umlal: .4s -> .2s
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 905) (m/\bum(ul|la)l2\b/ and (s/\.2s/.4s/g or 1)) or  # umull2/umlal2: .2s -> .4s
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 906) (m/\bst[1-4]\s+{[^}]+}\[/ and (s/\.[24]d/.s/g or 1));  # lane-indexed st1..st4: .2d/.4d -> .s
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 907)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 908) s/\.[124]([sd])\[/.$1\[/;  # drop the element count before a lane index, e.g. ".2s[" -> ".s[" (first match only)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 909) s/w#x([0-9]+)/w$1/g;  # "w#xN" -> "wN"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 910)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 911) print $_,"\n";
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 912) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 913) close STDOUT or die "error closing STDOUT: $!";  # buffered write errors surface here; fail instead of leaving truncated output