Orange Pi5 kernel

Deprecated Linux kernel 5.10.110 for OrangePi 5/5B/5+ boards

3 Commits   0 Branches   0 Tags
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   1) #!/usr/bin/env perl
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   2) # SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   3) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   4) # ====================================================================
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   5) # Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   6) # project.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   7) # ====================================================================
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   8) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   9) # This module implements Poly1305 hash for ARMv8.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  10) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  11) # June 2015
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  12) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  13) # Numbers are cycles per processed byte with poly1305_blocks alone.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  14) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  15) #		IALU/gcc-4.9	NEON
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  16) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  17) # Apple A7	1.86/+5%	0.72
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  18) # Cortex-A53	2.69/+58%	1.47
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  19) # Cortex-A57	2.70/+7%	1.14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  20) # Denver	1.64/+50%	1.18(*)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  21) # X-Gene	2.13/+68%	2.27
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  22) # Mongoose	1.77/+75%	1.12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  23) # Kryo		2.70/+55%	1.13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  24) # ThunderX2	1.17/+95%	1.36
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  25) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  26) # (*)	the estimate based on resource availability is below 1.0,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  27) #	i.e. the measured result is worse than expected, presumably
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  28) #	because the binary translator is not almighty.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  29) 
# Command line: <flavour> <output>.
#
# With a flavour other than "void" the generated text is piped through
# arm-xlate.pl (looked up next to this script first, then in
# ../../perlasm/ relative to it), which is run with the current perl ($^X)
# and receives <flavour> and <output> as its arguments.  Otherwise the text
# is written straight to <output>.
#
# Both redirections die on failure: an unchecked failed open would leave
# STDOUT unusable and the generated code would be lost without any message.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  30) $flavour=shift;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  31) $output=shift;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  32) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  33) if ($flavour && $flavour ne "void") {
    # $dir becomes the directory part of $0, trailing separator included.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  34)     $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  35)     ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  36)     ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  37)     die "can't locate arm-xlate.pl";
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  38) 
    # The 2-arg pipe form is kept on purpose: the command line is built as a
    # single shell string exactly as before, only the result is now checked.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  39)     open STDOUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  40) } else {
    # Kept as 2-arg open: a 3-arg open with an undefined $output would
    # silently open an anonymous temporary file instead of failing.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  41)     open STDOUT,">$output" or die "can't open $output: $!";
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  42) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  43) 
# Names for x0..x3 as used by the emitted routines: context pointer, input
# pointer, byte length and pad bit, in that order.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  44) my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3));
# Aliases for the same x1/x2: poly1305_emit stores its result through $mac
# and loads the nonce through $nonce.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  45) my ($mac,$nonce)=($inp,$len);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  46) 
# Names for x4..x14 in the scalar code: hash limbs h0-h2, key halves r0/r1,
# s1 = r1 + (r1 >> 2), temporaries t0/t1 and the product words d0-d2.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  47) my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  48) 
# Scalar (integer-only) code path, appended to $code as one block of
# assembly text:
#
#   poly1305_init    zeroes the 32 bytes at [$ctx]; returns 0 in x0 when
#                    $inp (the key pointer) is zero, otherwise masks the key
#                    with 0x0ffffffc0fffffff / 0x0ffffffc0ffffffc, stores it
#                    at [$ctx,#32], writes the 32-bit marker -1 at
#                    [$ctx,#48] and returns 1.  Only when __KERNEL__ is not
#                    defined it also tests OPENSSL_armcap_P for ARMV7_NEON
#                    and stores the addresses of the selected blocks routine
#                    and of the emit routine through $len (x2).
#   poly1305_blocks  processes $len rounded down to a multiple of 16 bytes;
#                    uses the base 2^26 -> base 2^64 conversion of the hash
#                    when the word at [$ctx,#24] (is_base2_26) is non-zero,
#                    adds $padbit to the top limb for every block, and
#                    stores the hash back with is_base2_26 cleared.
#   poly1305_emit    loads the hash in either radix, compares it against the
#                    modulus by adding 5, adds the nonce read through $nonce
#                    and writes 16 bytes through $mac.
#
# Operands written as "w#$reg" are not valid assembly as they stand;
# presumably a later pass of this script rewrites them (that code is not part
# of this block - TODO confirm).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  49) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  50) #ifndef __KERNEL__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  51) # include "arm_arch.h"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  52) .extern	OPENSSL_armcap_P
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  53) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  54) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  55) .text
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  56) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  57) // forward "declarations" are required for Apple
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  58) .globl	poly1305_blocks
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  59) .globl	poly1305_emit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  60) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  61) .globl	poly1305_init
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  62) .type	poly1305_init,%function
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  63) .align	5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  64) poly1305_init:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  65) 	cmp	$inp,xzr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  66) 	stp	xzr,xzr,[$ctx]		// zero hash value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  67) 	stp	xzr,xzr,[$ctx,#16]	// [along with is_base2_26]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  68) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  69) 	csel	x0,xzr,x0,eq
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  70) 	b.eq	.Lno_key
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  71) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  72) #ifndef	__KERNEL__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  73) 	adrp	x17,OPENSSL_armcap_P
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  74) 	ldr	w17,[x17,#:lo12:OPENSSL_armcap_P]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  75) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  76) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  77) 	ldp	$r0,$r1,[$inp]		// load key
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  78) 	mov	$s1,#0xfffffffc0fffffff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  79) 	movk	$s1,#0x0fff,lsl#48
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  80) #ifdef	__AARCH64EB__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  81) 	rev	$r0,$r0			// flip bytes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  82) 	rev	$r1,$r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  83) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  84) 	and	$r0,$r0,$s1		// &=0ffffffc0fffffff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  85) 	and	$s1,$s1,#-4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  86) 	and	$r1,$r1,$s1		// &=0ffffffc0ffffffc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  87) 	mov	w#$s1,#-1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  88) 	stp	$r0,$r1,[$ctx,#32]	// save key value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  89) 	str	w#$s1,[$ctx,#48]	// impossible key power value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  90) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  91) #ifndef	__KERNEL__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  92) 	tst	w17,#ARMV7_NEON
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  93) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  94) 	adr	$d0,.Lpoly1305_blocks
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  95) 	adr	$r0,.Lpoly1305_blocks_neon
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  96) 	adr	$d1,.Lpoly1305_emit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  97) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  98) 	csel	$d0,$d0,$r0,eq
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  99) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 100) # ifdef	__ILP32__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 101) 	stp	w#$d0,w#$d1,[$len]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 102) # else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 103) 	stp	$d0,$d1,[$len]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 104) # endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 105) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 106) 	mov	x0,#1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 107) .Lno_key:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 108) 	ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 109) .size	poly1305_init,.-poly1305_init
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 110) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 111) .type	poly1305_blocks,%function
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 112) .align	5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 113) poly1305_blocks:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 114) .Lpoly1305_blocks:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 115) 	ands	$len,$len,#-16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 116) 	b.eq	.Lno_data
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 117) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 118) 	ldp	$h0,$h1,[$ctx]		// load hash value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 119) 	ldp	$h2,x17,[$ctx,#16]	// [along with is_base2_26]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 120) 	ldp	$r0,$r1,[$ctx,#32]	// load key value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 121) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 122) #ifdef	__AARCH64EB__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 123) 	lsr	$d0,$h0,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 124) 	mov	w#$d1,w#$h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 125) 	lsr	$d2,$h1,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 126) 	mov	w15,w#$h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 127) 	lsr	x16,$h2,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 128) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 129) 	mov	w#$d0,w#$h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 130) 	lsr	$d1,$h0,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 131) 	mov	w#$d2,w#$h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 132) 	lsr	x15,$h1,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 133) 	mov	w16,w#$h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 134) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 135) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 136) 	add	$d0,$d0,$d1,lsl#26	// base 2^26 -> base 2^64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 137) 	lsr	$d1,$d2,#12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 138) 	adds	$d0,$d0,$d2,lsl#52
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 139) 	add	$d1,$d1,x15,lsl#14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 140) 	adc	$d1,$d1,xzr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 141) 	lsr	$d2,x16,#24
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 142) 	adds	$d1,$d1,x16,lsl#40
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 143) 	adc	$d2,$d2,xzr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 144) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 145) 	cmp	x17,#0			// is_base2_26?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 146) 	add	$s1,$r1,$r1,lsr#2	// s1 = r1 + (r1 >> 2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 147) 	csel	$h0,$h0,$d0,eq		// choose between radixes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 148) 	csel	$h1,$h1,$d1,eq
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 149) 	csel	$h2,$h2,$d2,eq
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 150) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 151) .Loop:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 152) 	ldp	$t0,$t1,[$inp],#16	// load input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 153) 	sub	$len,$len,#16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 154) #ifdef	__AARCH64EB__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 155) 	rev	$t0,$t0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 156) 	rev	$t1,$t1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 157) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 158) 	adds	$h0,$h0,$t0		// accumulate input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 159) 	adcs	$h1,$h1,$t1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 160) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 161) 	mul	$d0,$h0,$r0		// h0*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 162) 	adc	$h2,$h2,$padbit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 163) 	umulh	$d1,$h0,$r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 164) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 165) 	mul	$t0,$h1,$s1		// h1*5*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 166) 	umulh	$t1,$h1,$s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 167) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 168) 	adds	$d0,$d0,$t0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 169) 	mul	$t0,$h0,$r1		// h0*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 170) 	adc	$d1,$d1,$t1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 171) 	umulh	$d2,$h0,$r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 172) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 173) 	adds	$d1,$d1,$t0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 174) 	mul	$t0,$h1,$r0		// h1*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 175) 	adc	$d2,$d2,xzr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 176) 	umulh	$t1,$h1,$r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 177) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 178) 	adds	$d1,$d1,$t0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 179) 	mul	$t0,$h2,$s1		// h2*5*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 180) 	adc	$d2,$d2,$t1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 181) 	mul	$t1,$h2,$r0		// h2*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 182) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 183) 	adds	$d1,$d1,$t0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 184) 	adc	$d2,$d2,$t1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 185) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 186) 	and	$t0,$d2,#-4		// final reduction
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 187) 	and	$h2,$d2,#3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 188) 	add	$t0,$t0,$d2,lsr#2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 189) 	adds	$h0,$d0,$t0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 190) 	adcs	$h1,$d1,xzr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 191) 	adc	$h2,$h2,xzr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 192) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 193) 	cbnz	$len,.Loop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 194) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 195) 	stp	$h0,$h1,[$ctx]		// store hash value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 196) 	stp	$h2,xzr,[$ctx,#16]	// [and clear is_base2_26]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 197) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 198) .Lno_data:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 199) 	ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 200) .size	poly1305_blocks,.-poly1305_blocks
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 201) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 202) .type	poly1305_emit,%function
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 203) .align	5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 204) poly1305_emit:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 205) .Lpoly1305_emit:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 206) 	ldp	$h0,$h1,[$ctx]		// load hash base 2^64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 207) 	ldp	$h2,$r0,[$ctx,#16]	// [along with is_base2_26]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 208) 	ldp	$t0,$t1,[$nonce]	// load nonce
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 209) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 210) #ifdef	__AARCH64EB__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 211) 	lsr	$d0,$h0,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 212) 	mov	w#$d1,w#$h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 213) 	lsr	$d2,$h1,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 214) 	mov	w15,w#$h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 215) 	lsr	x16,$h2,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 216) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 217) 	mov	w#$d0,w#$h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 218) 	lsr	$d1,$h0,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 219) 	mov	w#$d2,w#$h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 220) 	lsr	x15,$h1,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 221) 	mov	w16,w#$h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 222) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 223) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 224) 	add	$d0,$d0,$d1,lsl#26	// base 2^26 -> base 2^64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 225) 	lsr	$d1,$d2,#12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 226) 	adds	$d0,$d0,$d2,lsl#52
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 227) 	add	$d1,$d1,x15,lsl#14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 228) 	adc	$d1,$d1,xzr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 229) 	lsr	$d2,x16,#24
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 230) 	adds	$d1,$d1,x16,lsl#40
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 231) 	adc	$d2,$d2,xzr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 232) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 233) 	cmp	$r0,#0			// is_base2_26?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 234) 	csel	$h0,$h0,$d0,eq		// choose between radixes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 235) 	csel	$h1,$h1,$d1,eq
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 236) 	csel	$h2,$h2,$d2,eq
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 237) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 238) 	adds	$d0,$h0,#5		// compare to modulus
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 239) 	adcs	$d1,$h1,xzr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 240) 	adc	$d2,$h2,xzr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 241) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 242) 	tst	$d2,#-4			// see if it's carried/borrowed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 243) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 244) 	csel	$h0,$h0,$d0,eq
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 245) 	csel	$h1,$h1,$d1,eq
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 246) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 247) #ifdef	__AARCH64EB__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 248) 	ror	$t0,$t0,#32		// flip nonce words
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 249) 	ror	$t1,$t1,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 250) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 251) 	adds	$h0,$h0,$t0		// accumulate nonce
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 252) 	adc	$h1,$h1,$t1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 253) #ifdef	__AARCH64EB__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 254) 	rev	$h0,$h0			// flip output bytes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 255) 	rev	$h1,$h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 256) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 257) 	stp	$h0,$h1,[$mac]		// write result
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 258) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 259) 	ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 260) .size	poly1305_emit,.-poly1305_emit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 261) ___
# Vector register names for the NEON code path, each with the arrangement
# suffix it is declared with:
#   v0-v8   as .4s  $R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4
#                   (presumably key-power limbs and their 5x multiples,
#                   matching the r/s slots poly1305_splat stores - TODO confirm)
#   v9-v13  as .2s  $IN01_0..$IN01_4
#   v14-v18 as .2s  $IN23_0..$IN23_4  (receive the base 2^26 limbs of inp[2:3])
#   v19-v23 as .2d  $ACC0..$ACC4
#   v24-v28 as .2s  $H0..$H4          (receive the hash limbs from x10..x14)
#   v29-v31 bare    $T0,$T1,$MASK     (no arrangement suffix attached here)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 262) my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("v$_.4s",(0..8));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 263) my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 264) my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) = map("v$_.2s",(14..18));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 265) my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map("v$_.2d",(19..23));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 266) my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..28));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 267) my ($T0,$T1,$MASK) = map("v$_",(29..31));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 268) 
# x16 ($in2) is set to $inp+96 and x17 ($zeros) to the address of .Lzeros
# at the start of the NEON loop set-up.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 269) my ($in2,$zeros)=("x16","x17");
# Before x17 is needed as $zeros, the same register holds the is_base2_26
# word loaded from [$ctx,#24] on entry to poly1305_blocks_neon.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 270) my $is_base2_26 = $zeros;		# borrow
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 271) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 272) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 273) .type	poly1305_mult,%function
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 274) .align	5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 275) poly1305_mult:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 276) 	mul	$d0,$h0,$r0		// h0*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 277) 	umulh	$d1,$h0,$r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 278) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 279) 	mul	$t0,$h1,$s1		// h1*5*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 280) 	umulh	$t1,$h1,$s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 281) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 282) 	adds	$d0,$d0,$t0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 283) 	mul	$t0,$h0,$r1		// h0*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 284) 	adc	$d1,$d1,$t1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 285) 	umulh	$d2,$h0,$r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 286) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 287) 	adds	$d1,$d1,$t0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 288) 	mul	$t0,$h1,$r0		// h1*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 289) 	adc	$d2,$d2,xzr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 290) 	umulh	$t1,$h1,$r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 291) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 292) 	adds	$d1,$d1,$t0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 293) 	mul	$t0,$h2,$s1		// h2*5*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 294) 	adc	$d2,$d2,$t1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 295) 	mul	$t1,$h2,$r0		// h2*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 296) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 297) 	adds	$d1,$d1,$t0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 298) 	adc	$d2,$d2,$t1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 299) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 300) 	and	$t0,$d2,#-4		// final reduction
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 301) 	and	$h2,$d2,#3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 302) 	add	$t0,$t0,$d2,lsr#2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 303) 	adds	$h0,$d0,$t0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 304) 	adcs	$h1,$d1,xzr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 305) 	adc	$h2,$h2,xzr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 306) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 307) 	ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 308) .size	poly1305_mult,.-poly1305_mult
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 309) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 310) .type	poly1305_splat,%function
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 311) .align	4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 312) poly1305_splat:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 313) 	and	x12,$h0,#0x03ffffff	// base 2^64 -> base 2^26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 314) 	ubfx	x13,$h0,#26,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 315) 	extr	x14,$h1,$h0,#52
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 316) 	and	x14,x14,#0x03ffffff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 317) 	ubfx	x15,$h1,#14,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 318) 	extr	x16,$h2,$h1,#40
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 319) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 320) 	str	w12,[$ctx,#16*0]	// r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 321) 	add	w12,w13,w13,lsl#2	// r1*5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 322) 	str	w13,[$ctx,#16*1]	// r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 323) 	add	w13,w14,w14,lsl#2	// r2*5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 324) 	str	w12,[$ctx,#16*2]	// s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 325) 	str	w14,[$ctx,#16*3]	// r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 326) 	add	w14,w15,w15,lsl#2	// r3*5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 327) 	str	w13,[$ctx,#16*4]	// s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 328) 	str	w15,[$ctx,#16*5]	// r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 329) 	add	w15,w16,w16,lsl#2	// r4*5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 330) 	str	w14,[$ctx,#16*6]	// s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 331) 	str	w16,[$ctx,#16*7]	// r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 332) 	str	w15,[$ctx,#16*8]	// s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 333) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 334) 	ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 335) .size	poly1305_splat,.-poly1305_splat
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 336) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 337) #ifdef	__KERNEL__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 338) .globl	poly1305_blocks_neon
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 339) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 340) .type	poly1305_blocks_neon,%function
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 341) .align	5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 342) poly1305_blocks_neon:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 343) .Lpoly1305_blocks_neon:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 344) 	ldr	$is_base2_26,[$ctx,#24]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 345) 	cmp	$len,#128
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 346) 	b.lo	.Lpoly1305_blocks
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 347) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 348) 	.inst	0xd503233f		// paciasp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 349) 	stp	x29,x30,[sp,#-80]!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 350) 	add	x29,sp,#0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 351) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 352) 	stp	d8,d9,[sp,#16]		// meet ABI requirements
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 353) 	stp	d10,d11,[sp,#32]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 354) 	stp	d12,d13,[sp,#48]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 355) 	stp	d14,d15,[sp,#64]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 356) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 357) 	cbz	$is_base2_26,.Lbase2_64_neon
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 358) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 359) 	ldp	w10,w11,[$ctx]		// load hash value base 2^26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 360) 	ldp	w12,w13,[$ctx,#8]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 361) 	ldr	w14,[$ctx,#16]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 362) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 363) 	tst	$len,#31
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 364) 	b.eq	.Leven_neon
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 365) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 366) 	ldp	$r0,$r1,[$ctx,#32]	// load key value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 367) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 368) 	add	$h0,x10,x11,lsl#26	// base 2^26 -> base 2^64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 369) 	lsr	$h1,x12,#12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 370) 	adds	$h0,$h0,x12,lsl#52
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 371) 	add	$h1,$h1,x13,lsl#14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 372) 	adc	$h1,$h1,xzr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 373) 	lsr	$h2,x14,#24
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 374) 	adds	$h1,$h1,x14,lsl#40
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 375) 	adc	$d2,$h2,xzr		// can be partially reduced...
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 376) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 377) 	ldp	$d0,$d1,[$inp],#16	// load input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 378) 	sub	$len,$len,#16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 379) 	add	$s1,$r1,$r1,lsr#2	// s1 = r1 + (r1 >> 2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 380) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 381) #ifdef	__AARCH64EB__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 382) 	rev	$d0,$d0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 383) 	rev	$d1,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 384) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 385) 	adds	$h0,$h0,$d0		// accumulate input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 386) 	adcs	$h1,$h1,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 387) 	adc	$h2,$h2,$padbit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 388) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 389) 	bl	poly1305_mult
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 390) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 391) 	and	x10,$h0,#0x03ffffff	// base 2^64 -> base 2^26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 392) 	ubfx	x11,$h0,#26,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 393) 	extr	x12,$h1,$h0,#52
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 394) 	and	x12,x12,#0x03ffffff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 395) 	ubfx	x13,$h1,#14,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 396) 	extr	x14,$h2,$h1,#40
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 397) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 398) 	b	.Leven_neon
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 399) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 400) .align	4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 401) .Lbase2_64_neon:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 402) 	ldp	$r0,$r1,[$ctx,#32]	// load key value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 403) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 404) 	ldp	$h0,$h1,[$ctx]		// load hash value base 2^64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 405) 	ldr	$h2,[$ctx,#16]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 406) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 407) 	tst	$len,#31
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 408) 	b.eq	.Linit_neon
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 409) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 410) 	ldp	$d0,$d1,[$inp],#16	// load input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 411) 	sub	$len,$len,#16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 412) 	add	$s1,$r1,$r1,lsr#2	// s1 = r1 + (r1 >> 2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 413) #ifdef	__AARCH64EB__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 414) 	rev	$d0,$d0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 415) 	rev	$d1,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 416) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 417) 	adds	$h0,$h0,$d0		// accumulate input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 418) 	adcs	$h1,$h1,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 419) 	adc	$h2,$h2,$padbit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 420) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 421) 	bl	poly1305_mult
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 422) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 423) .Linit_neon:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 424) 	ldr	w17,[$ctx,#48]		// first table element
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 425) 	and	x10,$h0,#0x03ffffff	// base 2^64 -> base 2^26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 426) 	ubfx	x11,$h0,#26,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 427) 	extr	x12,$h1,$h0,#52
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 428) 	and	x12,x12,#0x03ffffff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 429) 	ubfx	x13,$h1,#14,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 430) 	extr	x14,$h2,$h1,#40
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 431) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 432) 	cmp	w17,#-1			// is value impossible?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 433) 	b.ne	.Leven_neon
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 434) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 435) 	fmov	${H0},x10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 436) 	fmov	${H1},x11
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 437) 	fmov	${H2},x12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 438) 	fmov	${H3},x13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 439) 	fmov	${H4},x14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 440) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 441) 	////////////////////////////////// initialize r^n table
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 442) 	mov	$h0,$r0			// r^1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 443) 	add	$s1,$r1,$r1,lsr#2	// s1 = r1 + (r1 >> 2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 444) 	mov	$h1,$r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 445) 	mov	$h2,xzr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 446) 	add	$ctx,$ctx,#48+12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 447) 	bl	poly1305_splat
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 448) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 449) 	bl	poly1305_mult		// r^2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 450) 	sub	$ctx,$ctx,#4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 451) 	bl	poly1305_splat
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 452) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 453) 	bl	poly1305_mult		// r^3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 454) 	sub	$ctx,$ctx,#4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 455) 	bl	poly1305_splat
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 456) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 457) 	bl	poly1305_mult		// r^4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 458) 	sub	$ctx,$ctx,#4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 459) 	bl	poly1305_splat
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 460) 	sub	$ctx,$ctx,#48		// restore original $ctx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 461) 	b	.Ldo_neon
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 462) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 463) .align	4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 464) .Leven_neon:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 465) 	fmov	${H0},x10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 466) 	fmov	${H1},x11
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 467) 	fmov	${H2},x12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 468) 	fmov	${H3},x13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 469) 	fmov	${H4},x14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 470) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 471) .Ldo_neon:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 472) 	ldp	x8,x12,[$inp,#32]	// inp[2:3]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 473) 	subs	$len,$len,#64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 474) 	ldp	x9,x13,[$inp,#48]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 475) 	add	$in2,$inp,#96
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 476) 	adr	$zeros,.Lzeros
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 477) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 478) 	lsl	$padbit,$padbit,#24
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 479) 	add	x15,$ctx,#48
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 480) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 481) #ifdef	__AARCH64EB__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 482) 	rev	x8,x8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 483) 	rev	x12,x12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 484) 	rev	x9,x9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 485) 	rev	x13,x13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 486) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 487) 	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 488) 	and	x5,x9,#0x03ffffff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 489) 	ubfx	x6,x8,#26,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 490) 	ubfx	x7,x9,#26,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 491) 	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 492) 	extr	x8,x12,x8,#52
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 493) 	extr	x9,x13,x9,#52
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 494) 	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 495) 	fmov	$IN23_0,x4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 496) 	and	x8,x8,#0x03ffffff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 497) 	and	x9,x9,#0x03ffffff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 498) 	ubfx	x10,x12,#14,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 499) 	ubfx	x11,x13,#14,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 500) 	add	x12,$padbit,x12,lsr#40
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 501) 	add	x13,$padbit,x13,lsr#40
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 502) 	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 503) 	fmov	$IN23_1,x6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 504) 	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 505) 	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 506) 	fmov	$IN23_2,x8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 507) 	fmov	$IN23_3,x10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 508) 	fmov	$IN23_4,x12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 509) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 510) 	ldp	x8,x12,[$inp],#16	// inp[0:1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 511) 	ldp	x9,x13,[$inp],#48
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 512) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 513) 	ld1	{$R0,$R1,$S1,$R2},[x15],#64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 514) 	ld1	{$S2,$R3,$S3,$R4},[x15],#64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 515) 	ld1	{$S4},[x15]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 516) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 517) #ifdef	__AARCH64EB__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 518) 	rev	x8,x8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 519) 	rev	x12,x12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 520) 	rev	x9,x9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 521) 	rev	x13,x13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 522) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 523) 	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 524) 	and	x5,x9,#0x03ffffff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 525) 	ubfx	x6,x8,#26,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 526) 	ubfx	x7,x9,#26,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 527) 	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 528) 	extr	x8,x12,x8,#52
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 529) 	extr	x9,x13,x9,#52
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 530) 	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 531) 	fmov	$IN01_0,x4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 532) 	and	x8,x8,#0x03ffffff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 533) 	and	x9,x9,#0x03ffffff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 534) 	ubfx	x10,x12,#14,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 535) 	ubfx	x11,x13,#14,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 536) 	add	x12,$padbit,x12,lsr#40
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 537) 	add	x13,$padbit,x13,lsr#40
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 538) 	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 539) 	fmov	$IN01_1,x6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 540) 	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 541) 	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 542) 	movi	$MASK.2d,#-1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 543) 	fmov	$IN01_2,x8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 544) 	fmov	$IN01_3,x10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 545) 	fmov	$IN01_4,x12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 546) 	ushr	$MASK.2d,$MASK.2d,#38
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 547) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 548) 	b.ls	.Lskip_loop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 549) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 550) .align	4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 551) .Loop_neon:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 552) 	////////////////////////////////////////////////////////////////
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 553) 	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 554) 	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 555) 	//   \___________________/
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 556) 	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 557) 	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 558) 	//   \___________________/ \____________________/
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 559) 	//
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 560) 	// Note that we start with inp[2:3]*r^2. This is because it
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 561) 	// doesn't depend on reduction in previous iteration.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 562) 	////////////////////////////////////////////////////////////////
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 563) 	// d4 = h0*r4 + h1*r3   + h2*r2   + h3*r1   + h4*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 564) 	// d3 = h0*r3 + h1*r2   + h2*r1   + h3*r0   + h4*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 565) 	// d2 = h0*r2 + h1*r1   + h2*r0   + h3*5*r4 + h4*5*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 566) 	// d1 = h0*r1 + h1*r0   + h2*5*r4 + h3*5*r3 + h4*5*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 567) 	// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 568) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 569) 	subs	$len,$len,#64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 570) 	umull	$ACC4,$IN23_0,${R4}[2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 571) 	csel	$in2,$zeros,$in2,lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 572) 	umull	$ACC3,$IN23_0,${R3}[2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 573) 	umull	$ACC2,$IN23_0,${R2}[2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 574) 	 ldp	x8,x12,[$in2],#16	// inp[2:3] (or zero)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 575) 	umull	$ACC1,$IN23_0,${R1}[2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 576) 	 ldp	x9,x13,[$in2],#48
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 577) 	umull	$ACC0,$IN23_0,${R0}[2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 578) #ifdef	__AARCH64EB__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 579) 	 rev	x8,x8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 580) 	 rev	x12,x12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 581) 	 rev	x9,x9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 582) 	 rev	x13,x13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 583) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 584) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 585) 	umlal	$ACC4,$IN23_1,${R3}[2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 586) 	 and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 587) 	umlal	$ACC3,$IN23_1,${R2}[2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 588) 	 and	x5,x9,#0x03ffffff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 589) 	umlal	$ACC2,$IN23_1,${R1}[2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 590) 	 ubfx	x6,x8,#26,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 591) 	umlal	$ACC1,$IN23_1,${R0}[2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 592) 	 ubfx	x7,x9,#26,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 593) 	umlal	$ACC0,$IN23_1,${S4}[2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 594) 	 add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 595) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 596) 	umlal	$ACC4,$IN23_2,${R2}[2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 597) 	 extr	x8,x12,x8,#52
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 598) 	umlal	$ACC3,$IN23_2,${R1}[2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 599) 	 extr	x9,x13,x9,#52
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 600) 	umlal	$ACC2,$IN23_2,${R0}[2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 601) 	 add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 602) 	umlal	$ACC1,$IN23_2,${S4}[2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 603) 	 fmov	$IN23_0,x4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 604) 	umlal	$ACC0,$IN23_2,${S3}[2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 605) 	 and	x8,x8,#0x03ffffff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 606) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 607) 	umlal	$ACC4,$IN23_3,${R1}[2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 608) 	 and	x9,x9,#0x03ffffff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 609) 	umlal	$ACC3,$IN23_3,${R0}[2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 610) 	 ubfx	x10,x12,#14,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 611) 	umlal	$ACC2,$IN23_3,${S4}[2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 612) 	 ubfx	x11,x13,#14,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 613) 	umlal	$ACC1,$IN23_3,${S3}[2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 614) 	 add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 615) 	umlal	$ACC0,$IN23_3,${S2}[2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 616) 	 fmov	$IN23_1,x6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 617) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 618) 	add	$IN01_2,$IN01_2,$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 619) 	 add	x12,$padbit,x12,lsr#40
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 620) 	umlal	$ACC4,$IN23_4,${R0}[2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 621) 	 add	x13,$padbit,x13,lsr#40
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 622) 	umlal	$ACC3,$IN23_4,${S4}[2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 623) 	 add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 624) 	umlal	$ACC2,$IN23_4,${S3}[2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 625) 	 add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 626) 	umlal	$ACC1,$IN23_4,${S2}[2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 627) 	 fmov	$IN23_2,x8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 628) 	umlal	$ACC0,$IN23_4,${S1}[2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 629) 	 fmov	$IN23_3,x10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 630) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 631) 	////////////////////////////////////////////////////////////////
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 632) 	// (hash+inp[0:1])*r^4 and accumulate
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 633) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 634) 	add	$IN01_0,$IN01_0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 635) 	 fmov	$IN23_4,x12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 636) 	umlal	$ACC3,$IN01_2,${R1}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 637) 	 ldp	x8,x12,[$inp],#16	// inp[0:1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 638) 	umlal	$ACC0,$IN01_2,${S3}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 639) 	 ldp	x9,x13,[$inp],#48
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 640) 	umlal	$ACC4,$IN01_2,${R2}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 641) 	umlal	$ACC1,$IN01_2,${S4}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 642) 	umlal	$ACC2,$IN01_2,${R0}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 643) #ifdef	__AARCH64EB__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 644) 	 rev	x8,x8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 645) 	 rev	x12,x12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 646) 	 rev	x9,x9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 647) 	 rev	x13,x13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 648) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 649) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 650) 	add	$IN01_1,$IN01_1,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 651) 	umlal	$ACC3,$IN01_0,${R3}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 652) 	umlal	$ACC4,$IN01_0,${R4}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 653) 	 and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 654) 	umlal	$ACC2,$IN01_0,${R2}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 655) 	 and	x5,x9,#0x03ffffff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 656) 	umlal	$ACC0,$IN01_0,${R0}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 657) 	 ubfx	x6,x8,#26,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 658) 	umlal	$ACC1,$IN01_0,${R1}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 659) 	 ubfx	x7,x9,#26,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 660) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 661) 	add	$IN01_3,$IN01_3,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 662) 	 add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 663) 	umlal	$ACC3,$IN01_1,${R2}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 664) 	 extr	x8,x12,x8,#52
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 665) 	umlal	$ACC4,$IN01_1,${R3}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 666) 	 extr	x9,x13,x9,#52
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 667) 	umlal	$ACC0,$IN01_1,${S4}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 668) 	 add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 669) 	umlal	$ACC2,$IN01_1,${R1}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 670) 	 fmov	$IN01_0,x4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 671) 	umlal	$ACC1,$IN01_1,${R0}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 672) 	 and	x8,x8,#0x03ffffff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 673) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 674) 	add	$IN01_4,$IN01_4,$H4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 675) 	 and	x9,x9,#0x03ffffff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 676) 	umlal	$ACC3,$IN01_3,${R0}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 677) 	 ubfx	x10,x12,#14,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 678) 	umlal	$ACC0,$IN01_3,${S2}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 679) 	 ubfx	x11,x13,#14,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 680) 	umlal	$ACC4,$IN01_3,${R1}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 681) 	 add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 682) 	umlal	$ACC1,$IN01_3,${S3}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 683) 	 fmov	$IN01_1,x6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 684) 	umlal	$ACC2,$IN01_3,${S4}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 685) 	 add	x12,$padbit,x12,lsr#40
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 686) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 687) 	umlal	$ACC3,$IN01_4,${S4}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 688) 	 add	x13,$padbit,x13,lsr#40
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 689) 	umlal	$ACC0,$IN01_4,${S1}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 690) 	 add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 691) 	umlal	$ACC4,$IN01_4,${R0}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 692) 	 add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 693) 	umlal	$ACC1,$IN01_4,${S2}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 694) 	 fmov	$IN01_2,x8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 695) 	umlal	$ACC2,$IN01_4,${S3}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 696) 	 fmov	$IN01_3,x10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 697) 	 fmov	$IN01_4,x12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 698) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 699) 	/////////////////////////////////////////////////////////////////
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 700) 	// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 701) 	// and P. Schwabe
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 702) 	//
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 703) 	// [see discussion in poly1305-armv4 module]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 704) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 705) 	ushr	$T0.2d,$ACC3,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 706) 	xtn	$H3,$ACC3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 707) 	 ushr	$T1.2d,$ACC0,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 708) 	 and	$ACC0,$ACC0,$MASK.2d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 709) 	add	$ACC4,$ACC4,$T0.2d	// h3 -> h4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 710) 	bic	$H3,#0xfc,lsl#24	// &=0x03ffffff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 711) 	 add	$ACC1,$ACC1,$T1.2d	// h0 -> h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 712) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 713) 	ushr	$T0.2d,$ACC4,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 714) 	xtn	$H4,$ACC4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 715) 	 ushr	$T1.2d,$ACC1,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 716) 	 xtn	$H1,$ACC1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 717) 	bic	$H4,#0xfc,lsl#24
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 718) 	 add	$ACC2,$ACC2,$T1.2d	// h1 -> h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 719) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 720) 	add	$ACC0,$ACC0,$T0.2d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 721) 	shl	$T0.2d,$T0.2d,#2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 722) 	 shrn	$T1.2s,$ACC2,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 723) 	 xtn	$H2,$ACC2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 724) 	add	$ACC0,$ACC0,$T0.2d	// h4 -> h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 725) 	 bic	$H1,#0xfc,lsl#24
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 726) 	 add	$H3,$H3,$T1.2s		// h2 -> h3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 727) 	 bic	$H2,#0xfc,lsl#24
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 728) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 729) 	shrn	$T0.2s,$ACC0,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 730) 	xtn	$H0,$ACC0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 731) 	 ushr	$T1.2s,$H3,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 732) 	 bic	$H3,#0xfc,lsl#24
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 733) 	 bic	$H0,#0xfc,lsl#24
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 734) 	add	$H1,$H1,$T0.2s		// h0 -> h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 735) 	 add	$H4,$H4,$T1.2s		// h3 -> h4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 736) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 737) 	b.hi	.Loop_neon
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 738) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 739) .Lskip_loop:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 740) 	dup	$IN23_2,${IN23_2}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 741) 	add	$IN01_2,$IN01_2,$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 742) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 743) 	////////////////////////////////////////////////////////////////
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 744) 	// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 745) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 746) 	adds	$len,$len,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 747) 	b.ne	.Long_tail
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 748) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 749) 	dup	$IN23_2,${IN01_2}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 750) 	add	$IN23_0,$IN01_0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 751) 	add	$IN23_3,$IN01_3,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 752) 	add	$IN23_1,$IN01_1,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 753) 	add	$IN23_4,$IN01_4,$H4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 754) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 755) .Long_tail:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 756) 	dup	$IN23_0,${IN23_0}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 757) 	umull2	$ACC0,$IN23_2,${S3}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 758) 	umull2	$ACC3,$IN23_2,${R1}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 759) 	umull2	$ACC4,$IN23_2,${R2}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 760) 	umull2	$ACC2,$IN23_2,${R0}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 761) 	umull2	$ACC1,$IN23_2,${S4}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 762) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 763) 	dup	$IN23_1,${IN23_1}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 764) 	umlal2	$ACC0,$IN23_0,${R0}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 765) 	umlal2	$ACC2,$IN23_0,${R2}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 766) 	umlal2	$ACC3,$IN23_0,${R3}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 767) 	umlal2	$ACC4,$IN23_0,${R4}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 768) 	umlal2	$ACC1,$IN23_0,${R1}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 769) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 770) 	dup	$IN23_3,${IN23_3}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 771) 	umlal2	$ACC0,$IN23_1,${S4}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 772) 	umlal2	$ACC3,$IN23_1,${R2}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 773) 	umlal2	$ACC2,$IN23_1,${R1}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 774) 	umlal2	$ACC4,$IN23_1,${R3}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 775) 	umlal2	$ACC1,$IN23_1,${R0}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 776) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 777) 	dup	$IN23_4,${IN23_4}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 778) 	umlal2	$ACC3,$IN23_3,${R0}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 779) 	umlal2	$ACC4,$IN23_3,${R1}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 780) 	umlal2	$ACC0,$IN23_3,${S2}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 781) 	umlal2	$ACC1,$IN23_3,${S3}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 782) 	umlal2	$ACC2,$IN23_3,${S4}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 783) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 784) 	umlal2	$ACC3,$IN23_4,${S4}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 785) 	umlal2	$ACC0,$IN23_4,${S1}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 786) 	umlal2	$ACC4,$IN23_4,${R0}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 787) 	umlal2	$ACC1,$IN23_4,${S2}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 788) 	umlal2	$ACC2,$IN23_4,${S3}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 789) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 790) 	b.eq	.Lshort_tail
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 791) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 792) 	////////////////////////////////////////////////////////////////
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 793) 	// (hash+inp[0:1])*r^4:r^3 and accumulate
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 794) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 795) 	add	$IN01_0,$IN01_0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 796) 	umlal	$ACC3,$IN01_2,${R1}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 797) 	umlal	$ACC0,$IN01_2,${S3}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 798) 	umlal	$ACC4,$IN01_2,${R2}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 799) 	umlal	$ACC1,$IN01_2,${S4}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 800) 	umlal	$ACC2,$IN01_2,${R0}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 801) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 802) 	add	$IN01_1,$IN01_1,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 803) 	umlal	$ACC3,$IN01_0,${R3}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 804) 	umlal	$ACC0,$IN01_0,${R0}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 805) 	umlal	$ACC4,$IN01_0,${R4}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 806) 	umlal	$ACC1,$IN01_0,${R1}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 807) 	umlal	$ACC2,$IN01_0,${R2}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 808) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 809) 	add	$IN01_3,$IN01_3,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 810) 	umlal	$ACC3,$IN01_1,${R2}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 811) 	umlal	$ACC0,$IN01_1,${S4}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 812) 	umlal	$ACC4,$IN01_1,${R3}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 813) 	umlal	$ACC1,$IN01_1,${R0}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 814) 	umlal	$ACC2,$IN01_1,${R1}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 815) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 816) 	add	$IN01_4,$IN01_4,$H4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 817) 	umlal	$ACC3,$IN01_3,${R0}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 818) 	umlal	$ACC0,$IN01_3,${S2}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 819) 	umlal	$ACC4,$IN01_3,${R1}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 820) 	umlal	$ACC1,$IN01_3,${S3}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 821) 	umlal	$ACC2,$IN01_3,${S4}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 822) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 823) 	umlal	$ACC3,$IN01_4,${S4}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 824) 	umlal	$ACC0,$IN01_4,${S1}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 825) 	umlal	$ACC4,$IN01_4,${R0}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 826) 	umlal	$ACC1,$IN01_4,${S2}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 827) 	umlal	$ACC2,$IN01_4,${S3}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 828) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 829) .Lshort_tail:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 830) 	////////////////////////////////////////////////////////////////
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 831) 	// horizontal add
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 832) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 833) 	addp	$ACC3,$ACC3,$ACC3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 834) 	 ldp	d8,d9,[sp,#16]		// meet ABI requirements
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 835) 	addp	$ACC0,$ACC0,$ACC0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 836) 	 ldp	d10,d11,[sp,#32]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 837) 	addp	$ACC4,$ACC4,$ACC4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 838) 	 ldp	d12,d13,[sp,#48]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 839) 	addp	$ACC1,$ACC1,$ACC1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 840) 	 ldp	d14,d15,[sp,#64]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 841) 	addp	$ACC2,$ACC2,$ACC2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 842) 	 ldr	x30,[sp,#8]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 843) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 844) 	////////////////////////////////////////////////////////////////
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 845) 	// lazy reduction, but without narrowing
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 846) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 847) 	ushr	$T0.2d,$ACC3,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 848) 	and	$ACC3,$ACC3,$MASK.2d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 849) 	 ushr	$T1.2d,$ACC0,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 850) 	 and	$ACC0,$ACC0,$MASK.2d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 851) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 852) 	add	$ACC4,$ACC4,$T0.2d	// h3 -> h4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 853) 	 add	$ACC1,$ACC1,$T1.2d	// h0 -> h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 854) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 855) 	ushr	$T0.2d,$ACC4,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 856) 	and	$ACC4,$ACC4,$MASK.2d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 857) 	 ushr	$T1.2d,$ACC1,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 858) 	 and	$ACC1,$ACC1,$MASK.2d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 859) 	 add	$ACC2,$ACC2,$T1.2d	// h1 -> h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 860) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 861) 	add	$ACC0,$ACC0,$T0.2d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 862) 	shl	$T0.2d,$T0.2d,#2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 863) 	 ushr	$T1.2d,$ACC2,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 864) 	 and	$ACC2,$ACC2,$MASK.2d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 865) 	add	$ACC0,$ACC0,$T0.2d	// h4 -> h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 866) 	 add	$ACC3,$ACC3,$T1.2d	// h2 -> h3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 867) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 868) 	ushr	$T0.2d,$ACC0,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 869) 	and	$ACC0,$ACC0,$MASK.2d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 870) 	 ushr	$T1.2d,$ACC3,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 871) 	 and	$ACC3,$ACC3,$MASK.2d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 872) 	add	$ACC1,$ACC1,$T0.2d	// h0 -> h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 873) 	 add	$ACC4,$ACC4,$T1.2d	// h3 -> h4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 874) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 875) 	////////////////////////////////////////////////////////////////
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 876) 	// write the result, can be partially reduced
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 877) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 878) 	st4	{$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 879) 	mov	x4,#1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 880) 	st1	{$ACC4}[0],[$ctx]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 881) 	str	x4,[$ctx,#8]		// set is_base2_26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 882) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 883) 	ldr	x29,[sp],#80
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 884) 	 .inst	0xd50323bf		// autiasp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 885) 	ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 886) .size	poly1305_blocks_neon,.-poly1305_blocks_neon
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 887) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 888) .align	5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 889) .Lzeros:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 890) .long	0,0,0,0,0,0,0,0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 891) .asciz	"Poly1305 for ARMv8, CRYPTOGAMS by \@dot-asm"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 892) .align	2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 893) #if !defined(__KERNEL__) && !defined(_WIN64)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 894) .comm	OPENSSL_armcap_P,4,4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 895) .hidden	OPENSSL_armcap_P
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 896) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 897) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 898) 
# Post-process the assembly text accumulated in $code, one line at a time,
# and print the result to STDOUT.
#
# The first statement in the loop is a single `or' chain, so at most one of
# its rules takes effect per line.  The mnemonic-guarded rules are written as
# `(m/.../ and (s/.../.../g or 1))': the trailing `or 1' makes the rule count
# as "handled" as soon as the mnemonic matches, even if there was nothing to
# substitute, so no later rule in the chain is tried on that line.
foreach (split("\n",$code)) {
	# shrn: first vector operand .2d/.4d -> .2s
	s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/			or
	# fmov vN<suffix>,xM -> fmov dN,xM (drop everything between the
	# register number and the comma)
	s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/	or
	# dup: every .2s/.4s arrangement -> .2d
	(m/\bdup\b/ and (s/\.[24]s/.2d/g or 1))			or
	# words starting with eor/and: every .{2,4,8}{s,d,h} -> .16b
	(m/\b(eor|and)/ and (s/\.[248][sdh]/.16b/g or 1))	or
	# umull/umlal (not the "2" forms): every .4s -> .2s
	(m/\bum(ul|la)l\b/ and (s/\.4s/.2s/g or 1))		or
	# umull2/umlal2: every .2s -> .4s
	(m/\bum(ul|la)l2\b/ and (s/\.2s/.4s/g or 1))		or
	# st1..st4 storing a single lane ("{...}[n]"): every .2d/.4d -> .s
	# The opening brace is escaped so it is always taken literally.
	(m/\bst[1-4]\s+\{[^}]+}\[/ and (s/\.[24]d/.s/g or 1));

	# Applied to every line regardless of the chain above: an indexed
	# element reference keeps only the element size, e.g. ".4s[" -> ".s["
	# (first occurrence on the line only).
	s/\.[124]([sd])\[/.$1\[/;
	# "w#xN" -> "wN".
	# NOTE(review): presumably lets the template derive a 32-bit register
	# name from an x-register variable; no such use is visible in this
	# part of the file - confirm.
	s/w#x([0-9]+)/w$1/g;

	print $_,"\n";
}
# Buffered output is flushed here, so a write error (e.g. disk full) is only
# reported by close; fail loudly instead of silently leaving truncated output.
close STDOUT or die "error closing STDOUT: $!";