Orange Pi5 kernel

Deprecated Linux kernel 5.10.110 for OrangePi 5/5B/5+ boards

3 Commits   0 Branches   0 Tags
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300    1) #!/usr/bin/env perl
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300    2) # SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300    3) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300    4) # ====================================================================
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300    5) # Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300    6) # project.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300    7) # ====================================================================
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300    8) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300    9) #			IALU(*)/gcc-4.4		NEON
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   10) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   11) # ARM11xx(ARMv6)	7.78/+100%		-
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   12) # Cortex-A5		6.35/+130%		3.00
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   13) # Cortex-A8		6.25/+115%		2.36
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   14) # Cortex-A9		5.10/+95%		2.55
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   15) # Cortex-A15		3.85/+85%		1.25(**)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   16) # Snapdragon S4		5.70/+100%		1.48(**)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   17) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   18) # (*)	this is for -march=armv6, i.e. with a bunch of ldrb instructions loading data;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   19) # (**)	these are trade-off results, they can be improved by ~8% but at
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   20) #	the cost of 15/12% regression on Cortex-A5/A7, it's even possible
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   21) #	to improve Cortex-A9 result, but then A5/A7 lose more than 20%;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   22) 
# Command-line handling.
#
# The first argument is taken as the "flavour".  If it already looks like
# a file name (word characters, a dot and an extension) it is treated as
# the output file instead and $flavour is undefined.  Otherwise further
# arguments are shifted until one looks like a file name or a false value
# (including running out of arguments) is returned; $output holds the
# last shifted value.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   23) $flavour = shift;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   24) if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   25) else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   26) 
# Output redirection.
#
# With a flavour other than "void", everything printed to STDOUT is piped
# through arm-xlate.pl, run with the current perl binary ($^X).  The
# translator is looked up first in the directory part of $0, then in
# ../../perlasm/ relative to it; the script dies if neither is a plain file.
# Without a flavour (or with "void") STDOUT is reopened onto $output.
#
# NOTE(review): $1 is read without checking that the match on $0
# succeeded (e.g. when $0 has no directory part) -- confirm callers
# always invoke the script with a path.
# NOTE(review): neither open() result is checked, and $xlate, $flavour
# and $output are interpolated unquoted into the command string.
# $output may also be empty here if no file-like argument was found.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   27) if ($flavour && $flavour ne "void") {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   28)     $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   29)     ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   30)     ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   31)     die "can't locate arm-xlate.pl";
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   32) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   33)     open STDOUT,"| \"$^X\" $xlate $flavour $output";
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   34) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   35)     open STDOUT,">$output";
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   36) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   37) 
# Symbolic names for registers r0..r3 as used by the assembly templates
# below: $ctx=r0, $inp=r1, $len=r2, $padbit=r3.
# (Presumably these are the incoming function arguments -- confirm
# against the C prototypes of the callers.)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   38) ($ctx,$inp,$len,$padbit)=map("r$_",(0..3));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   39) 
# Append the file prologue and poly1305_init to $code.
#
# The here-document is interpolated, so $ctx/$inp expand to the register
# names above; everything else is emitted verbatim.  Inside it, lines
# starting with '#' are C-preprocessor directives and '@' starts an
# assembler comment, so no Perl comments can be placed in the template.
#
# Prologue: when __KERNEL__ is defined, __ARM_ARCH__/__ARM_MAX_ARCH__ are
# taken from __LINUX_ARM_ARCH__ and the three entry points are renamed
# with an _arm suffix; otherwise arm_arch.h is included.
#
# poly1305_init as written below:
#   - zeroes the five hash words at ctx+0..16 and the is_base2_26 flag at
#     ctx+36, then advances $ctx by 20;
#   - returns 0 immediately when $inp (the key pointer) is zero;
#   - assembles the 16 key bytes from single-byte loads into four words,
#     masks the first with 0x0fffffff and the other three with 0x0ffffffc,
#     and stores them at offsets 0..12 from the advanced $ctx;
#   - for __ARM_MAX_ARCH__>=7 stores -1 at offset 28 from the advanced
#     $ctx ("impossible key power value");
#   - for __ARM_MAX_ARCH__>=7 outside the kernel, tests OPENSSL_armcap_P
#     for ARMV7_NEON, selects poly1305_blocks or poly1305_blocks_neon plus
#     poly1305_emit, writes the two addresses to the table at r2 and
#     returns 1; in all other configurations it returns 0.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   40) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   41) #ifndef	__KERNEL__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   42) # include "arm_arch.h"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   43) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   44) # define __ARM_ARCH__ __LINUX_ARM_ARCH__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   45) # define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   46) # define poly1305_init   poly1305_init_arm
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   47) # define poly1305_blocks poly1305_blocks_arm
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   48) # define poly1305_emit   poly1305_emit_arm
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   49) .globl	poly1305_blocks_neon
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   50) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   51) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   52) #if defined(__thumb2__)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   53) .syntax	unified
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   54) .thumb
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   55) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   56) .code	32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   57) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   58) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   59) .text
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   60) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   61) .globl	poly1305_emit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   62) .globl	poly1305_blocks
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   63) .globl	poly1305_init
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   64) .type	poly1305_init,%function
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   65) .align	5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   66) poly1305_init:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   67) .Lpoly1305_init:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   68) 	stmdb	sp!,{r4-r11}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   69) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   70) 	eor	r3,r3,r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   71) 	cmp	$inp,#0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   72) 	str	r3,[$ctx,#0]		@ zero hash value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   73) 	str	r3,[$ctx,#4]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   74) 	str	r3,[$ctx,#8]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   75) 	str	r3,[$ctx,#12]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   76) 	str	r3,[$ctx,#16]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   77) 	str	r3,[$ctx,#36]		@ clear is_base2_26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   78) 	add	$ctx,$ctx,#20
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   79) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   80) #ifdef	__thumb2__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   81) 	it	eq
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   82) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   83) 	moveq	r0,#0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   84) 	beq	.Lno_key
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   85) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   86) #if	__ARM_MAX_ARCH__>=7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   87) 	mov	r3,#-1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   88) 	str	r3,[$ctx,#28]		@ impossible key power value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   89) # ifndef __KERNEL__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   90) 	adr	r11,.Lpoly1305_init
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   91) 	ldr	r12,.LOPENSSL_armcap
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   92) # endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   93) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   94) 	ldrb	r4,[$inp,#0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   95) 	mov	r10,#0x0fffffff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   96) 	ldrb	r5,[$inp,#1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   97) 	and	r3,r10,#-4		@ 0x0ffffffc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   98) 	ldrb	r6,[$inp,#2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   99) 	ldrb	r7,[$inp,#3]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  100) 	orr	r4,r4,r5,lsl#8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  101) 	ldrb	r5,[$inp,#4]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  102) 	orr	r4,r4,r6,lsl#16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  103) 	ldrb	r6,[$inp,#5]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  104) 	orr	r4,r4,r7,lsl#24
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  105) 	ldrb	r7,[$inp,#6]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  106) 	and	r4,r4,r10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  107) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  108) #if	__ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  109) # if !defined(_WIN32)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  110) 	ldr	r12,[r11,r12]		@ OPENSSL_armcap_P
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  111) # endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  112) # if defined(__APPLE__) || defined(_WIN32)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  113) 	ldr	r12,[r12]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  114) # endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  115) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  116) 	ldrb	r8,[$inp,#7]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  117) 	orr	r5,r5,r6,lsl#8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  118) 	ldrb	r6,[$inp,#8]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  119) 	orr	r5,r5,r7,lsl#16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  120) 	ldrb	r7,[$inp,#9]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  121) 	orr	r5,r5,r8,lsl#24
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  122) 	ldrb	r8,[$inp,#10]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  123) 	and	r5,r5,r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  124) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  125) #if	__ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  126) 	tst	r12,#ARMV7_NEON		@ check for NEON
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  127) # ifdef	__thumb2__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  128) 	adr	r9,.Lpoly1305_blocks_neon
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  129) 	adr	r11,.Lpoly1305_blocks
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  130) 	it	ne
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  131) 	movne	r11,r9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  132) 	adr	r12,.Lpoly1305_emit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  133) 	orr	r11,r11,#1		@ thumb-ify addresses
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  134) 	orr	r12,r12,#1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  135) # else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  136) 	add	r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  137) 	ite	eq
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  138) 	addeq	r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  139) 	addne	r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  140) # endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  141) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  142) 	ldrb	r9,[$inp,#11]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  143) 	orr	r6,r6,r7,lsl#8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  144) 	ldrb	r7,[$inp,#12]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  145) 	orr	r6,r6,r8,lsl#16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  146) 	ldrb	r8,[$inp,#13]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  147) 	orr	r6,r6,r9,lsl#24
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  148) 	ldrb	r9,[$inp,#14]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  149) 	and	r6,r6,r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  150) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  151) 	ldrb	r10,[$inp,#15]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  152) 	orr	r7,r7,r8,lsl#8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  153) 	str	r4,[$ctx,#0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  154) 	orr	r7,r7,r9,lsl#16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  155) 	str	r5,[$ctx,#4]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  156) 	orr	r7,r7,r10,lsl#24
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  157) 	str	r6,[$ctx,#8]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  158) 	and	r7,r7,r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  159) 	str	r7,[$ctx,#12]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  160) #if	__ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  161) 	stmia	r2,{r11,r12}		@ fill functions table
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  162) 	mov	r0,#1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  163) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  164) 	mov	r0,#0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  165) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  166) .Lno_key:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  167) 	ldmia	sp!,{r4-r11}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  168) #if	__ARM_ARCH__>=5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  169) 	ret				@ bx	lr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  170) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  171) 	tst	lr,#1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  172) 	moveq	pc,lr			@ be binary compatible with V4, yet
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  173) 	bx	lr			@ interoperable with Thumb ISA:-)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  174) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  175) .size	poly1305_init,.-poly1305_init
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  176) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  177) {
# Register allocation for poly1305_blocks (lexical to this bare block):
#   $h0..$h4 = r4..r8    hash accumulator limbs
#   $r0..$r3 = r9..r12   key words
#   $s1..$s3             the very same registers as $r1..$r3
# Because $sN and $rN share a register, computing $sN = $rN + ($rN >> 2)
# in the loop overwrites the key word; that is why the template spills
# $r1..$r3 to the stack before the loop and reloads them ("reload $r1"
# etc.) in the middle of the multiplication.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  178) my ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  179) my ($s1,$s2,$s3)=($r1,$r2,$r3);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  180) 
# Append poly1305_blocks to $code.  The here-document is interpolated
# (register variables expand to rN names) and otherwise emitted verbatim;
# '#' lines inside it are preprocessor directives, '@' starts an assembler
# comment.
#
# What the template does:
#   - rounds $len down to a multiple of 16 and returns at once if that is
#     zero; otherwise turns $len into an end pointer and reserves 32 bytes
#     of stack;
#   - stack frame: [sp,#0]/[sp,#4] future h0/h1, [sp,#8] input pointer,
#     [sp,#12] $ctx (already advanced by 20), [sp,#16] end pointer,
#     [sp,#20..28] key words $r1..$r3;
#   - for __ARM_ARCH__>=7 it reads is_base2_26 (ctx+36), converts the hash
#     from base 2^26 to base 2^32, keeps the converted value only when the
#     flag was non-zero, and clears the flag;
#   - each .Loop iteration loads 16 input bytes (byte-wise below ARMv7,
#     word-wise with byte reversal on big-endian otherwise), adds them to
#     the hash, multiplies by the key and folds the bits of h4 above the
#     low two back into h0 times 5;
#   - the conditional "addhi $h4,$h4,#1" takes its flags from
#     "cmp $padbit,#0" on the first iteration and from the loop-closing
#     "cmp r0,lr" on later ones;
#   - after the loop the five hash words are written back just below the
#     advanced $ctx (stmdb).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  181) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  182) .type	poly1305_blocks,%function
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  183) .align	5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  184) poly1305_blocks:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  185) .Lpoly1305_blocks:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  186) 	stmdb	sp!,{r3-r11,lr}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  187) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  188) 	ands	$len,$len,#-16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  189) 	beq	.Lno_data
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  190) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  191) 	add	$len,$len,$inp		@ end pointer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  192) 	sub	sp,sp,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  193) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  194) #if __ARM_ARCH__<7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  195) 	ldmia	$ctx,{$h0-$r3}		@ load context
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  196) 	add	$ctx,$ctx,#20
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  197) 	str	$len,[sp,#16]		@ offload stuff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  198) 	str	$ctx,[sp,#12]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  199) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  200) 	ldr	lr,[$ctx,#36]		@ is_base2_26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  201) 	ldmia	$ctx!,{$h0-$h4}		@ load hash value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  202) 	str	$len,[sp,#16]		@ offload stuff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  203) 	str	$ctx,[sp,#12]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  204) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  205) 	adds	$r0,$h0,$h1,lsl#26	@ base 2^26 -> base 2^32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  206) 	mov	$r1,$h1,lsr#6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  207) 	adcs	$r1,$r1,$h2,lsl#20
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  208) 	mov	$r2,$h2,lsr#12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  209) 	adcs	$r2,$r2,$h3,lsl#14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  210) 	mov	$r3,$h3,lsr#18
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  211) 	adcs	$r3,$r3,$h4,lsl#8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  212) 	mov	$len,#0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  213) 	teq	lr,#0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  214) 	str	$len,[$ctx,#16]		@ clear is_base2_26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  215) 	adc	$len,$len,$h4,lsr#24
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  216) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  217) 	itttt	ne
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  218) 	movne	$h0,$r0			@ choose between radixes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  219) 	movne	$h1,$r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  220) 	movne	$h2,$r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  221) 	movne	$h3,$r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  222) 	ldmia	$ctx,{$r0-$r3}		@ load key
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  223) 	it	ne
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  224) 	movne	$h4,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  225) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  226) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  227) 	mov	lr,$inp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  228) 	cmp	$padbit,#0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  229) 	str	$r1,[sp,#20]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  230) 	str	$r2,[sp,#24]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  231) 	str	$r3,[sp,#28]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  232) 	b	.Loop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  233) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  234) .align	4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  235) .Loop:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  236) #if __ARM_ARCH__<7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  237) 	ldrb	r0,[lr],#16		@ load input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  238) # ifdef	__thumb2__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  239) 	it	hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  240) # endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  241) 	addhi	$h4,$h4,#1		@ 1<<128
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  242) 	ldrb	r1,[lr,#-15]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  243) 	ldrb	r2,[lr,#-14]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  244) 	ldrb	r3,[lr,#-13]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  245) 	orr	r1,r0,r1,lsl#8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  246) 	ldrb	r0,[lr,#-12]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  247) 	orr	r2,r1,r2,lsl#16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  248) 	ldrb	r1,[lr,#-11]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  249) 	orr	r3,r2,r3,lsl#24
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  250) 	ldrb	r2,[lr,#-10]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  251) 	adds	$h0,$h0,r3		@ accumulate input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  252) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  253) 	ldrb	r3,[lr,#-9]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  254) 	orr	r1,r0,r1,lsl#8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  255) 	ldrb	r0,[lr,#-8]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  256) 	orr	r2,r1,r2,lsl#16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  257) 	ldrb	r1,[lr,#-7]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  258) 	orr	r3,r2,r3,lsl#24
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  259) 	ldrb	r2,[lr,#-6]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  260) 	adcs	$h1,$h1,r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  261) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  262) 	ldrb	r3,[lr,#-5]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  263) 	orr	r1,r0,r1,lsl#8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  264) 	ldrb	r0,[lr,#-4]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  265) 	orr	r2,r1,r2,lsl#16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  266) 	ldrb	r1,[lr,#-3]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  267) 	orr	r3,r2,r3,lsl#24
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  268) 	ldrb	r2,[lr,#-2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  269) 	adcs	$h2,$h2,r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  270) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  271) 	ldrb	r3,[lr,#-1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  272) 	orr	r1,r0,r1,lsl#8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  273) 	str	lr,[sp,#8]		@ offload input pointer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  274) 	orr	r2,r1,r2,lsl#16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  275) 	add	$s1,$r1,$r1,lsr#2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  276) 	orr	r3,r2,r3,lsl#24
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  277) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  278) 	ldr	r0,[lr],#16		@ load input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  279) 	it	hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  280) 	addhi	$h4,$h4,#1		@ padbit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  281) 	ldr	r1,[lr,#-12]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  282) 	ldr	r2,[lr,#-8]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  283) 	ldr	r3,[lr,#-4]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  284) # ifdef	__ARMEB__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  285) 	rev	r0,r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  286) 	rev	r1,r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  287) 	rev	r2,r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  288) 	rev	r3,r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  289) # endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  290) 	adds	$h0,$h0,r0		@ accumulate input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  291) 	str	lr,[sp,#8]		@ offload input pointer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  292) 	adcs	$h1,$h1,r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  293) 	add	$s1,$r1,$r1,lsr#2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  294) 	adcs	$h2,$h2,r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  295) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  296) 	add	$s2,$r2,$r2,lsr#2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  297) 	adcs	$h3,$h3,r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  298) 	add	$s3,$r3,$r3,lsr#2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  299) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  300) 	umull	r2,r3,$h1,$r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  301) 	 adc	$h4,$h4,#0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  302) 	umull	r0,r1,$h0,$r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  303) 	umlal	r2,r3,$h4,$s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  304) 	umlal	r0,r1,$h3,$s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  305) 	ldr	$r1,[sp,#20]		@ reload $r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  306) 	umlal	r2,r3,$h2,$s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  307) 	umlal	r0,r1,$h1,$s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  308) 	umlal	r2,r3,$h3,$s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  309) 	umlal	r0,r1,$h2,$s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  310) 	umlal	r2,r3,$h0,$r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  311) 	str	r0,[sp,#0]		@ future $h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  312) 	 mul	r0,$s2,$h4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  313) 	ldr	$r2,[sp,#24]		@ reload $r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  314) 	adds	r2,r2,r1		@ d1+=d0>>32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  315) 	 eor	r1,r1,r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  316) 	adc	lr,r3,#0		@ future $h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  317) 	str	r2,[sp,#4]		@ future $h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  318) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  319) 	mul	r2,$s3,$h4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  320) 	eor	r3,r3,r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  321) 	umlal	r0,r1,$h3,$s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  322) 	ldr	$r3,[sp,#28]		@ reload $r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  323) 	umlal	r2,r3,$h3,$r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  324) 	umlal	r0,r1,$h2,$r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  325) 	umlal	r2,r3,$h2,$r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  326) 	umlal	r0,r1,$h1,$r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  327) 	umlal	r2,r3,$h1,$r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  328) 	umlal	r0,r1,$h0,$r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  329) 	umlal	r2,r3,$h0,$r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  330) 	ldr	$h0,[sp,#0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  331) 	mul	$h4,$r0,$h4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  332) 	ldr	$h1,[sp,#4]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  333) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  334) 	adds	$h2,lr,r0		@ d2+=d1>>32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  335) 	ldr	lr,[sp,#8]		@ reload input pointer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  336) 	adc	r1,r1,#0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  337) 	adds	$h3,r2,r1		@ d3+=d2>>32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  338) 	ldr	r0,[sp,#16]		@ reload end pointer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  339) 	adc	r3,r3,#0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  340) 	add	$h4,$h4,r3		@ h4+=d3>>32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  341) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  342) 	and	r1,$h4,#-4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  343) 	and	$h4,$h4,#3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  344) 	add	r1,r1,r1,lsr#2		@ *=5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  345) 	adds	$h0,$h0,r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  346) 	adcs	$h1,$h1,#0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  347) 	adcs	$h2,$h2,#0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  348) 	adcs	$h3,$h3,#0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  349) 	adc	$h4,$h4,#0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  350) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  351) 	cmp	r0,lr			@ done yet?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  352) 	bhi	.Loop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  353) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  354) 	ldr	$ctx,[sp,#12]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  355) 	add	sp,sp,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  356) 	stmdb	$ctx,{$h0-$h4}		@ store the result
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  357) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  358) .Lno_data:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  359) #if	__ARM_ARCH__>=5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  360) 	ldmia	sp!,{r3-r11,pc}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  361) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  362) 	ldmia	sp!,{r3-r11,lr}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  363) 	tst	lr,#1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  364) 	moveq	pc,lr			@ be binary compatible with V4, yet
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  365) 	bx	lr			@ interoperable with Thumb ISA:-)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  366) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  367) .size	poly1305_blocks,.-poly1305_blocks
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  368) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  369) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  370) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  371) my ($ctx,$mac,$nonce)=map("r$_",(0..2));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  372) my ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  373) my $g4=$ctx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  374) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  375) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  376) .type	poly1305_emit,%function
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  377) .align	5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  378) poly1305_emit:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  379) .Lpoly1305_emit:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  380) 	stmdb	sp!,{r4-r11}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  381) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  382) 	ldmia	$ctx,{$h0-$h4}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  383) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  384) #if __ARM_ARCH__>=7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  385) 	ldr	ip,[$ctx,#36]		@ is_base2_26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  386) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  387) 	adds	$g0,$h0,$h1,lsl#26	@ base 2^26 -> base 2^32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  388) 	mov	$g1,$h1,lsr#6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  389) 	adcs	$g1,$g1,$h2,lsl#20
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  390) 	mov	$g2,$h2,lsr#12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  391) 	adcs	$g2,$g2,$h3,lsl#14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  392) 	mov	$g3,$h3,lsr#18
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  393) 	adcs	$g3,$g3,$h4,lsl#8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  394) 	mov	$g4,#0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  395) 	adc	$g4,$g4,$h4,lsr#24
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  396) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  397) 	tst	ip,ip
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  398) 	itttt	ne
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  399) 	movne	$h0,$g0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  400) 	movne	$h1,$g1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  401) 	movne	$h2,$g2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  402) 	movne	$h3,$g3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  403) 	it	ne
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  404) 	movne	$h4,$g4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  405) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  406) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  407) 	adds	$g0,$h0,#5		@ compare to modulus
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  408) 	adcs	$g1,$h1,#0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  409) 	adcs	$g2,$h2,#0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  410) 	adcs	$g3,$h3,#0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  411) 	adc	$g4,$h4,#0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  412) 	tst	$g4,#4			@ did it carry/borrow?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  413) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  414) #ifdef	__thumb2__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  415) 	it	ne
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  416) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  417) 	movne	$h0,$g0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  418) 	ldr	$g0,[$nonce,#0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  419) #ifdef	__thumb2__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  420) 	it	ne
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  421) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  422) 	movne	$h1,$g1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  423) 	ldr	$g1,[$nonce,#4]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  424) #ifdef	__thumb2__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  425) 	it	ne
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  426) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  427) 	movne	$h2,$g2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  428) 	ldr	$g2,[$nonce,#8]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  429) #ifdef	__thumb2__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  430) 	it	ne
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  431) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  432) 	movne	$h3,$g3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  433) 	ldr	$g3,[$nonce,#12]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  434) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  435) 	adds	$h0,$h0,$g0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  436) 	adcs	$h1,$h1,$g1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  437) 	adcs	$h2,$h2,$g2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  438) 	adc	$h3,$h3,$g3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  439) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  440) #if __ARM_ARCH__>=7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  441) # ifdef __ARMEB__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  442) 	rev	$h0,$h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  443) 	rev	$h1,$h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  444) 	rev	$h2,$h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  445) 	rev	$h3,$h3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  446) # endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  447) 	str	$h0,[$mac,#0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  448) 	str	$h1,[$mac,#4]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  449) 	str	$h2,[$mac,#8]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  450) 	str	$h3,[$mac,#12]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  451) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  452) 	strb	$h0,[$mac,#0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  453) 	mov	$h0,$h0,lsr#8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  454) 	strb	$h1,[$mac,#4]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  455) 	mov	$h1,$h1,lsr#8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  456) 	strb	$h2,[$mac,#8]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  457) 	mov	$h2,$h2,lsr#8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  458) 	strb	$h3,[$mac,#12]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  459) 	mov	$h3,$h3,lsr#8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  460) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  461) 	strb	$h0,[$mac,#1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  462) 	mov	$h0,$h0,lsr#8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  463) 	strb	$h1,[$mac,#5]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  464) 	mov	$h1,$h1,lsr#8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  465) 	strb	$h2,[$mac,#9]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  466) 	mov	$h2,$h2,lsr#8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  467) 	strb	$h3,[$mac,#13]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  468) 	mov	$h3,$h3,lsr#8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  469) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  470) 	strb	$h0,[$mac,#2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  471) 	mov	$h0,$h0,lsr#8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  472) 	strb	$h1,[$mac,#6]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  473) 	mov	$h1,$h1,lsr#8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  474) 	strb	$h2,[$mac,#10]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  475) 	mov	$h2,$h2,lsr#8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  476) 	strb	$h3,[$mac,#14]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  477) 	mov	$h3,$h3,lsr#8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  478) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  479) 	strb	$h0,[$mac,#3]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  480) 	strb	$h1,[$mac,#7]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  481) 	strb	$h2,[$mac,#11]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  482) 	strb	$h3,[$mac,#15]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  483) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  484) 	ldmia	sp!,{r4-r11}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  485) #if	__ARM_ARCH__>=5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  486) 	ret				@ bx	lr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  487) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  488) 	tst	lr,#1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  489) 	moveq	pc,lr			@ be binary compatible with V4, yet
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  490) 	bx	lr			@ interoperable with Thumb ISA:-)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  491) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  492) .size	poly1305_emit,.-poly1305_emit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  493) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  494) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  495) my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9));	# d0-d8: key limbs R0-R4 and their 5x multiples S1-S4; the tenth generated name (d9) is not assigned
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  496) my ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14));	# q5-q9 are named D0-D4, q10-q14 are named H0-H4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  497) my ($T0,$T1,$MASK) = map("q$_",(15,4,0));	# q15/q4 scratch; NEON aliasing puts q4 over d8/d9 (so $T1 shares d8 with $S4) and q0 over d0/d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  498) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  499) my ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7));	# r4-r7; $zeros doubles as the pass counter in poly1305_init_neon and as the .Lzeros address in poly1305_blocks_neon
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  500) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  501) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  502) #if	__ARM_MAX_ARCH__>=7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  503) .fpu	neon
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  504) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  505) .type	poly1305_init_neon,%function
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  506) .align	5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  507) poly1305_init_neon:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  508) .Lpoly1305_init_neon:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  509) 	ldr	r3,[$ctx,#48]		@ first table element
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  510) 	cmp	r3,#-1			@ is value impossible? (any other value skips the setup)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  511) 	bne	.Lno_init_neon
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  512) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  513) 	ldr	r4,[$ctx,#20]		@ load key base 2^32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  514) 	ldr	r5,[$ctx,#24]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  515) 	ldr	r6,[$ctx,#28]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  516) 	ldr	r7,[$ctx,#32]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  517) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  518) 	and	r2,r4,#0x03ffffff	@ base 2^32 -> base 2^26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  519) 	mov	r3,r4,lsr#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  520) 	mov	r4,r5,lsr#20
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  521) 	orr	r3,r3,r5,lsl#6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  522) 	mov	r5,r6,lsr#14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  523) 	orr	r4,r4,r6,lsl#12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  524) 	mov	r6,r7,lsr#8		@ top limb, at most 24 bits, needs no mask
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  525) 	orr	r5,r5,r7,lsl#18
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  526) 	and	r3,r3,#0x03ffffff	@ keep 26 bits per limb
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  527) 	and	r4,r4,#0x03ffffff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  528) 	and	r5,r5,#0x03ffffff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  529) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  530) 	vdup.32	$R0,r2			@ r^1 in both lanes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  531) 	add	r2,r3,r3,lsl#2		@ *5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  532) 	vdup.32	$R1,r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  533) 	add	r3,r4,r4,lsl#2		@ *5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  534) 	vdup.32	$S1,r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  535) 	vdup.32	$R2,r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  536) 	add	r4,r5,r5,lsl#2		@ *5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  537) 	vdup.32	$S2,r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  538) 	vdup.32	$R3,r5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  539) 	add	r5,r6,r6,lsl#2		@ *5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  540) 	vdup.32	$S3,r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  541) 	vdup.32	$R4,r6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  542) 	vdup.32	$S4,r5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  543) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  544) 	mov	$zeros,#2		@ counter, two passes through the squaring code
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  545) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  546) .Lsquare_neon:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  547) 	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  548) 	@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  549) 	@ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  550) 	@ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  551) 	@ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  552) 	@ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  553) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  554) 	vmull.u32	$D0,$R0,${R0}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  555) 	vmull.u32	$D1,$R1,${R0}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  556) 	vmull.u32	$D2,$R2,${R0}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  557) 	vmull.u32	$D3,$R3,${R0}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  558) 	vmull.u32	$D4,$R4,${R0}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  559) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  560) 	vmlal.u32	$D0,$R4,${S1}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  561) 	vmlal.u32	$D1,$R0,${R1}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  562) 	vmlal.u32	$D2,$R1,${R1}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  563) 	vmlal.u32	$D3,$R2,${R1}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  564) 	vmlal.u32	$D4,$R3,${R1}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  565) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  566) 	vmlal.u32	$D0,$R3,${S2}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  567) 	vmlal.u32	$D1,$R4,${S2}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  568) 	vmlal.u32	$D3,$R1,${R2}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  569) 	vmlal.u32	$D2,$R0,${R2}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  570) 	vmlal.u32	$D4,$R2,${R2}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  571) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  572) 	vmlal.u32	$D0,$R2,${S3}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  573) 	vmlal.u32	$D3,$R0,${R3}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  574) 	vmlal.u32	$D1,$R3,${S3}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  575) 	vmlal.u32	$D2,$R4,${S3}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  576) 	vmlal.u32	$D4,$R1,${R3}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  577) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  578) 	vmlal.u32	$D3,$R4,${S4}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  579) 	vmlal.u32	$D0,$R1,${S4}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  580) 	vmlal.u32	$D1,$R2,${S4}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  581) 	vmlal.u32	$D2,$R3,${S4}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  582) 	vmlal.u32	$D4,$R0,${R4}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  583) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  584) 	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  585) 	@ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  586) 	@ and P. Schwabe
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  587) 	@
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  588) 	@ H0>>+H1>>+H2>>+H3>>+H4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  589) 	@ H3>>+H4>>*5+H0>>+H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  590) 	@
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  591) 	@ Trivia.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  592) 	@
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  593) 	@ Result of multiplication of n-bit number by m-bit number is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  594) 	@ n+m bits wide. However! Even though 2^n is a n+1-bit number,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  595) 	@ m-bit number multiplied by 2^n is still n+m bits wide.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  596) 	@
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  597) 	@ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  598) 	@ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  599) 	@ one is n+1 bits wide.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  600) 	@
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  601) 	@ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  602) 	@ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  603) 	@ can be 27. However! In cases when their width exceeds 26 bits
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  604) 	@ they are limited by 2^26+2^6. This in turn means that *sum*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  605) 	@ of the products with these values can still be viewed as sum
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  606) 	@ of 52-bit numbers as long as the amount of addends is not a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  607) 	@ power of 2. For example,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  608) 	@
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  609) 	@ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  610) 	@
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  611) 	@ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  612) 	@ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  613) 	@ 8 * (2^52) or 2^55. However, the value is then multiplied by
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  614) 	@ 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  615) 	@ which is less than 32 * (2^52) or 2^57. And when processing
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  616) 	@ data we are looking at triple as many addends...
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  617) 	@
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  618) 	@ In key setup procedure pre-reduced H0 is limited by 5*4+1 and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  619) 	@ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  620) 	@ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  621) 	@ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  622) 	@ instruction accepts 2x32-bit input and writes 2x64-bit result.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  623) 	@ This means that result of reduction has to be compressed upon
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  624) 	@ loop wrap-around. This can be done in the process of reduction
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  625) 	@ to minimize amount of instructions [as well as amount of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  626) 	@ 128-bit instructions, which benefits low-end processors], but
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  627) 	@ one has to watch for H2 (which is narrower than H0) and 5*H4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  628) 	@ not being wider than 58 bits, so that result of right shift
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  629) 	@ by 26 bits fits in 32 bits. This is also useful on x86,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  630) 	@ because it allows using paddd in place of paddq, which
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  631) 	@ benefits Atom, where paddq is ridiculously slow.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  632) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  633) 	vshr.u64	$T0,$D3,#26		@ carry out of h3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  634) 	vmovn.i64	$D3#lo,$D3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  635) 	 vshr.u64	$T1,$D0,#26		@ carry out of h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  636) 	 vmovn.i64	$D0#lo,$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  637) 	vadd.i64	$D4,$D4,$T0		@ h3 -> h4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  638) 	vbic.i32	$D3#lo,#0xfc000000	@ &=0x03ffffff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  639) 	 vadd.i64	$D1,$D1,$T1		@ h0 -> h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  640) 	 vbic.i32	$D0#lo,#0xfc000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  641) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  642) 	vshrn.u64	$T0#lo,$D4,#26	@ carry out of h4, narrowed to 32 bits
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  643) 	vmovn.i64	$D4#lo,$D4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  644) 	 vshr.u64	$T1,$D1,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  645) 	 vmovn.i64	$D1#lo,$D1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  646) 	 vadd.i64	$D2,$D2,$T1		@ h1 -> h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  647) 	vbic.i32	$D4#lo,#0xfc000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  648) 	 vbic.i32	$D1#lo,#0xfc000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  649) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  650) 	vadd.i32	$D0#lo,$D0#lo,$T0#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  651) 	vshl.u32	$T0#lo,$T0#lo,#2	@ carry*4, so h0 receives carry*5 in total
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  652) 	 vshrn.u64	$T1#lo,$D2,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  653) 	 vmovn.i64	$D2#lo,$D2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  654) 	vadd.i32	$D0#lo,$D0#lo,$T0#lo	@ h4 -> h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  655) 	 vadd.i32	$D3#lo,$D3#lo,$T1#lo	@ h2 -> h3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  656) 	 vbic.i32	$D2#lo,#0xfc000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  657) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  658) 	vshr.u32	$T0#lo,$D0#lo,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  659) 	vbic.i32	$D0#lo,#0xfc000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  660) 	 vshr.u32	$T1#lo,$D3#lo,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  661) 	 vbic.i32	$D3#lo,#0xfc000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  662) 	vadd.i32	$D1#lo,$D1#lo,$T0#lo	@ h0 -> h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  663) 	 vadd.i32	$D4#lo,$D4#lo,$T1#lo	@ h3 -> h4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  664) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  665) 	subs		$zeros,$zeros,#1	@ count down the passes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  666) 	beq		.Lsquare_break_neon
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  667) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  668) 	add		$tbl0,$ctx,#(48+0*9*4)	@ table row 0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  669) 	add		$tbl1,$ctx,#(48+1*9*4)	@ table row 1, rows are 9 words apart
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  670) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  671) 	vtrn.32		$R0,$D0#lo		@ r^2:r^1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  672) 	vtrn.32		$R2,$D2#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  673) 	vtrn.32		$R3,$D3#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  674) 	vtrn.32		$R1,$D1#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  675) 	vtrn.32		$R4,$D4#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  676) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  677) 	vshl.u32	$S2,$R2,#2		@ *5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  678) 	vshl.u32	$S3,$R3,#2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  679) 	vshl.u32	$S1,$R1,#2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  680) 	vshl.u32	$S4,$R4,#2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  681) 	vadd.i32	$S2,$S2,$R2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  682) 	vadd.i32	$S1,$S1,$R1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  683) 	vadd.i32	$S3,$S3,$R3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  684) 	vadd.i32	$S4,$S4,$R4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  685) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  686) 	vst4.32		{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  687) 	vst4.32		{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  688) 	vst4.32		{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  689) 	vst4.32		{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  690) 	vst1.32		{${S4}[0]},[$tbl0,:32]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  691) 	vst1.32		{${S4}[1]},[$tbl1,:32]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  692) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  693) 	b		.Lsquare_neon
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  694) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  695) .align	4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  696) .Lsquare_break_neon:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  697) 	add		$tbl0,$ctx,#(48+2*4*9)	@ table row 2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  698) 	add		$tbl1,$ctx,#(48+3*4*9)	@ table row 3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  699) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  700) 	vmov		$R0,$D0#lo		@ r^4:r^3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  701) 	vshl.u32	$S1,$D1#lo,#2		@ *5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  702) 	vmov		$R1,$D1#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  703) 	vshl.u32	$S2,$D2#lo,#2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  704) 	vmov		$R2,$D2#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  705) 	vshl.u32	$S3,$D3#lo,#2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  706) 	vmov		$R3,$D3#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  707) 	vshl.u32	$S4,$D4#lo,#2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  708) 	vmov		$R4,$D4#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  709) 	vadd.i32	$S1,$S1,$D1#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  710) 	vadd.i32	$S2,$S2,$D2#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  711) 	vadd.i32	$S3,$S3,$D3#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  712) 	vadd.i32	$S4,$S4,$D4#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  713) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  714) 	vst4.32		{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  715) 	vst4.32		{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  716) 	vst4.32		{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  717) 	vst4.32		{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  718) 	vst1.32		{${S4}[0]},[$tbl0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  719) 	vst1.32		{${S4}[1]},[$tbl1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  720) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  721) .Lno_init_neon:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  722) 	ret				@ bx	lr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  723) .size	poly1305_init_neon,.-poly1305_init_neon
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  724) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  725) .type	poly1305_blocks_neon,%function
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  726) .align	5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  727) poly1305_blocks_neon:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  728) .Lpoly1305_blocks_neon:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  729) 	ldr	ip,[$ctx,#36]		@ is_base2_26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  730) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  731) 	cmp	$len,#64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  732) 	blo	.Lpoly1305_blocks
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  733) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  734) 	stmdb	sp!,{r4-r7}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  735) 	vstmdb	sp!,{d8-d15}		@ ABI specification says so
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  736) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  737) 	tst	ip,ip			@ is_base2_26?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  738) 	bne	.Lbase2_26_neon
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  739) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  740) 	stmdb	sp!,{r1-r3,lr}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  741) 	bl	.Lpoly1305_init_neon
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  742) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  743) 	ldr	r4,[$ctx,#0]		@ load hash value base 2^32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  744) 	ldr	r5,[$ctx,#4]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  745) 	ldr	r6,[$ctx,#8]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  746) 	ldr	r7,[$ctx,#12]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  747) 	ldr	ip,[$ctx,#16]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  748) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  749) 	and	r2,r4,#0x03ffffff	@ base 2^32 -> base 2^26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  750) 	mov	r3,r4,lsr#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  751) 	 veor	$D0#lo,$D0#lo,$D0#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  752) 	mov	r4,r5,lsr#20
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  753) 	orr	r3,r3,r5,lsl#6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  754) 	 veor	$D1#lo,$D1#lo,$D1#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  755) 	mov	r5,r6,lsr#14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  756) 	orr	r4,r4,r6,lsl#12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  757) 	 veor	$D2#lo,$D2#lo,$D2#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  758) 	mov	r6,r7,lsr#8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  759) 	orr	r5,r5,r7,lsl#18
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  760) 	 veor	$D3#lo,$D3#lo,$D3#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  761) 	and	r3,r3,#0x03ffffff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  762) 	orr	r6,r6,ip,lsl#24
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  763) 	 veor	$D4#lo,$D4#lo,$D4#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  764) 	and	r4,r4,#0x03ffffff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  765) 	mov	r1,#1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  766) 	and	r5,r5,#0x03ffffff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  767) 	str	r1,[$ctx,#36]		@ set is_base2_26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  768) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  769) 	vmov.32	$D0#lo[0],r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  770) 	vmov.32	$D1#lo[0],r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  771) 	vmov.32	$D2#lo[0],r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  772) 	vmov.32	$D3#lo[0],r5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  773) 	vmov.32	$D4#lo[0],r6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  774) 	adr	$zeros,.Lzeros
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  775) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  776) 	ldmia	sp!,{r1-r3,lr}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  777) 	b	.Lhash_loaded
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  778) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  779) .align	4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  780) .Lbase2_26_neon:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  781) 	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  782) 	@ load hash value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  783) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  784) 	veor		$D0#lo,$D0#lo,$D0#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  785) 	veor		$D1#lo,$D1#lo,$D1#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  786) 	veor		$D2#lo,$D2#lo,$D2#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  787) 	veor		$D3#lo,$D3#lo,$D3#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  788) 	veor		$D4#lo,$D4#lo,$D4#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  789) 	vld4.32		{$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  790) 	adr		$zeros,.Lzeros
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  791) 	vld1.32		{$D4#lo[0]},[$ctx]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  792) 	sub		$ctx,$ctx,#16		@ rewind
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  793) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  794) .Lhash_loaded:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  795) 	add		$in2,$inp,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  796) 	mov		$padbit,$padbit,lsl#24
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  797) 	tst		$len,#31
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  798) 	beq		.Leven
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  799) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  800) 	vld4.32		{$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  801) 	vmov.32		$H4#lo[0],$padbit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  802) 	sub		$len,$len,#16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  803) 	add		$in2,$inp,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  804) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  805) # ifdef	__ARMEB__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  806) 	vrev32.8	$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  807) 	vrev32.8	$H3,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  808) 	vrev32.8	$H1,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  809) 	vrev32.8	$H2,$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  810) # endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  811) 	vsri.u32	$H4#lo,$H3#lo,#8	@ base 2^32 -> base 2^26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  812) 	vshl.u32	$H3#lo,$H3#lo,#18
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  813) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  814) 	vsri.u32	$H3#lo,$H2#lo,#14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  815) 	vshl.u32	$H2#lo,$H2#lo,#12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  816) 	vadd.i32	$H4#hi,$H4#lo,$D4#lo	@ add hash value and move to #hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  817) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  818) 	vbic.i32	$H3#lo,#0xfc000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  819) 	vsri.u32	$H2#lo,$H1#lo,#20
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  820) 	vshl.u32	$H1#lo,$H1#lo,#6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  821) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  822) 	vbic.i32	$H2#lo,#0xfc000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  823) 	vsri.u32	$H1#lo,$H0#lo,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  824) 	vadd.i32	$H3#hi,$H3#lo,$D3#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  825) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  826) 	vbic.i32	$H0#lo,#0xfc000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  827) 	vbic.i32	$H1#lo,#0xfc000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  828) 	vadd.i32	$H2#hi,$H2#lo,$D2#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  829) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  830) 	vadd.i32	$H0#hi,$H0#lo,$D0#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  831) 	vadd.i32	$H1#hi,$H1#lo,$D1#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  832) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  833) 	mov		$tbl1,$zeros
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  834) 	add		$tbl0,$ctx,#48
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  835) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  836) 	cmp		$len,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  837) 	b		.Long_tail
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  838) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  839) .align	4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  840) .Leven:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  841) 	subs		$len,$len,#64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  842) 	it		lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  843) 	movlo		$in2,$zeros
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  844) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  845) 	vmov.i32	$H4,#1<<24		@ padbit, yes, always
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  846) 	vld4.32		{$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp]	@ inp[0:1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  847) 	add		$inp,$inp,#64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  848) 	vld4.32		{$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2]	@ inp[2:3] (or 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  849) 	add		$in2,$in2,#64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  850) 	itt		hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  851) 	addhi		$tbl1,$ctx,#(48+1*9*4)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  852) 	addhi		$tbl0,$ctx,#(48+3*9*4)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  853) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  854) # ifdef	__ARMEB__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  855) 	vrev32.8	$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  856) 	vrev32.8	$H3,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  857) 	vrev32.8	$H1,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  858) 	vrev32.8	$H2,$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  859) # endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  860) 	vsri.u32	$H4,$H3,#8		@ base 2^32 -> base 2^26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  861) 	vshl.u32	$H3,$H3,#18
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  862) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  863) 	vsri.u32	$H3,$H2,#14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  864) 	vshl.u32	$H2,$H2,#12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  865) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  866) 	vbic.i32	$H3,#0xfc000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  867) 	vsri.u32	$H2,$H1,#20
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  868) 	vshl.u32	$H1,$H1,#6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  869) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  870) 	vbic.i32	$H2,#0xfc000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  871) 	vsri.u32	$H1,$H0,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  872) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  873) 	vbic.i32	$H0,#0xfc000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  874) 	vbic.i32	$H1,#0xfc000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  875) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  876) 	bls		.Lskip_loop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  877) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  878) 	vld4.32		{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!	@ load r^2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  879) 	vld4.32		{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!	@ load r^4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  880) 	vld4.32		{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  881) 	vld4.32		{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  882) 	b		.Loop_neon
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  883) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  884) .align	5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  885) .Loop_neon:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  886) 	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  887) 	@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  888) 	@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  889) 	@   \___________________/
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  890) 	@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  891) 	@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  892) 	@   \___________________/ \____________________/
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  893) 	@
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  894) 	@ Note that we start with inp[2:3]*r^2. This is because it
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  895) 	@ doesn't depend on reduction in previous iteration.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  896) 	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  897) 	@ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  898) 	@ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  899) 	@ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  900) 	@ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  901) 	@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  902) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  903) 	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  904) 	@ inp[2:3]*r^2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  905) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  906) 	vadd.i32	$H2#lo,$H2#lo,$D2#lo	@ accumulate inp[0:1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  907) 	vmull.u32	$D2,$H2#hi,${R0}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  908) 	vadd.i32	$H0#lo,$H0#lo,$D0#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  909) 	vmull.u32	$D0,$H0#hi,${R0}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  910) 	vadd.i32	$H3#lo,$H3#lo,$D3#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  911) 	vmull.u32	$D3,$H3#hi,${R0}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  912) 	vmlal.u32	$D2,$H1#hi,${R1}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  913) 	vadd.i32	$H1#lo,$H1#lo,$D1#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  914) 	vmull.u32	$D1,$H1#hi,${R0}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  915) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  916) 	vadd.i32	$H4#lo,$H4#lo,$D4#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  917) 	vmull.u32	$D4,$H4#hi,${R0}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  918) 	subs		$len,$len,#64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  919) 	vmlal.u32	$D0,$H4#hi,${S1}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  920) 	it		lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  921) 	movlo		$in2,$zeros
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  922) 	vmlal.u32	$D3,$H2#hi,${R1}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  923) 	vld1.32		${S4}[1],[$tbl1,:32]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  924) 	vmlal.u32	$D1,$H0#hi,${R1}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  925) 	vmlal.u32	$D4,$H3#hi,${R1}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  926) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  927) 	vmlal.u32	$D0,$H3#hi,${S2}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  928) 	vmlal.u32	$D3,$H1#hi,${R2}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  929) 	vmlal.u32	$D4,$H2#hi,${R2}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  930) 	vmlal.u32	$D1,$H4#hi,${S2}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  931) 	vmlal.u32	$D2,$H0#hi,${R2}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  932) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  933) 	vmlal.u32	$D3,$H0#hi,${R3}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  934) 	vmlal.u32	$D0,$H2#hi,${S3}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  935) 	vmlal.u32	$D4,$H1#hi,${R3}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  936) 	vmlal.u32	$D1,$H3#hi,${S3}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  937) 	vmlal.u32	$D2,$H4#hi,${S3}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  938) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  939) 	vmlal.u32	$D3,$H4#hi,${S4}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  940) 	vmlal.u32	$D0,$H1#hi,${S4}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  941) 	vmlal.u32	$D4,$H0#hi,${R4}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  942) 	vmlal.u32	$D1,$H2#hi,${S4}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  943) 	vmlal.u32	$D2,$H3#hi,${S4}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  944) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  945) 	vld4.32		{$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2]	@ inp[2:3] (or 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  946) 	add		$in2,$in2,#64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  947) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  948) 	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  949) 	@ (hash+inp[0:1])*r^4 and accumulate
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  950) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  951) 	vmlal.u32	$D3,$H3#lo,${R0}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  952) 	vmlal.u32	$D0,$H0#lo,${R0}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  953) 	vmlal.u32	$D4,$H4#lo,${R0}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  954) 	vmlal.u32	$D1,$H1#lo,${R0}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  955) 	vmlal.u32	$D2,$H2#lo,${R0}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  956) 	vld1.32		${S4}[0],[$tbl0,:32]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  957) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  958) 	vmlal.u32	$D3,$H2#lo,${R1}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  959) 	vmlal.u32	$D0,$H4#lo,${S1}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  960) 	vmlal.u32	$D4,$H3#lo,${R1}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  961) 	vmlal.u32	$D1,$H0#lo,${R1}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  962) 	vmlal.u32	$D2,$H1#lo,${R1}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  963) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  964) 	vmlal.u32	$D3,$H1#lo,${R2}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  965) 	vmlal.u32	$D0,$H3#lo,${S2}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  966) 	vmlal.u32	$D4,$H2#lo,${R2}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  967) 	vmlal.u32	$D1,$H4#lo,${S2}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  968) 	vmlal.u32	$D2,$H0#lo,${R2}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  969) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  970) 	vmlal.u32	$D3,$H0#lo,${R3}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  971) 	vmlal.u32	$D0,$H2#lo,${S3}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  972) 	vmlal.u32	$D4,$H1#lo,${R3}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  973) 	vmlal.u32	$D1,$H3#lo,${S3}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  974) 	vmlal.u32	$D3,$H4#lo,${S4}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  975) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  976) 	vmlal.u32	$D2,$H4#lo,${S3}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  977) 	vmlal.u32	$D0,$H1#lo,${S4}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  978) 	vmlal.u32	$D4,$H0#lo,${R4}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  979) 	vmov.i32	$H4,#1<<24		@ padbit, yes, always
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  980) 	vmlal.u32	$D1,$H2#lo,${S4}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  981) 	vmlal.u32	$D2,$H3#lo,${S4}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  982) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  983) 	vld4.32		{$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp]	@ inp[0:1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  984) 	add		$inp,$inp,#64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  985) # ifdef	__ARMEB__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  986) 	vrev32.8	$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  987) 	vrev32.8	$H1,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  988) 	vrev32.8	$H2,$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  989) 	vrev32.8	$H3,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  990) # endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  991) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  992) 	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  993) 	@ lazy reduction interleaved with base 2^32 -> base 2^26 of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  994) 	@ inp[0:3] previously loaded to $H0-$H3 and smashed to $H0-$H4.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  995) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  996) 	vshr.u64	$T0,$D3,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  997) 	vmovn.i64	$D3#lo,$D3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  998) 	 vshr.u64	$T1,$D0,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  999) 	 vmovn.i64	$D0#lo,$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1000) 	vadd.i64	$D4,$D4,$T0		@ h3 -> h4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1001) 	vbic.i32	$D3#lo,#0xfc000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1002) 	  vsri.u32	$H4,$H3,#8		@ base 2^32 -> base 2^26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1003) 	 vadd.i64	$D1,$D1,$T1		@ h0 -> h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1004) 	  vshl.u32	$H3,$H3,#18
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1005) 	 vbic.i32	$D0#lo,#0xfc000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1006) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1007) 	vshrn.u64	$T0#lo,$D4,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1008) 	vmovn.i64	$D4#lo,$D4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1009) 	 vshr.u64	$T1,$D1,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1010) 	 vmovn.i64	$D1#lo,$D1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1011) 	 vadd.i64	$D2,$D2,$T1		@ h1 -> h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1012) 	  vsri.u32	$H3,$H2,#14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1013) 	vbic.i32	$D4#lo,#0xfc000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1014) 	  vshl.u32	$H2,$H2,#12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1015) 	 vbic.i32	$D1#lo,#0xfc000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1016) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1017) 	vadd.i32	$D0#lo,$D0#lo,$T0#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1018) 	vshl.u32	$T0#lo,$T0#lo,#2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1019) 	  vbic.i32	$H3,#0xfc000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1020) 	 vshrn.u64	$T1#lo,$D2,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1021) 	 vmovn.i64	$D2#lo,$D2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1022) 	vaddl.u32	$D0,$D0#lo,$T0#lo	@ h4 -> h0 [widen for a sec]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1023) 	  vsri.u32	$H2,$H1,#20
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1024) 	 vadd.i32	$D3#lo,$D3#lo,$T1#lo	@ h2 -> h3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1025) 	  vshl.u32	$H1,$H1,#6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1026) 	 vbic.i32	$D2#lo,#0xfc000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1027) 	  vbic.i32	$H2,#0xfc000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1028) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1029) 	vshrn.u64	$T0#lo,$D0,#26		@ re-narrow
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1030) 	vmovn.i64	$D0#lo,$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1031) 	  vsri.u32	$H1,$H0,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1032) 	  vbic.i32	$H0,#0xfc000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1033) 	 vshr.u32	$T1#lo,$D3#lo,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1034) 	 vbic.i32	$D3#lo,#0xfc000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1035) 	vbic.i32	$D0#lo,#0xfc000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1036) 	vadd.i32	$D1#lo,$D1#lo,$T0#lo	@ h0 -> h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1037) 	 vadd.i32	$D4#lo,$D4#lo,$T1#lo	@ h3 -> h4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1038) 	  vbic.i32	$H1,#0xfc000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1039) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1040) 	bhi		.Loop_neon
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1041) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1042) .Lskip_loop:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1043) 	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1044) 	@ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1045) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1046) 	add		$tbl1,$ctx,#(48+0*9*4)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1047) 	add		$tbl0,$ctx,#(48+1*9*4)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1048) 	adds		$len,$len,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1049) 	it		ne
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1050) 	movne		$len,#0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1051) 	bne		.Long_tail
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1052) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1053) 	vadd.i32	$H2#hi,$H2#lo,$D2#lo	@ add hash value and move to #hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1054) 	vadd.i32	$H0#hi,$H0#lo,$D0#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1055) 	vadd.i32	$H3#hi,$H3#lo,$D3#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1056) 	vadd.i32	$H1#hi,$H1#lo,$D1#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1057) 	vadd.i32	$H4#hi,$H4#lo,$D4#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1058) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1059) .Long_tail:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1060) 	vld4.32		{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!	@ load r^1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1061) 	vld4.32		{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!	@ load r^2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1062) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1063) 	vadd.i32	$H2#lo,$H2#lo,$D2#lo	@ can be redundant
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1064) 	vmull.u32	$D2,$H2#hi,$R0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1065) 	vadd.i32	$H0#lo,$H0#lo,$D0#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1066) 	vmull.u32	$D0,$H0#hi,$R0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1067) 	vadd.i32	$H3#lo,$H3#lo,$D3#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1068) 	vmull.u32	$D3,$H3#hi,$R0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1069) 	vadd.i32	$H1#lo,$H1#lo,$D1#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1070) 	vmull.u32	$D1,$H1#hi,$R0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1071) 	vadd.i32	$H4#lo,$H4#lo,$D4#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1072) 	vmull.u32	$D4,$H4#hi,$R0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1073) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1074) 	vmlal.u32	$D0,$H4#hi,$S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1075) 	vld4.32		{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1076) 	vmlal.u32	$D3,$H2#hi,$R1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1077) 	vld4.32		{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1078) 	vmlal.u32	$D1,$H0#hi,$R1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1079) 	vmlal.u32	$D4,$H3#hi,$R1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1080) 	vmlal.u32	$D2,$H1#hi,$R1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1081) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1082) 	vmlal.u32	$D3,$H1#hi,$R2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1083) 	vld1.32		${S4}[1],[$tbl1,:32]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1084) 	vmlal.u32	$D0,$H3#hi,$S2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1085) 	vld1.32		${S4}[0],[$tbl0,:32]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1086) 	vmlal.u32	$D4,$H2#hi,$R2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1087) 	vmlal.u32	$D1,$H4#hi,$S2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1088) 	vmlal.u32	$D2,$H0#hi,$R2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1089) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1090) 	vmlal.u32	$D3,$H0#hi,$R3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1091) 	 it		ne
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1092) 	 addne		$tbl1,$ctx,#(48+2*9*4)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1093) 	vmlal.u32	$D0,$H2#hi,$S3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1094) 	 it		ne
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1095) 	 addne		$tbl0,$ctx,#(48+3*9*4)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1096) 	vmlal.u32	$D4,$H1#hi,$R3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1097) 	vmlal.u32	$D1,$H3#hi,$S3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1098) 	vmlal.u32	$D2,$H4#hi,$S3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1099) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1100) 	vmlal.u32	$D3,$H4#hi,$S4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1101) 	 vorn		$MASK,$MASK,$MASK	@ all-ones, can be redundant
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1102) 	vmlal.u32	$D0,$H1#hi,$S4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1103) 	 vshr.u64	$MASK,$MASK,#38
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1104) 	vmlal.u32	$D4,$H0#hi,$R4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1105) 	vmlal.u32	$D1,$H2#hi,$S4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1106) 	vmlal.u32	$D2,$H3#hi,$S4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1107) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1108) 	beq		.Lshort_tail
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1109) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1110) 	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1111) 	@ (hash+inp[0:1])*r^4:r^3 and accumulate
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1112) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1113) 	vld4.32		{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!	@ load r^3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1114) 	vld4.32		{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!	@ load r^4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1115) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1116) 	vmlal.u32	$D2,$H2#lo,$R0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1117) 	vmlal.u32	$D0,$H0#lo,$R0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1118) 	vmlal.u32	$D3,$H3#lo,$R0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1119) 	vmlal.u32	$D1,$H1#lo,$R0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1120) 	vmlal.u32	$D4,$H4#lo,$R0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1121) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1122) 	vmlal.u32	$D0,$H4#lo,$S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1123) 	vld4.32		{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1124) 	vmlal.u32	$D3,$H2#lo,$R1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1125) 	vld4.32		{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1126) 	vmlal.u32	$D1,$H0#lo,$R1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1127) 	vmlal.u32	$D4,$H3#lo,$R1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1128) 	vmlal.u32	$D2,$H1#lo,$R1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1129) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1130) 	vmlal.u32	$D3,$H1#lo,$R2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1131) 	vld1.32		${S4}[1],[$tbl1,:32]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1132) 	vmlal.u32	$D0,$H3#lo,$S2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1133) 	vld1.32		${S4}[0],[$tbl0,:32]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1134) 	vmlal.u32	$D4,$H2#lo,$R2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1135) 	vmlal.u32	$D1,$H4#lo,$S2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1136) 	vmlal.u32	$D2,$H0#lo,$R2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1137) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1138) 	vmlal.u32	$D3,$H0#lo,$R3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1139) 	vmlal.u32	$D0,$H2#lo,$S3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1140) 	vmlal.u32	$D4,$H1#lo,$R3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1141) 	vmlal.u32	$D1,$H3#lo,$S3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1142) 	vmlal.u32	$D2,$H4#lo,$S3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1143) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1144) 	vmlal.u32	$D3,$H4#lo,$S4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1145) 	 vorn		$MASK,$MASK,$MASK	@ all-ones
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1146) 	vmlal.u32	$D0,$H1#lo,$S4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1147) 	 vshr.u64	$MASK,$MASK,#38
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1148) 	vmlal.u32	$D4,$H0#lo,$R4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1149) 	vmlal.u32	$D1,$H2#lo,$S4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1150) 	vmlal.u32	$D2,$H3#lo,$S4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1151) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1152) .Lshort_tail:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1153) 	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1154) 	@ horizontal addition
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1155) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1156) 	vadd.i64	$D3#lo,$D3#lo,$D3#hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1157) 	vadd.i64	$D0#lo,$D0#lo,$D0#hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1158) 	vadd.i64	$D4#lo,$D4#lo,$D4#hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1159) 	vadd.i64	$D1#lo,$D1#lo,$D1#hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1160) 	vadd.i64	$D2#lo,$D2#lo,$D2#hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1161) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1162) 	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1163) 	@ lazy reduction, but without narrowing
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1164) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1165) 	vshr.u64	$T0,$D3,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1166) 	vand.i64	$D3,$D3,$MASK
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1167) 	 vshr.u64	$T1,$D0,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1168) 	 vand.i64	$D0,$D0,$MASK
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1169) 	vadd.i64	$D4,$D4,$T0		@ h3 -> h4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1170) 	 vadd.i64	$D1,$D1,$T1		@ h0 -> h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1171) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1172) 	vshr.u64	$T0,$D4,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1173) 	vand.i64	$D4,$D4,$MASK
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1174) 	 vshr.u64	$T1,$D1,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1175) 	 vand.i64	$D1,$D1,$MASK
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1176) 	 vadd.i64	$D2,$D2,$T1		@ h1 -> h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1177) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1178) 	vadd.i64	$D0,$D0,$T0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1179) 	vshl.u64	$T0,$T0,#2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1180) 	 vshr.u64	$T1,$D2,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1181) 	 vand.i64	$D2,$D2,$MASK
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1182) 	vadd.i64	$D0,$D0,$T0		@ h4 -> h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1183) 	 vadd.i64	$D3,$D3,$T1		@ h2 -> h3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1184) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1185) 	vshr.u64	$T0,$D0,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1186) 	vand.i64	$D0,$D0,$MASK
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1187) 	 vshr.u64	$T1,$D3,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1188) 	 vand.i64	$D3,$D3,$MASK
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1189) 	vadd.i64	$D1,$D1,$T0		@ h0 -> h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1190) 	 vadd.i64	$D4,$D4,$T1		@ h3 -> h4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1191) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1192) 	cmp		$len,#0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1193) 	bne		.Leven
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1194) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1195) 	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1196) 	@ store hash value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1197) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1198) 	vst4.32		{$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1199) 	vst1.32		{$D4#lo[0]},[$ctx]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1200) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1201) 	vldmia	sp!,{d8-d15}			@ epilogue
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1202) 	ldmia	sp!,{r4-r7}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1203) 	ret					@ bx	lr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1204) .size	poly1305_blocks_neon,.-poly1305_blocks_neon
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1205) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1206) .align	5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1207) .Lzeros:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1208) .long	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1209) #ifndef	__KERNEL__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1210) .LOPENSSL_armcap:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1211) # ifdef	_WIN32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1212) .word	OPENSSL_armcap_P
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1213) # else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1214) .word	OPENSSL_armcap_P-.Lpoly1305_init
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1215) # endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1216) .comm	OPENSSL_armcap_P,4,4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1217) .hidden	OPENSSL_armcap_P
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1218) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1219) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1220) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1221) }	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1222) $code.=<<___;	# append the identification string and a final alignment directive to the generated assembly; \@ is escaped so the heredoc does not interpolate it as an array
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1223) .asciz	"Poly1305 for ARMv4/NEON, CRYPTOGAMS by \@dot-asm"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1224) .align	2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1225) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1226) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1227) foreach (split("\n",$code)) {	# post-process the accumulated assembly text one line at a time (line is in $_)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1228) 	s/\`([^\`]*)\`/eval $1/geo;	# replace each backquoted span with the result of eval'ing it as Perl
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1229) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1230) 	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo	or	# qN#lo -> d(2N), qN#hi -> d(2N+1); the "or" chain stops at the first substitution that matched
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1231) 	s/\bret\b/bx	lr/go						or	# "ret" -> "bx lr"; because the chain stops here, this result is not rewritten by the next rule
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1232) 	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4 (only reached for lines the two rules above left untouched)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1233) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1234) 	print $_,"\n";	# emit the processed line; split() removed the newline, so add it back
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1235) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1236) close STDOUT or die "error closing STDOUT: $!"; # enforce flush; abort with a non-zero exit if buffered output could not be written, so a truncated assembly file is never mistaken for success