^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1) #!/usr/bin/env perl
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2) # SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4) # ====================================================================
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5) # Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6) # project.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7) # ====================================================================
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9) # IALU(*)/gcc-4.4 NEON
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 10) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 11) # ARM11xx(ARMv6) 7.78/+100% -
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 12) # Cortex-A5 6.35/+130% 3.00
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 13) # Cortex-A8 6.25/+115% 2.36
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 14) # Cortex-A9 5.10/+95% 2.55
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 15) # Cortex-A15 3.85/+85% 1.25(**)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 16) # Snapdragon S4 5.70/+100% 1.48(**)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 17) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 18) # (*) this is for -march=armv6, i.e. with a bunch of ldrb loading data;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 19) # (**) these are trade-off results, they can be improved by ~8% but at
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 20) # the cost of 15/12% regression on Cortex-A5/A7, it's even possible
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 21) # to improve Cortex-A9 result, but then A5/A7 lose more than 20%;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 22)
# Command line: [flavour] [...] output-file
#
# The first argument is taken as the flavour unless it already looks like
# a file name (something matching \w[\w\-]*\.\w+ at its end), in which
# case it is the output file and no flavour is set.  Otherwise the
# remaining arguments are scanned for the first one that looks like a
# file name.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 23) $flavour = shift;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 24) if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 25) else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 26)
# With a flavour other than "void", everything printed to STDOUT is piped
# through arm-xlate.pl, looked up next to this script and then in
# ../../perlasm/.  Otherwise STDOUT is redirected straight to the output
# file.  Failing to set up either destination is fatal, so that a broken
# build step cannot pass silently.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 27) if ($flavour && $flavour ne "void") {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 28) $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 29) ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 30) ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 31) die "can't locate arm-xlate.pl";
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 32)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 33) open STDOUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 34) } else {
# No output name means "write to the inherited stdout", so only reopen
# STDOUT when there is a file to open.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 35) open STDOUT,">$output" or die "can't open $output: $!" if $output;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 36) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 37)
# poly1305_init -- emitted below as ARM / Thumb-2 assembly text.
#
# The first four argument registers r0-r3 get symbolic names here.  In
# this function $ctx (r0) is the context, $inp (r1) is the key pointer
# and r2 points at a two-word table that receives function addresses.
#
# What the emitted code does:
#  - preamble: outside the kernel include arm_arch.h; under __KERNEL__
#    derive __ARM_ARCH__ and __ARM_MAX_ARCH__ from __LINUX_ARM_ARCH__ and
#    rename the three integer entry points to *_arm;
#  - zero the five hash words at ctx+0..16 and the is_base2_26 flag at
#    ctx+36, then return 0 immediately if the key pointer is NULL;
#  - assemble the 16 key bytes with byte loads, mask the first word with
#    0x0fffffff and the other three with 0x0ffffffc, and store the four
#    words at ctx+20..32 ($ctx has been advanced by 20 by then);
#  - when __ARM_MAX_ARCH__>=7, store -1 at ctx+48, the word that
#    poly1305_init_neon compares against -1 before doing its work;
#  - in non-kernel ARMv7+ builds, test the OPENSSL_armcap_P word for
#    ARMV7_NEON and write two addresses through r2: poly1305_blocks or
#    poly1305_blocks_neon, followed by poly1305_emit; return 1.  In all
#    other builds return 0.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 38) ($ctx,$inp,$len,$padbit)=map("r$_",(0..3));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 39)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 40) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 41) #ifndef __KERNEL__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 42) # include "arm_arch.h"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 43) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 44) # define __ARM_ARCH__ __LINUX_ARM_ARCH__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 45) # define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 46) # define poly1305_init poly1305_init_arm
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 47) # define poly1305_blocks poly1305_blocks_arm
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 48) # define poly1305_emit poly1305_emit_arm
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 49) .globl poly1305_blocks_neon
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 50) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 51)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 52) #if defined(__thumb2__)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 53) .syntax unified
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 54) .thumb
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 55) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 56) .code 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 57) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 58)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 59) .text
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 60)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 61) .globl poly1305_emit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 62) .globl poly1305_blocks
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 63) .globl poly1305_init
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 64) .type poly1305_init,%function
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 65) .align 5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 66) poly1305_init:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 67) .Lpoly1305_init:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 68) stmdb sp!,{r4-r11}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 69)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 70) eor r3,r3,r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 71) cmp $inp,#0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 72) str r3,[$ctx,#0] @ zero hash value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 73) str r3,[$ctx,#4]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 74) str r3,[$ctx,#8]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 75) str r3,[$ctx,#12]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 76) str r3,[$ctx,#16]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 77) str r3,[$ctx,#36] @ clear is_base2_26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 78) add $ctx,$ctx,#20
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 79)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 80) #ifdef __thumb2__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 81) it eq
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 82) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 83) moveq r0,#0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 84) beq .Lno_key
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 85)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 86) #if __ARM_MAX_ARCH__>=7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 87) mov r3,#-1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 88) str r3,[$ctx,#28] @ impossible key power value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 89) # ifndef __KERNEL__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 90) adr r11,.Lpoly1305_init
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 91) ldr r12,.LOPENSSL_armcap
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 92) # endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 93) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 94) ldrb r4,[$inp,#0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 95) mov r10,#0x0fffffff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 96) ldrb r5,[$inp,#1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 97) and r3,r10,#-4 @ 0x0ffffffc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 98) ldrb r6,[$inp,#2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 99) ldrb r7,[$inp,#3]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 100) orr r4,r4,r5,lsl#8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 101) ldrb r5,[$inp,#4]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 102) orr r4,r4,r6,lsl#16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 103) ldrb r6,[$inp,#5]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 104) orr r4,r4,r7,lsl#24
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 105) ldrb r7,[$inp,#6]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 106) and r4,r4,r10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 107)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 108) #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 109) # if !defined(_WIN32)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 110) ldr r12,[r11,r12] @ OPENSSL_armcap_P
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 111) # endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 112) # if defined(__APPLE__) || defined(_WIN32)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 113) ldr r12,[r12]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 114) # endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 115) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 116) ldrb r8,[$inp,#7]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 117) orr r5,r5,r6,lsl#8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 118) ldrb r6,[$inp,#8]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 119) orr r5,r5,r7,lsl#16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 120) ldrb r7,[$inp,#9]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 121) orr r5,r5,r8,lsl#24
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 122) ldrb r8,[$inp,#10]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 123) and r5,r5,r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 124)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 125) #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 126) tst r12,#ARMV7_NEON @ check for NEON
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 127) # ifdef __thumb2__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 128) adr r9,.Lpoly1305_blocks_neon
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 129) adr r11,.Lpoly1305_blocks
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 130) it ne
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 131) movne r11,r9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 132) adr r12,.Lpoly1305_emit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 133) orr r11,r11,#1 @ thumb-ify addresses
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 134) orr r12,r12,#1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 135) # else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 136) add r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 137) ite eq
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 138) addeq r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 139) addne r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 140) # endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 141) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 142) ldrb r9,[$inp,#11]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 143) orr r6,r6,r7,lsl#8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 144) ldrb r7,[$inp,#12]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 145) orr r6,r6,r8,lsl#16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 146) ldrb r8,[$inp,#13]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 147) orr r6,r6,r9,lsl#24
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 148) ldrb r9,[$inp,#14]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 149) and r6,r6,r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 150)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 151) ldrb r10,[$inp,#15]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 152) orr r7,r7,r8,lsl#8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 153) str r4,[$ctx,#0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 154) orr r7,r7,r9,lsl#16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 155) str r5,[$ctx,#4]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 156) orr r7,r7,r10,lsl#24
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 157) str r6,[$ctx,#8]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 158) and r7,r7,r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 159) str r7,[$ctx,#12]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 160) #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 161) stmia r2,{r11,r12} @ fill functions table
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 162) mov r0,#1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 163) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 164) mov r0,#0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 165) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 166) .Lno_key:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 167) ldmia sp!,{r4-r11}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 168) #if __ARM_ARCH__>=5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 169) ret @ bx lr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 170) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 171) tst lr,#1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 172) moveq pc,lr @ be binary compatible with V4, yet
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 173) bx lr @ interoperable with Thumb ISA:-)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 174) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 175) .size poly1305_init,.-poly1305_init
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 176) ___
# poly1305_blocks -- integer-only block function, emitted as assembly.
# Arguments are the registers named earlier: $ctx (r0), $inp (r1),
# $len (r2) and $padbit (r3).
#
# Register map inside this scope: $h0-$h4 (r4-r8) hold the accumulator
# and $r0-$r3 (r9-r12) the key words.  $s1-$s3 are the very same
# registers as $r1-$r3; inside the loop they temporarily hold
# r[i]+(r[i]>>2), which is why $r1-$r3 are spilled to the stack up front
# and reloaded in the middle of the multiplication.
#
# Stack frame (32 bytes below the saved registers):
#   sp+0, sp+4    low two words of the product ("future" h0 and h1)
#   sp+8          input pointer
#   sp+12         $ctx, already advanced by 20 (i.e. pointing at the key)
#   sp+16         end-of-input pointer
#   sp+20..28     key words r1, r2, r3
#
# Flow of the emitted code: round len down to a multiple of 16 and return
# if nothing is left.  On ARMv7+ the hash is converted from base 2^26 to
# base 2^32 when is_base2_26 is non-zero, and the flag is cleared.  Then,
# for every 16-byte block, the input is added to h, h is multiplied by
# the key, and whatever ends up at bit 130 and above is folded back in
# multiplied by 5.  After the loop the five hash words are stored back.
#
# NOTE(review): the conditional "addhi $h4,$h4,#1" sees the flags of
# "cmp $padbit,#0" only on the first iteration.  On every later iteration
# the flags come from "cmp r0,lr" / "bhi .Loop", so the bit is always
# added there.  Presumably padbit==0 is only ever passed together with a
# single block -- confirm against the callers.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 177) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 178) my ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 179) my ($s1,$s2,$s3)=($r1,$r2,$r3);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 180)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 181) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 182) .type poly1305_blocks,%function
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 183) .align 5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 184) poly1305_blocks:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 185) .Lpoly1305_blocks:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 186) stmdb sp!,{r3-r11,lr}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 187)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 188) ands $len,$len,#-16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 189) beq .Lno_data
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 190)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 191) add $len,$len,$inp @ end pointer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 192) sub sp,sp,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 193)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 194) #if __ARM_ARCH__<7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 195) ldmia $ctx,{$h0-$r3} @ load context
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 196) add $ctx,$ctx,#20
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 197) str $len,[sp,#16] @ offload stuff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 198) str $ctx,[sp,#12]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 199) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 200) ldr lr,[$ctx,#36] @ is_base2_26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 201) ldmia $ctx!,{$h0-$h4} @ load hash value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 202) str $len,[sp,#16] @ offload stuff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 203) str $ctx,[sp,#12]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 204)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 205) adds $r0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 206) mov $r1,$h1,lsr#6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 207) adcs $r1,$r1,$h2,lsl#20
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 208) mov $r2,$h2,lsr#12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 209) adcs $r2,$r2,$h3,lsl#14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 210) mov $r3,$h3,lsr#18
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 211) adcs $r3,$r3,$h4,lsl#8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 212) mov $len,#0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 213) teq lr,#0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 214) str $len,[$ctx,#16] @ clear is_base2_26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 215) adc $len,$len,$h4,lsr#24
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 216)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 217) itttt ne
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 218) movne $h0,$r0 @ choose between radixes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 219) movne $h1,$r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 220) movne $h2,$r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 221) movne $h3,$r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 222) ldmia $ctx,{$r0-$r3} @ load key
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 223) it ne
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 224) movne $h4,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 225) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 226)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 227) mov lr,$inp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 228) cmp $padbit,#0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 229) str $r1,[sp,#20]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 230) str $r2,[sp,#24]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 231) str $r3,[sp,#28]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 232) b .Loop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 233)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 234) .align 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 235) .Loop:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 236) #if __ARM_ARCH__<7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 237) ldrb r0,[lr],#16 @ load input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 238) # ifdef __thumb2__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 239) it hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 240) # endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 241) addhi $h4,$h4,#1 @ 1<<128
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 242) ldrb r1,[lr,#-15]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 243) ldrb r2,[lr,#-14]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 244) ldrb r3,[lr,#-13]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 245) orr r1,r0,r1,lsl#8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 246) ldrb r0,[lr,#-12]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 247) orr r2,r1,r2,lsl#16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 248) ldrb r1,[lr,#-11]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 249) orr r3,r2,r3,lsl#24
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 250) ldrb r2,[lr,#-10]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 251) adds $h0,$h0,r3 @ accumulate input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 252)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 253) ldrb r3,[lr,#-9]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 254) orr r1,r0,r1,lsl#8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 255) ldrb r0,[lr,#-8]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 256) orr r2,r1,r2,lsl#16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 257) ldrb r1,[lr,#-7]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 258) orr r3,r2,r3,lsl#24
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 259) ldrb r2,[lr,#-6]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 260) adcs $h1,$h1,r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 261)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 262) ldrb r3,[lr,#-5]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 263) orr r1,r0,r1,lsl#8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 264) ldrb r0,[lr,#-4]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 265) orr r2,r1,r2,lsl#16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 266) ldrb r1,[lr,#-3]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 267) orr r3,r2,r3,lsl#24
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 268) ldrb r2,[lr,#-2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 269) adcs $h2,$h2,r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 270)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 271) ldrb r3,[lr,#-1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 272) orr r1,r0,r1,lsl#8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 273) str lr,[sp,#8] @ offload input pointer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 274) orr r2,r1,r2,lsl#16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 275) add $s1,$r1,$r1,lsr#2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 276) orr r3,r2,r3,lsl#24
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 277) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 278) ldr r0,[lr],#16 @ load input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 279) it hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 280) addhi $h4,$h4,#1 @ padbit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 281) ldr r1,[lr,#-12]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 282) ldr r2,[lr,#-8]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 283) ldr r3,[lr,#-4]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 284) # ifdef __ARMEB__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 285) rev r0,r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 286) rev r1,r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 287) rev r2,r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 288) rev r3,r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 289) # endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 290) adds $h0,$h0,r0 @ accumulate input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 291) str lr,[sp,#8] @ offload input pointer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 292) adcs $h1,$h1,r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 293) add $s1,$r1,$r1,lsr#2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 294) adcs $h2,$h2,r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 295) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 296) add $s2,$r2,$r2,lsr#2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 297) adcs $h3,$h3,r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 298) add $s3,$r3,$r3,lsr#2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 299)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 300) umull r2,r3,$h1,$r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 301) adc $h4,$h4,#0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 302) umull r0,r1,$h0,$r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 303) umlal r2,r3,$h4,$s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 304) umlal r0,r1,$h3,$s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 305) ldr $r1,[sp,#20] @ reload $r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 306) umlal r2,r3,$h2,$s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 307) umlal r0,r1,$h1,$s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 308) umlal r2,r3,$h3,$s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 309) umlal r0,r1,$h2,$s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 310) umlal r2,r3,$h0,$r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 311) str r0,[sp,#0] @ future $h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 312) mul r0,$s2,$h4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 313) ldr $r2,[sp,#24] @ reload $r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 314) adds r2,r2,r1 @ d1+=d0>>32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 315) eor r1,r1,r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 316) adc lr,r3,#0 @ future $h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 317) str r2,[sp,#4] @ future $h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 318)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 319) mul r2,$s3,$h4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 320) eor r3,r3,r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 321) umlal r0,r1,$h3,$s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 322) ldr $r3,[sp,#28] @ reload $r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 323) umlal r2,r3,$h3,$r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 324) umlal r0,r1,$h2,$r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 325) umlal r2,r3,$h2,$r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 326) umlal r0,r1,$h1,$r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 327) umlal r2,r3,$h1,$r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 328) umlal r0,r1,$h0,$r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 329) umlal r2,r3,$h0,$r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 330) ldr $h0,[sp,#0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 331) mul $h4,$r0,$h4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 332) ldr $h1,[sp,#4]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 333)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 334) adds $h2,lr,r0 @ d2+=d1>>32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 335) ldr lr,[sp,#8] @ reload input pointer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 336) adc r1,r1,#0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 337) adds $h3,r2,r1 @ d3+=d2>>32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 338) ldr r0,[sp,#16] @ reload end pointer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 339) adc r3,r3,#0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 340) add $h4,$h4,r3 @ h4+=d3>>32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 341)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 342) and r1,$h4,#-4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 343) and $h4,$h4,#3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 344) add r1,r1,r1,lsr#2 @ *=5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 345) adds $h0,$h0,r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 346) adcs $h1,$h1,#0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 347) adcs $h2,$h2,#0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 348) adcs $h3,$h3,#0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 349) adc $h4,$h4,#0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 350)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 351) cmp r0,lr @ done yet?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 352) bhi .Loop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 353)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 354) ldr $ctx,[sp,#12]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 355) add sp,sp,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 356) stmdb $ctx,{$h0-$h4} @ store the result
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 357)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 358) .Lno_data:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 359) #if __ARM_ARCH__>=5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 360) ldmia sp!,{r3-r11,pc}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 361) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 362) ldmia sp!,{r3-r11,lr}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 363) tst lr,#1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 364) moveq pc,lr @ be binary compatible with V4, yet
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 365) bx lr @ interoperable with Thumb ISA:-)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 366) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 367) .size poly1305_blocks,.-poly1305_blocks
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 368) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 369) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 370) {
# poly1305_emit -- final reduction and tag output, emitted as assembly.
#
# Arguments: $ctx (r0), $mac (r1) and $nonce (r2).  $h0-$h4 live in r3-r7
# and $g0-$g3 in r8-r11.  $g4 reuses r0, so $ctx is no longer available
# once $g4 has been written; every load through $ctx happens before that.
#
# The emitted code loads the five hash words and, on ARMv7+, converts
# them from base 2^26 to base 2^32 when is_base2_26 (ctx+36) is non-zero.
# It then computes g = h + 5 and keeps g in place of h when bit 2 of the
# top word of g is set, adds the four nonce words with carry and writes
# the low four result words to $mac.  ARMv7+ uses word stores (byte-swapped
# first under __ARMEB__); older targets store the tag byte by byte.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 371) my ($ctx,$mac,$nonce)=map("r$_",(0..2));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 372) my ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 373) my $g4=$ctx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 374)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 375) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 376) .type poly1305_emit,%function
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 377) .align 5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 378) poly1305_emit:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 379) .Lpoly1305_emit:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 380) stmdb sp!,{r4-r11}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 381)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 382) ldmia $ctx,{$h0-$h4}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 383)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 384) #if __ARM_ARCH__>=7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 385) ldr ip,[$ctx,#36] @ is_base2_26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 386)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 387) adds $g0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 388) mov $g1,$h1,lsr#6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 389) adcs $g1,$g1,$h2,lsl#20
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 390) mov $g2,$h2,lsr#12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 391) adcs $g2,$g2,$h3,lsl#14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 392) mov $g3,$h3,lsr#18
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 393) adcs $g3,$g3,$h4,lsl#8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 394) mov $g4,#0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 395) adc $g4,$g4,$h4,lsr#24
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 396)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 397) tst ip,ip
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 398) itttt ne
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 399) movne $h0,$g0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 400) movne $h1,$g1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 401) movne $h2,$g2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 402) movne $h3,$g3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 403) it ne
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 404) movne $h4,$g4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 405) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 406)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 407) adds $g0,$h0,#5 @ compare to modulus
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 408) adcs $g1,$h1,#0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 409) adcs $g2,$h2,#0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 410) adcs $g3,$h3,#0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 411) adc $g4,$h4,#0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 412) tst $g4,#4 @ did it carry/borrow?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 413)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 414) #ifdef __thumb2__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 415) it ne
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 416) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 417) movne $h0,$g0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 418) ldr $g0,[$nonce,#0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 419) #ifdef __thumb2__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 420) it ne
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 421) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 422) movne $h1,$g1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 423) ldr $g1,[$nonce,#4]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 424) #ifdef __thumb2__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 425) it ne
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 426) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 427) movne $h2,$g2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 428) ldr $g2,[$nonce,#8]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 429) #ifdef __thumb2__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 430) it ne
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 431) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 432) movne $h3,$g3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 433) ldr $g3,[$nonce,#12]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 434)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 435) adds $h0,$h0,$g0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 436) adcs $h1,$h1,$g1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 437) adcs $h2,$h2,$g2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 438) adc $h3,$h3,$g3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 439)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 440) #if __ARM_ARCH__>=7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 441) # ifdef __ARMEB__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 442) rev $h0,$h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 443) rev $h1,$h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 444) rev $h2,$h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 445) rev $h3,$h3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 446) # endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 447) str $h0,[$mac,#0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 448) str $h1,[$mac,#4]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 449) str $h2,[$mac,#8]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 450) str $h3,[$mac,#12]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 451) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 452) strb $h0,[$mac,#0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 453) mov $h0,$h0,lsr#8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 454) strb $h1,[$mac,#4]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 455) mov $h1,$h1,lsr#8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 456) strb $h2,[$mac,#8]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 457) mov $h2,$h2,lsr#8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 458) strb $h3,[$mac,#12]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 459) mov $h3,$h3,lsr#8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 460)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 461) strb $h0,[$mac,#1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 462) mov $h0,$h0,lsr#8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 463) strb $h1,[$mac,#5]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 464) mov $h1,$h1,lsr#8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 465) strb $h2,[$mac,#9]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 466) mov $h2,$h2,lsr#8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 467) strb $h3,[$mac,#13]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 468) mov $h3,$h3,lsr#8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 469)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 470) strb $h0,[$mac,#2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 471) mov $h0,$h0,lsr#8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 472) strb $h1,[$mac,#6]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 473) mov $h1,$h1,lsr#8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 474) strb $h2,[$mac,#10]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 475) mov $h2,$h2,lsr#8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 476) strb $h3,[$mac,#14]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 477) mov $h3,$h3,lsr#8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 478)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 479) strb $h0,[$mac,#3]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 480) strb $h1,[$mac,#7]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 481) strb $h2,[$mac,#11]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 482) strb $h3,[$mac,#15]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 483) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 484) ldmia sp!,{r4-r11}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 485) #if __ARM_ARCH__>=5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 486) ret @ bx lr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 487) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 488) tst lr,#1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 489) moveq pc,lr @ be binary compatible with V4, yet
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 490) bx lr @ interoperable with Thumb ISA:-)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 491) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 492) .size poly1305_emit,.-poly1305_emit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 493) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 494) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 495) my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9)); # d0-d8 (64-bit NEON regs): R* hold key-power limbs, S* hold 5x those limbs; the tenth generated name, d9, is left unassigned
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 496) my ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14)); # D0-D4 = q5-q9: 64-bit product accumulators (their low halves also carry the hash limbs); H0-H4 = q10-q14: limbs of the input blocks
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 497) my ($T0,$T1,$MASK) = map("q$_",(15,4,0)); # scratch for carries; q4 overlaps d8/d9, so writing T1 destroys S4 and S4 must be rebuilt or reloaded afterwards; MASK (q0, overlapping d0/d1) is not referenced in the visible part of the file
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 498)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 499) my ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7)); # r4-r7: second input pointer, pointer to .Lzeros (doubles as the pass counter during table setup), and two key-power table pointers
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 500)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 501) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 502) #if __ARM_MAX_ARCH__>=7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 503) .fpu neon
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 504) @ poly1305_init_neon: builds the table of key powers r^1..r^4 for the NEON code; each power occupies nine base 2^26 words (r0,r1,5*r1,r2,5*r2,r3,5*r3,r4,5*r4) starting at ctx+48
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 505) .type poly1305_init_neon,%function
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 506) .align 5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 507) poly1305_init_neon:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 508) .Lpoly1305_init_neon:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 509) ldr r3,[$ctx,#48] @ first table element
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 510) cmp r3,#-1 @ is value impossible? (valid limbs are below 2^26)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 511) bne .Lno_init_neon @ any other value: go straight to the return (table presumably built already)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 512) @ everything below clobbers r2-r7, d0-d19 and q15 and saves none of them; poly1305_blocks_neon saves r4-r7, d8-d15 and r1-r3,lr before calling
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 513) ldr r4,[$ctx,#20] @ load key base 2^32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 514) ldr r5,[$ctx,#24]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 515) ldr r6,[$ctx,#28]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 516) ldr r7,[$ctx,#32]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 517)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 518) and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26; limb0 = bits 0..25
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 519) mov r3,r4,lsr#26 @ limb1 = bits 26..51
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 520) mov r4,r5,lsr#20 @ limb2 = bits 52..77
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 521) orr r3,r3,r5,lsl#6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 522) mov r5,r6,lsr#14 @ limb3 = bits 78..103
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 523) orr r4,r4,r6,lsl#12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 524) mov r6,r7,lsr#8 @ limb4 = bits 104..127, at most 24 bits wide so no mask is needed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 525) orr r5,r5,r7,lsl#18
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 526) and r3,r3,#0x03ffffff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 527) and r4,r4,#0x03ffffff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 528) and r5,r5,#0x03ffffff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 529)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 530) vdup.32 $R0,r2 @ r^1 in both lanes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 531) add r2,r3,r3,lsl#2 @ *5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 532) vdup.32 $R1,r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 533) add r3,r4,r4,lsl#2 @ 5*limb2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 534) vdup.32 $S1,r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 535) vdup.32 $R2,r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 536) add r4,r5,r5,lsl#2 @ 5*limb3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 537) vdup.32 $S2,r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 538) vdup.32 $R3,r5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 539) add r5,r6,r6,lsl#2 @ 5*limb4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 540) vdup.32 $S3,r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 541) vdup.32 $R4,r6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 542) vdup.32 $S4,r5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 543)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 544) mov $zeros,#2 @ counter: the multiply-and-reduce body below runs twice
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 545)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 546) .Lsquare_neon:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 547) @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 548) @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 549) @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 550) @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 551) @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 552) @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 553) @ in this routine h is the whole R vector (both lanes) and r, 5*r are the lane-1 scalars of the R, S vectors
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 554) vmull.u32 $D0,$R0,${R0}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 555) vmull.u32 $D1,$R1,${R0}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 556) vmull.u32 $D2,$R2,${R0}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 557) vmull.u32 $D3,$R3,${R0}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 558) vmull.u32 $D4,$R4,${R0}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 559)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 560) vmlal.u32 $D0,$R4,${S1}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 561) vmlal.u32 $D1,$R0,${R1}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 562) vmlal.u32 $D2,$R1,${R1}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 563) vmlal.u32 $D3,$R2,${R1}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 564) vmlal.u32 $D4,$R3,${R1}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 565)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 566) vmlal.u32 $D0,$R3,${S2}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 567) vmlal.u32 $D1,$R4,${S2}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 568) vmlal.u32 $D3,$R1,${R2}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 569) vmlal.u32 $D2,$R0,${R2}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 570) vmlal.u32 $D4,$R2,${R2}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 571)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 572) vmlal.u32 $D0,$R2,${S3}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 573) vmlal.u32 $D3,$R0,${R3}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 574) vmlal.u32 $D1,$R3,${S3}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 575) vmlal.u32 $D2,$R4,${S3}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 576) vmlal.u32 $D4,$R1,${R3}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 577)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 578) vmlal.u32 $D3,$R4,${S4}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 579) vmlal.u32 $D0,$R1,${S4}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 580) vmlal.u32 $D1,$R2,${S4}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 581) vmlal.u32 $D2,$R3,${S4}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 582) vmlal.u32 $D4,$R0,${R4}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 583)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 584) @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 585) @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 586) @ and P. Schwabe
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 587) @
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 588) @ H0>>+H1>>+H2>>+H3>>+H4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 589) @ H3>>+H4>>*5+H0>>+H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 590) @
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 591) @ Trivia.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 592) @
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 593) @ Result of multiplication of n-bit number by m-bit number is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 594) @ n+m bits wide. However! Even though 2^n is a n+1-bit number,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 595) @ m-bit number multiplied by 2^n is still n+m bits wide.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 596) @
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 597) @ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 598) @ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 599) @ one is n+1 bits wide.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 600) @
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 601) @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 602) @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 603) @ can be 27. However! In cases when their width exceeds 26 bits
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 604) @ they are limited by 2^26+2^6. This in turn means that *sum*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 605) @ of the products with these values can still be viewed as sum
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 606) @ of 52-bit numbers as long as the number of addends is not a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 607) @ power of 2. For example,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 608) @
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 609) @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 610) @
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 611) @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 612) @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 613) @ 8 * (2^52) or 2^55. However, the value is then multiplied by
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 614) @ 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 615) @ which is less than 32 * (2^52) or 2^57. And when processing
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 616) @ data we are looking at three times as many addends...
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 617) @
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 618) @ In key setup procedure pre-reduced H0 is limited by 5*4+1 and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 619) @ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 620) @ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 621) @ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 622) @ instruction accepts 2x32-bit input and writes 2x64-bit result.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 623) @ This means that the result of reduction has to be compressed upon
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 624) @ loop wrap-around. This can be done in the process of reduction
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 625) @ to minimize the number of instructions [as well as the number of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 626) @ 128-bit instructions, which benefits low-end processors], but
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 627) @ one has to watch for H2 (which is narrower than H0) and 5*H4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 628) @ not being wider than 58 bits, so that result of right shift
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 629) @ by 26 bits fits in 32 bits. This is also useful on x86,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 630) @ because it allows using paddd in place of paddq, which
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 631) @ benefits Atom, where paddq is ridiculously slow.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 632) @ the two carry chains from the diagram above are interleaved below; results are narrowed into the low halves of D0-D4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 633) vshr.u64 $T0,$D3,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 634) vmovn.i64 $D3#lo,$D3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 635) vshr.u64 $T1,$D0,#26 @ T1 is q4 and overlaps d8, so S4 is destroyed here and rebuilt after the reduction
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 636) vmovn.i64 $D0#lo,$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 637) vadd.i64 $D4,$D4,$T0 @ h3 -> h4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 638) vbic.i32 $D3#lo,#0xfc000000 @ &=0x03ffffff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 639) vadd.i64 $D1,$D1,$T1 @ h0 -> h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 640) vbic.i32 $D0#lo,#0xfc000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 641)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 642) vshrn.u64 $T0#lo,$D4,#26 @ carry out of h4, narrowed to 32 bits
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 643) vmovn.i64 $D4#lo,$D4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 644) vshr.u64 $T1,$D1,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 645) vmovn.i64 $D1#lo,$D1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 646) vadd.i64 $D2,$D2,$T1 @ h1 -> h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 647) vbic.i32 $D4#lo,#0xfc000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 648) vbic.i32 $D1#lo,#0xfc000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 649)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 650) vadd.i32 $D0#lo,$D0#lo,$T0#lo @ h0 += carry (first of five)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 651) vshl.u32 $T0#lo,$T0#lo,#2 @ carry*4, added below to make carry*5 in total
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 652) vshrn.u64 $T1#lo,$D2,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 653) vmovn.i64 $D2#lo,$D2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 654) vadd.i32 $D0#lo,$D0#lo,$T0#lo @ h4 -> h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 655) vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 656) vbic.i32 $D2#lo,#0xfc000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 657)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 658) vshr.u32 $T0#lo,$D0#lo,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 659) vbic.i32 $D0#lo,#0xfc000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 660) vshr.u32 $T1#lo,$D3#lo,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 661) vbic.i32 $D3#lo,#0xfc000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 662) vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 663) vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 664)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 665) subs $zeros,$zeros,#1 @ leaves 1 after the first pass, 0 after the second
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 666) beq .Lsquare_break_neon @ second pass finished: go store r^3 and r^4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 667) @ first pass only: both lanes of the low halves of D0-D4 now hold r^2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 668) add $tbl0,$ctx,#(48+0*9*4) @ table slot 0 (nine 32-bit words per slot)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 669) add $tbl1,$ctx,#(48+1*9*4) @ table slot 1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 670)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 671) vtrn.32 $R0,$D0#lo @ r^2:r^1 (lane 1 takes the fresh r^2, lane 0 keeps r^1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 672) vtrn.32 $R2,$D2#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 673) vtrn.32 $R3,$D3#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 674) vtrn.32 $R1,$D1#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 675) vtrn.32 $R4,$D4#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 676)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 677) vshl.u32 $S2,$R2,#2 @ *5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 678) vshl.u32 $S3,$R3,#2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 679) vshl.u32 $S1,$R1,#2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 680) vshl.u32 $S4,$R4,#2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 681) vadd.i32 $S2,$S2,$R2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 682) vadd.i32 $S1,$S1,$R1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 683) vadd.i32 $S3,$S3,$R3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 684) vadd.i32 $S4,$S4,$R4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 685) @ store lane 0 (r^1) to slot 0 and lane 1 (r^2) to slot 1, word order r0,r1,5*r1,r2 | 5*r2,r3,5*r3,r4 | 5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 686) vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 687) vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 688) vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 689) vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 690) vst1.32 {${S4}[0]},[$tbl0,:32]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 691) vst1.32 {${S4}[1]},[$tbl1,:32]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 692)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 693) b .Lsquare_neon @ second pass multiplies the r^2:r^1 pair by r^2 (lane 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 694)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 695) .align 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 696) .Lsquare_break_neon:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 697) add $tbl0,$ctx,#(48+2*4*9) @ table slot 2, receives lane 0 (r^3)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 698) add $tbl1,$ctx,#(48+3*4*9) @ table slot 3, receives lane 1 (r^4)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 699)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 700) vmov $R0,$D0#lo @ r^4:r^3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 701) vshl.u32 $S1,$D1#lo,#2 @ *5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 702) vmov $R1,$D1#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 703) vshl.u32 $S2,$D2#lo,#2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 704) vmov $R2,$D2#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 705) vshl.u32 $S3,$D3#lo,#2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 706) vmov $R3,$D3#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 707) vshl.u32 $S4,$D4#lo,#2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 708) vmov $R4,$D4#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 709) vadd.i32 $S1,$S1,$D1#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 710) vadd.i32 $S2,$S2,$D2#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 711) vadd.i32 $S3,$S3,$D3#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 712) vadd.i32 $S4,$S4,$D4#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 713)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 714) vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 715) vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 716) vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 717) vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 718) vst1.32 {${S4}[0]},[$tbl0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 719) vst1.32 {${S4}[1]},[$tbl1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 720)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 721) .Lno_init_neon:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 722) ret @ bx lr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 723) .size poly1305_init_neon,.-poly1305_init_neon
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 724)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 725) .type poly1305_blocks_neon,%function
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 726) .align 5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 727) poly1305_blocks_neon:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 728) .Lpoly1305_blocks_neon:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 729) ldr ip,[$ctx,#36] @ is_base2_26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 730)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 731) cmp $len,#64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 732) blo .Lpoly1305_blocks
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 733)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 734) stmdb sp!,{r4-r7}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 735) vstmdb sp!,{d8-d15} @ ABI specification says so
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 736)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 737) tst ip,ip @ is_base2_26?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 738) bne .Lbase2_26_neon
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 739)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 740) stmdb sp!,{r1-r3,lr}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 741) bl .Lpoly1305_init_neon
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 742)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 743) ldr r4,[$ctx,#0] @ load hash value base 2^32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 744) ldr r5,[$ctx,#4]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 745) ldr r6,[$ctx,#8]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 746) ldr r7,[$ctx,#12]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 747) ldr ip,[$ctx,#16]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 748)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 749) and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 750) mov r3,r4,lsr#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 751) veor $D0#lo,$D0#lo,$D0#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 752) mov r4,r5,lsr#20
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 753) orr r3,r3,r5,lsl#6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 754) veor $D1#lo,$D1#lo,$D1#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 755) mov r5,r6,lsr#14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 756) orr r4,r4,r6,lsl#12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 757) veor $D2#lo,$D2#lo,$D2#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 758) mov r6,r7,lsr#8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 759) orr r5,r5,r7,lsl#18
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 760) veor $D3#lo,$D3#lo,$D3#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 761) and r3,r3,#0x03ffffff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 762) orr r6,r6,ip,lsl#24
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 763) veor $D4#lo,$D4#lo,$D4#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 764) and r4,r4,#0x03ffffff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 765) mov r1,#1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 766) and r5,r5,#0x03ffffff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 767) str r1,[$ctx,#36] @ set is_base2_26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 768)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 769) vmov.32 $D0#lo[0],r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 770) vmov.32 $D1#lo[0],r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 771) vmov.32 $D2#lo[0],r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 772) vmov.32 $D3#lo[0],r5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 773) vmov.32 $D4#lo[0],r6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 774) adr $zeros,.Lzeros
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 775)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 776) ldmia sp!,{r1-r3,lr}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 777) b .Lhash_loaded
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 778)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 779) .align 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 780) .Lbase2_26_neon:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 781) @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 782) @ load hash value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 783)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 784) veor $D0#lo,$D0#lo,$D0#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 785) veor $D1#lo,$D1#lo,$D1#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 786) veor $D2#lo,$D2#lo,$D2#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 787) veor $D3#lo,$D3#lo,$D3#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 788) veor $D4#lo,$D4#lo,$D4#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 789) vld4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 790) adr $zeros,.Lzeros
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 791) vld1.32 {$D4#lo[0]},[$ctx]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 792) sub $ctx,$ctx,#16 @ rewind
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 793)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 794) .Lhash_loaded:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 795) add $in2,$inp,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 796) mov $padbit,$padbit,lsl#24
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 797) tst $len,#31
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 798) beq .Leven
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 799)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 800) vld4.32 {$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 801) vmov.32 $H4#lo[0],$padbit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 802) sub $len,$len,#16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 803) add $in2,$inp,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 804)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 805) # ifdef __ARMEB__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 806) vrev32.8 $H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 807) vrev32.8 $H3,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 808) vrev32.8 $H1,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 809) vrev32.8 $H2,$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 810) # endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 811) vsri.u32 $H4#lo,$H3#lo,#8 @ base 2^32 -> base 2^26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 812) vshl.u32 $H3#lo,$H3#lo,#18
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 813)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 814) vsri.u32 $H3#lo,$H2#lo,#14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 815) vshl.u32 $H2#lo,$H2#lo,#12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 816) vadd.i32 $H4#hi,$H4#lo,$D4#lo @ add hash value and move to #hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 817)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 818) vbic.i32 $H3#lo,#0xfc000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 819) vsri.u32 $H2#lo,$H1#lo,#20
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 820) vshl.u32 $H1#lo,$H1#lo,#6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 821)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 822) vbic.i32 $H2#lo,#0xfc000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 823) vsri.u32 $H1#lo,$H0#lo,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 824) vadd.i32 $H3#hi,$H3#lo,$D3#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 825)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 826) vbic.i32 $H0#lo,#0xfc000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 827) vbic.i32 $H1#lo,#0xfc000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 828) vadd.i32 $H2#hi,$H2#lo,$D2#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 829)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 830) vadd.i32 $H0#hi,$H0#lo,$D0#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 831) vadd.i32 $H1#hi,$H1#lo,$D1#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 832)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 833) mov $tbl1,$zeros
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 834) add $tbl0,$ctx,#48
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 835)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 836) cmp $len,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 837) b .Long_tail
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 838)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 839) .align 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 840) .Leven:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 841) subs $len,$len,#64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 842) it lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 843) movlo $in2,$zeros
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 844)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 845) vmov.i32 $H4,#1<<24 @ padbit, yes, always
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 846) vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 847) add $inp,$inp,#64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 848) vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 849) add $in2,$in2,#64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 850) itt hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 851) addhi $tbl1,$ctx,#(48+1*9*4)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 852) addhi $tbl0,$ctx,#(48+3*9*4)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 853)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 854) # ifdef __ARMEB__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 855) vrev32.8 $H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 856) vrev32.8 $H3,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 857) vrev32.8 $H1,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 858) vrev32.8 $H2,$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 859) # endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 860) vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 861) vshl.u32 $H3,$H3,#18
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 862)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 863) vsri.u32 $H3,$H2,#14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 864) vshl.u32 $H2,$H2,#12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 865)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 866) vbic.i32 $H3,#0xfc000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 867) vsri.u32 $H2,$H1,#20
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 868) vshl.u32 $H1,$H1,#6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 869)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 870) vbic.i32 $H2,#0xfc000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 871) vsri.u32 $H1,$H0,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 872)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 873) vbic.i32 $H0,#0xfc000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 874) vbic.i32 $H1,#0xfc000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 875)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 876) bls .Lskip_loop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 877)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 878) vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 879) vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 880) vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 881) vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 882) b .Loop_neon
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 883)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 884) .align 5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 885) .Loop_neon:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 886) @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 887) @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 888) @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 889) @ \___________________/
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 890) @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 891) @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 892) @ \___________________/ \____________________/
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 893) @
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 894) @ Note that we start with inp[2:3]*r^2. This is because it
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 895) @ doesn't depend on reduction in previous iteration.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 896) @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 897) @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 898) @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 899) @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 900) @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 901) @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 902)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 903) @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 904) @ inp[2:3]*r^2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 905)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 906) vadd.i32 $H2#lo,$H2#lo,$D2#lo @ accumulate inp[0:1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 907) vmull.u32 $D2,$H2#hi,${R0}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 908) vadd.i32 $H0#lo,$H0#lo,$D0#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 909) vmull.u32 $D0,$H0#hi,${R0}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 910) vadd.i32 $H3#lo,$H3#lo,$D3#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 911) vmull.u32 $D3,$H3#hi,${R0}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 912) vmlal.u32 $D2,$H1#hi,${R1}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 913) vadd.i32 $H1#lo,$H1#lo,$D1#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 914) vmull.u32 $D1,$H1#hi,${R0}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 915)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 916) vadd.i32 $H4#lo,$H4#lo,$D4#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 917) vmull.u32 $D4,$H4#hi,${R0}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 918) subs $len,$len,#64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 919) vmlal.u32 $D0,$H4#hi,${S1}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 920) it lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 921) movlo $in2,$zeros
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 922) vmlal.u32 $D3,$H2#hi,${R1}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 923) vld1.32 ${S4}[1],[$tbl1,:32]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 924) vmlal.u32 $D1,$H0#hi,${R1}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 925) vmlal.u32 $D4,$H3#hi,${R1}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 926)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 927) vmlal.u32 $D0,$H3#hi,${S2}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 928) vmlal.u32 $D3,$H1#hi,${R2}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 929) vmlal.u32 $D4,$H2#hi,${R2}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 930) vmlal.u32 $D1,$H4#hi,${S2}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 931) vmlal.u32 $D2,$H0#hi,${R2}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 932)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 933) vmlal.u32 $D3,$H0#hi,${R3}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 934) vmlal.u32 $D0,$H2#hi,${S3}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 935) vmlal.u32 $D4,$H1#hi,${R3}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 936) vmlal.u32 $D1,$H3#hi,${S3}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 937) vmlal.u32 $D2,$H4#hi,${S3}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 938)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 939) vmlal.u32 $D3,$H4#hi,${S4}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 940) vmlal.u32 $D0,$H1#hi,${S4}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 941) vmlal.u32 $D4,$H0#hi,${R4}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 942) vmlal.u32 $D1,$H2#hi,${S4}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 943) vmlal.u32 $D2,$H3#hi,${S4}[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 944)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 945) vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 946) add $in2,$in2,#64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 947)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 948) @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 949) @ (hash+inp[0:1])*r^4 and accumulate
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 950)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 951) vmlal.u32 $D3,$H3#lo,${R0}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 952) vmlal.u32 $D0,$H0#lo,${R0}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 953) vmlal.u32 $D4,$H4#lo,${R0}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 954) vmlal.u32 $D1,$H1#lo,${R0}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 955) vmlal.u32 $D2,$H2#lo,${R0}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 956) vld1.32 ${S4}[0],[$tbl0,:32]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 957)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 958) vmlal.u32 $D3,$H2#lo,${R1}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 959) vmlal.u32 $D0,$H4#lo,${S1}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 960) vmlal.u32 $D4,$H3#lo,${R1}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 961) vmlal.u32 $D1,$H0#lo,${R1}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 962) vmlal.u32 $D2,$H1#lo,${R1}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 963)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 964) vmlal.u32 $D3,$H1#lo,${R2}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 965) vmlal.u32 $D0,$H3#lo,${S2}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 966) vmlal.u32 $D4,$H2#lo,${R2}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 967) vmlal.u32 $D1,$H4#lo,${S2}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 968) vmlal.u32 $D2,$H0#lo,${R2}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 969)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 970) vmlal.u32 $D3,$H0#lo,${R3}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 971) vmlal.u32 $D0,$H2#lo,${S3}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 972) vmlal.u32 $D4,$H1#lo,${R3}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 973) vmlal.u32 $D1,$H3#lo,${S3}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 974) vmlal.u32 $D3,$H4#lo,${S4}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 975)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 976) vmlal.u32 $D2,$H4#lo,${S3}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 977) vmlal.u32 $D0,$H1#lo,${S4}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 978) vmlal.u32 $D4,$H0#lo,${R4}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 979) vmov.i32 $H4,#1<<24 @ padbit, yes, always
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 980) vmlal.u32 $D1,$H2#lo,${S4}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 981) vmlal.u32 $D2,$H3#lo,${S4}[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 982)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 983) vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 984) add $inp,$inp,#64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 985) # ifdef __ARMEB__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 986) vrev32.8 $H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 987) vrev32.8 $H1,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 988) vrev32.8 $H2,$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 989) vrev32.8 $H3,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 990) # endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 991)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 992) @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 993) @ lazy reduction interleaved with base 2^32 -> base 2^26 of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 994) @ inp[0:3] previously loaded to $H0-$H3 and smashed to $H0-$H4.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 995)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 996) vshr.u64 $T0,$D3,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 997) vmovn.i64 $D3#lo,$D3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 998) vshr.u64 $T1,$D0,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 999) vmovn.i64 $D0#lo,$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1000) vadd.i64 $D4,$D4,$T0 @ h3 -> h4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1001) vbic.i32 $D3#lo,#0xfc000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1002) vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1003) vadd.i64 $D1,$D1,$T1 @ h0 -> h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1004) vshl.u32 $H3,$H3,#18
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1005) vbic.i32 $D0#lo,#0xfc000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1006)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1007) vshrn.u64 $T0#lo,$D4,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1008) vmovn.i64 $D4#lo,$D4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1009) vshr.u64 $T1,$D1,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1010) vmovn.i64 $D1#lo,$D1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1011) vadd.i64 $D2,$D2,$T1 @ h1 -> h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1012) vsri.u32 $H3,$H2,#14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1013) vbic.i32 $D4#lo,#0xfc000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1014) vshl.u32 $H2,$H2,#12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1015) vbic.i32 $D1#lo,#0xfc000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1016)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1017) vadd.i32 $D0#lo,$D0#lo,$T0#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1018) vshl.u32 $T0#lo,$T0#lo,#2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1019) vbic.i32 $H3,#0xfc000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1020) vshrn.u64 $T1#lo,$D2,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1021) vmovn.i64 $D2#lo,$D2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1022) vaddl.u32 $D0,$D0#lo,$T0#lo @ h4 -> h0 [widen for a sec]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1023) vsri.u32 $H2,$H1,#20
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1024) vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1025) vshl.u32 $H1,$H1,#6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1026) vbic.i32 $D2#lo,#0xfc000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1027) vbic.i32 $H2,#0xfc000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1028)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1029) vshrn.u64 $T0#lo,$D0,#26 @ re-narrow
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1030) vmovn.i64 $D0#lo,$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1031) vsri.u32 $H1,$H0,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1032) vbic.i32 $H0,#0xfc000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1033) vshr.u32 $T1#lo,$D3#lo,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1034) vbic.i32 $D3#lo,#0xfc000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1035) vbic.i32 $D0#lo,#0xfc000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1036) vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1037) vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1038) vbic.i32 $H1,#0xfc000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1039)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1040) bhi .Loop_neon
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1041)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1042) .Lskip_loop:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1043) @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1044) @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1045)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1046) add $tbl1,$ctx,#(48+0*9*4)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1047) add $tbl0,$ctx,#(48+1*9*4)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1048) adds $len,$len,#32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1049) it ne
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1050) movne $len,#0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1051) bne .Long_tail
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1052)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1053) vadd.i32 $H2#hi,$H2#lo,$D2#lo @ add hash value and move to #hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1054) vadd.i32 $H0#hi,$H0#lo,$D0#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1055) vadd.i32 $H3#hi,$H3#lo,$D3#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1056) vadd.i32 $H1#hi,$H1#lo,$D1#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1057) vadd.i32 $H4#hi,$H4#lo,$D4#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1058)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1059) .Long_tail:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1060) vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1061) vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1062)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1063) vadd.i32 $H2#lo,$H2#lo,$D2#lo @ can be redundant
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1064) vmull.u32 $D2,$H2#hi,$R0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1065) vadd.i32 $H0#lo,$H0#lo,$D0#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1066) vmull.u32 $D0,$H0#hi,$R0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1067) vadd.i32 $H3#lo,$H3#lo,$D3#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1068) vmull.u32 $D3,$H3#hi,$R0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1069) vadd.i32 $H1#lo,$H1#lo,$D1#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1070) vmull.u32 $D1,$H1#hi,$R0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1071) vadd.i32 $H4#lo,$H4#lo,$D4#lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1072) vmull.u32 $D4,$H4#hi,$R0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1073)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1074) vmlal.u32 $D0,$H4#hi,$S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1075) vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1076) vmlal.u32 $D3,$H2#hi,$R1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1077) vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1078) vmlal.u32 $D1,$H0#hi,$R1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1079) vmlal.u32 $D4,$H3#hi,$R1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1080) vmlal.u32 $D2,$H1#hi,$R1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1081)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1082) vmlal.u32 $D3,$H1#hi,$R2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1083) vld1.32 ${S4}[1],[$tbl1,:32]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1084) vmlal.u32 $D0,$H3#hi,$S2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1085) vld1.32 ${S4}[0],[$tbl0,:32]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1086) vmlal.u32 $D4,$H2#hi,$R2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1087) vmlal.u32 $D1,$H4#hi,$S2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1088) vmlal.u32 $D2,$H0#hi,$R2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1089)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1090) vmlal.u32 $D3,$H0#hi,$R3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1091) it ne
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1092) addne $tbl1,$ctx,#(48+2*9*4)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1093) vmlal.u32 $D0,$H2#hi,$S3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1094) it ne
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1095) addne $tbl0,$ctx,#(48+3*9*4)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1096) vmlal.u32 $D4,$H1#hi,$R3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1097) vmlal.u32 $D1,$H3#hi,$S3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1098) vmlal.u32 $D2,$H4#hi,$S3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1099)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1100) vmlal.u32 $D3,$H4#hi,$S4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1101) vorn $MASK,$MASK,$MASK @ all-ones, can be redundant
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1102) vmlal.u32 $D0,$H1#hi,$S4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1103) vshr.u64 $MASK,$MASK,#38
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1104) vmlal.u32 $D4,$H0#hi,$R4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1105) vmlal.u32 $D1,$H2#hi,$S4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1106) vmlal.u32 $D2,$H3#hi,$S4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1107)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1108) beq .Lshort_tail
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1109)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1110) @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1111) @ (hash+inp[0:1])*r^4:r^3 and accumulate
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1112)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1113) vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1114) vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1115)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1116) vmlal.u32 $D2,$H2#lo,$R0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1117) vmlal.u32 $D0,$H0#lo,$R0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1118) vmlal.u32 $D3,$H3#lo,$R0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1119) vmlal.u32 $D1,$H1#lo,$R0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1120) vmlal.u32 $D4,$H4#lo,$R0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1121)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1122) vmlal.u32 $D0,$H4#lo,$S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1123) vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1124) vmlal.u32 $D3,$H2#lo,$R1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1125) vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1126) vmlal.u32 $D1,$H0#lo,$R1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1127) vmlal.u32 $D4,$H3#lo,$R1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1128) vmlal.u32 $D2,$H1#lo,$R1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1129)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1130) vmlal.u32 $D3,$H1#lo,$R2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1131) vld1.32 ${S4}[1],[$tbl1,:32]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1132) vmlal.u32 $D0,$H3#lo,$S2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1133) vld1.32 ${S4}[0],[$tbl0,:32]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1134) vmlal.u32 $D4,$H2#lo,$R2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1135) vmlal.u32 $D1,$H4#lo,$S2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1136) vmlal.u32 $D2,$H0#lo,$R2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1137)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1138) vmlal.u32 $D3,$H0#lo,$R3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1139) vmlal.u32 $D0,$H2#lo,$S3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1140) vmlal.u32 $D4,$H1#lo,$R3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1141) vmlal.u32 $D1,$H3#lo,$S3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1142) vmlal.u32 $D2,$H4#lo,$S3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1143)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1144) vmlal.u32 $D3,$H4#lo,$S4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1145) vorn $MASK,$MASK,$MASK @ all-ones
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1146) vmlal.u32 $D0,$H1#lo,$S4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1147) vshr.u64 $MASK,$MASK,#38
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1148) vmlal.u32 $D4,$H0#lo,$R4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1149) vmlal.u32 $D1,$H2#lo,$S4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1150) vmlal.u32 $D2,$H3#lo,$S4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1151)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1152) .Lshort_tail:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1153) @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1154) @ horizontal addition
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1155)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1156) vadd.i64 $D3#lo,$D3#lo,$D3#hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1157) vadd.i64 $D0#lo,$D0#lo,$D0#hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1158) vadd.i64 $D4#lo,$D4#lo,$D4#hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1159) vadd.i64 $D1#lo,$D1#lo,$D1#hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1160) vadd.i64 $D2#lo,$D2#lo,$D2#hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1161)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1162) @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1163) @ lazy reduction, but without narrowing
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1164)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1165) vshr.u64 $T0,$D3,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1166) vand.i64 $D3,$D3,$MASK
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1167) vshr.u64 $T1,$D0,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1168) vand.i64 $D0,$D0,$MASK
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1169) vadd.i64 $D4,$D4,$T0 @ h3 -> h4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1170) vadd.i64 $D1,$D1,$T1 @ h0 -> h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1171)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1172) vshr.u64 $T0,$D4,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1173) vand.i64 $D4,$D4,$MASK
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1174) vshr.u64 $T1,$D1,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1175) vand.i64 $D1,$D1,$MASK
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1176) vadd.i64 $D2,$D2,$T1 @ h1 -> h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1177)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1178) vadd.i64 $D0,$D0,$T0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1179) vshl.u64 $T0,$T0,#2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1180) vshr.u64 $T1,$D2,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1181) vand.i64 $D2,$D2,$MASK
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1182) vadd.i64 $D0,$D0,$T0 @ h4 -> h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1183) vadd.i64 $D3,$D3,$T1 @ h2 -> h3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1184)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1185) vshr.u64 $T0,$D0,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1186) vand.i64 $D0,$D0,$MASK
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1187) vshr.u64 $T1,$D3,#26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1188) vand.i64 $D3,$D3,$MASK
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1189) vadd.i64 $D1,$D1,$T0 @ h0 -> h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1190) vadd.i64 $D4,$D4,$T1 @ h3 -> h4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1191)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1192) cmp $len,#0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1193) bne .Leven
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1194)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1195) @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1196) @ store hash value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1197)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1198) vst4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1199) vst1.32 {$D4#lo[0]},[$ctx]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1200)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1201) vldmia sp!,{d8-d15} @ epilogue
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1202) ldmia sp!,{r4-r7}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1203) ret @ bx lr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1204) .size poly1305_blocks_neon,.-poly1305_blocks_neon
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1205)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1206) .align 5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1207) .Lzeros:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1208) .long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1209) #ifndef __KERNEL__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1210) .LOPENSSL_armcap:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1211) # ifdef _WIN32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1212) .word OPENSSL_armcap_P
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1213) # else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1214) .word OPENSSL_armcap_P-.Lpoly1305_init
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1215) # endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1216) .comm OPENSSL_armcap_P,4,4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1217) .hidden OPENSSL_armcap_P
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1218) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1219) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1220) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1221) } }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1222) $code.=<<___; # trailer: identification string followed by a final .align directive
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1223) .asciz "Poly1305 for ARMv4/NEON, CRYPTOGAMS by \@dot-asm"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1224) .align 2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1225) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1226) # Post-process the accumulated $code one line at a time and print each resulting line to STDOUT.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1227) foreach (split("\n",$code)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1228) s/\`([^\`]*)\`/eval $1/geo; # replace every backtick-quoted span with the value of that Perl expression
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1229) # The three substitutions below are chained with "or": the first one that matches ends the chain for this line.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1230) s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or # qN#lo -> d(2N), qN#hi -> d(2N+1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1231) s/\bret\b/bx lr/go or # spell "ret" as "bx lr"; such lines are not re-encoded by the next rule
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1232) s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1233)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1234) print $_,"\n";
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1235) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1236) close STDOUT or die "error closing STDOUT: $!"; # enforce flush; abort instead of silently leaving truncated output