# Orange Pi5 kernel
#
# Deprecated Linux kernel 5.10.110 for OrangePi 5/5B/5+ boards
#
# (repository banner: 3 commits, 0 branches, 0 tags)
#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
#
# Copyright (C) 2017-2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
# Copyright (C) 2017-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
# Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
#
# This code is taken from the OpenSSL project but the author, Andy Polyakov,
# has relicensed it under the licenses specified in the SPDX header above.
# The original headers, including the original license headers, are
# included below for completeness.
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements Poly1305 hash for x86_64.
#
# March 2015
#
# Initial release.
#
# December 2016
#
# Add AVX512F+VL+BW code path.
#
# November 2017
#
# Convert AVX512F+VL+BW code path to pure AVX512F, so that it can be
# executed even on Knights Landing. Trigger for modification was
# observation that AVX512 code paths can negatively affect overall
# Skylake-X system performance. Since we are likely to suppress
# AVX512F capability flag [at least on Skylake-X], conversion serves
# as kind of "investment protection". Note that next *lake processor,
# Cannonlake, has AVX512IFMA code path to execute...
#
# Numbers are cycles per processed byte with poly1305_blocks alone,
# measured with rdtsc at fixed clock frequency.
#
#		IALU/gcc-4.8(*)	AVX(**)		AVX2	AVX-512
# P4		4.46/+120%	-
# Core 2	2.41/+90%	-
# Westmere	1.88/+120%	-
# Sandy Bridge	1.39/+140%	1.10
# Haswell	1.14/+175%	1.11		0.65
# Skylake[-X]	1.13/+120%	0.96		0.51	[0.35]
# Silvermont	2.83/+95%	-
# Knights L	3.60/?		1.65		1.10	0.41(***)
# Goldmont	1.70/+180%	-
# VIA Nano	1.82/+150%	-
# Sledgehammer	1.38/+160%	-
# Bulldozer	2.30/+130%	0.97
# Ryzen		1.15/+200%	1.08		1.18
#
# (*)	improvement coefficients relative to clang are more modest and
#	are ~50% on most processors, in both cases we are comparing to
#	__int128 code;
# (**)	SSE2 implementation was attempted, but among non-AVX processors
#	it was faster than integer-only code only on older Intel P4 and
#	Core processors, 50-30%, less newer processor is, but slower on
#	contemporary ones, for example almost 2x slower on Atom, and as
#	former are naturally disappearing, SSE2 is deemed unnecessary;
# (***)	strangely enough performance seems to vary from core to core,
#	listed result is best case;

# Command-line handling: "$flavour $output" as for classic perlasm; a
# single dotted argument is just the output file; no arguments at all
# means we are driven by the kernel build system.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$kernel=0; $kernel=1 if (!$flavour && !$output);

if (!$kernel) {
	# Locate the perlasm translator next to this script or in the
	# usual OpenSSL tree location, and pipe our output through it.
	$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
	( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
	( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
	die "can't locate x86_64-xlate.pl";

	open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
	*STDOUT=*OUT;

	# Probe the assembler to decide how much vector code it can
	# digest: $avx = 0 none, 1 AVX, 2 AVX2, 3+ AVX-512.
	if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
	    =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
		$avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
	}

	if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
		$avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
		$avx += 1 if ($1==2.11 && $2>=8);
	}

	if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
		$avx = ($1>=10) + ($1>=11);
	}

	if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
		$avx = ($2>=3.0) + ($2>3.0);
	}
} else {
	$avx = 4; # The kernel uses ifdefs for this.
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  107) 
# Emit the opening of an assembler function named $name, aligned to
# $align bytes. In kernel mode this uses the SYM_FUNC_START() linkage
# macro plus a local .L$name label so intra-file relative references
# keep working; otherwise the classic .globl/.type/.align/label
# sequence is emitted ($nargs is recorded in the ELF .type directive).
# Note: the misleading empty () prototype is dropped — every call site
# passes ($name, $align, $nargs) and invokes via &, which bypassed it.
sub declare_function {
	my ($name, $align, $nargs) = @_;
	if($kernel) {
		$code .= ".align $align\n";
		$code .= "SYM_FUNC_START($name)\n";
		$code .= ".L$name:\n";
	} else {
		$code .= ".globl	$name\n";
		$code .= ".type	$name,\@function,$nargs\n";
		$code .= ".align	$align\n";
		$code .= "$name:\n";
	}
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  121) 
# Close a function opened by declare_function(): SYM_FUNC_END() in
# kernel mode, a .size directive (current address minus $name) for the
# standalone build. Prototype dropped for the same reason as in
# declare_function.
sub end_function {
	my ($name) = @_;
	if($kernel) {
		$code .= "SYM_FUNC_END($name)\n";
	} else {
		$code .= ".size   $name,.-$name\n";
	}
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  130) 
# Kernel build needs <linux/linkage.h> for SYM_FUNC_START/SYM_FUNC_END.
$code.=<<___ if $kernel;
#include <linux/linkage.h>
___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  134) 
if ($avx) {
# Vector constants used by the AVX/AVX2/AVX-512 paths: 2^26-1 and
# 2^24 masks, permutation indices, and the base 2^44 masks/shifts.
# In kernel mode they must live in a proper .rodata section.
$code.=<<___ if $kernel;
.section .rodata
___
$code.=<<___;
.align	64
.Lconst:
.Lmask24:
.long	0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
.L129:
.long	`1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
.Lmask26:
.long	0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
.Lpermd_avx2:
.long	2,2,2,3,2,0,2,1
.Lpermd_avx512:
.long	0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7

.L2_44_inp_permd:
.long	0,1,1,2,2,3,7,7
.L2_44_inp_shift:
.quad	0,12,24,64
.L2_44_mask:
.quad	0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
.L2_44_shift_rgt:
.quad	44,44,42,64
.L2_44_shift_lft:
.quad	8,8,10,64

.align	64
.Lx_mask44:
.quad	0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
.quad	0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
.Lx_mask42:
.quad	0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
.quad	0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
___
}
# Standalone (non-kernel) builds carry the CRYPTOGAMS banner string.
$code.=<<___ if (!$kernel);
.asciz	"Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	16
___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  177) 
# Register assignment for the integer code paths (SysV AMD64 arg regs).
my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
my ($mac,$nonce)=($inp,$len);	# *_emit arguments
# NOTE: $d3 aliases $ctx (%rdi); callers push/pop $ctx around the
# multiply so the pointer survives.
my ($d1,$d2,$d3, $r0,$r1,$s1)=("%r8","%r9","%rdi","%r11","%r12","%r13");
my ($h0,$h1,$h2)=("%r14","%rbx","%r10");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  182) 
# Append one Poly1305 iteration to $code: multiply the 130-bit
# accumulator ($h0,$h1,$h2) by the key ($r0,$r1) and reduce mod
# 2^130-5 (the final reduction folds the top bits back via *5/4
# arithmetic using the precomputed $s1 = r1 + r1>>2).
sub poly1305_iteration {
# input:	copy of $r1 in %rax, $h0-$h2, $r0-$r1
# output:	$h0-$h2 *= $r0-$r1
$code.=<<___;
	mulq	$h0			# h0*r1
	mov	%rax,$d2
	 mov	$r0,%rax
	mov	%rdx,$d3

	mulq	$h0			# h0*r0
	mov	%rax,$h0		# future $h0
	 mov	$r0,%rax
	mov	%rdx,$d1

	mulq	$h1			# h1*r0
	add	%rax,$d2
	 mov	$s1,%rax
	adc	%rdx,$d3

	mulq	$h1			# h1*s1
	 mov	$h2,$h1			# borrow $h1
	add	%rax,$h0
	adc	%rdx,$d1

	imulq	$s1,$h1			# h2*s1
	add	$h1,$d2
	 mov	$d1,$h1
	adc	\$0,$d3

	imulq	$r0,$h2			# h2*r0
	add	$d2,$h1
	mov	\$-4,%rax		# mask value
	adc	$h2,$d3

	and	$d3,%rax		# last reduction step
	mov	$d3,$h2
	shr	\$2,$d3
	and	\$3,$h2
	add	$d3,%rax
	add	%rax,$h0
	adc	\$0,$h1
	adc	\$0,$h2
___
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  227) 
########################################################################
# Layout of opaque area is following.
#
#	unsigned __int64 h[3];		# current hash value base 2^64
#	unsigned __int64 r[2];		# key value base 2^64

$code.=<<___;
.text
___
$code.=<<___ if (!$kernel);
.extern	OPENSSL_ia32cap_P

.globl	poly1305_init_x86_64
.hidden	poly1305_init_x86_64
.globl	poly1305_blocks_x86_64
.hidden	poly1305_blocks_x86_64
.globl	poly1305_emit_x86_64
.hidden	poly1305_emit_x86_64
___
# poly1305_init_x86_64(ctx, key, func[2]): zero the hash state, clamp
# and store r; outside the kernel also pick the fastest blocks/emit
# implementation per CPUID and publish the pointers through func[].
&declare_function("poly1305_init_x86_64", 32, 3);
$code.=<<___;
	xor	%eax,%eax
	mov	%rax,0($ctx)		# initialize hash value
	mov	%rax,8($ctx)
	mov	%rax,16($ctx)

	cmp	\$0,$inp
	je	.Lno_key
___
$code.=<<___ if (!$kernel);
	lea	poly1305_blocks_x86_64(%rip),%r10
	lea	poly1305_emit_x86_64(%rip),%r11
___
$code.=<<___	if (!$kernel && $avx);
	mov	OPENSSL_ia32cap_P+4(%rip),%r9
	lea	poly1305_blocks_avx(%rip),%rax
	lea	poly1305_emit_avx(%rip),%rcx
	bt	\$`60-32`,%r9		# AVX?
	cmovc	%rax,%r10
	cmovc	%rcx,%r11
___
$code.=<<___	if (!$kernel && $avx>1);
	lea	poly1305_blocks_avx2(%rip),%rax
	bt	\$`5+32`,%r9		# AVX2?
	cmovc	%rax,%r10
___
$code.=<<___	if (!$kernel && $avx>3);
	mov	\$`(1<<31|1<<21|1<<16)`,%rax
	shr	\$32,%r9
	and	%rax,%r9
	cmp	%rax,%r9
	je	.Linit_base2_44
___
$code.=<<___;
	mov	\$0x0ffffffc0fffffff,%rax
	mov	\$0x0ffffffc0ffffffc,%rcx
	and	0($inp),%rax
	and	8($inp),%rcx
	mov	%rax,24($ctx)
	mov	%rcx,32($ctx)
___
$code.=<<___	if (!$kernel && $flavour !~ /elf32/);
	mov	%r10,0(%rdx)
	mov	%r11,8(%rdx)
___
$code.=<<___	if (!$kernel && $flavour =~ /elf32/);
	mov	%r10d,0(%rdx)
	mov	%r11d,4(%rdx)
___
$code.=<<___;
	mov	\$1,%eax
.Lno_key:
	ret
___
&end_function("poly1305_init_x86_64");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  303) 
# poly1305_blocks_x86_64(ctx, inp, len, padbit): integer-unit block
# loop. Processes len in whole 16-byte blocks, adding each block (plus
# padbit as bit 128) into the accumulator and multiplying by r.
&declare_function("poly1305_blocks_x86_64", 32, 4);
$code.=<<___;
.cfi_startproc
.Lblocks:
	shr	\$4,$len
	jz	.Lno_data		# too short

	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	push	$ctx
.cfi_push	$ctx
.Lblocks_body:

	mov	$len,%r15		# reassign $len

	mov	24($ctx),$r0		# load r
	mov	32($ctx),$s1

	mov	0($ctx),$h0		# load hash value
	mov	8($ctx),$h1
	mov	16($ctx),$h2

	mov	$s1,$r1
	shr	\$2,$s1
	mov	$r1,%rax
	add	$r1,$s1			# s1 = r1 + (r1 >> 2)
	jmp	.Loop

.align	32
.Loop:
	add	0($inp),$h0		# accumulate input
	adc	8($inp),$h1
	lea	16($inp),$inp
	adc	$padbit,$h2
___

	&poly1305_iteration();

$code.=<<___;
	mov	$r1,%rax
	dec	%r15			# len-=16
	jnz	.Loop

	mov	0(%rsp),$ctx
.cfi_restore	$ctx

	mov	$h0,0($ctx)		# store hash value
	mov	$h1,8($ctx)
	mov	$h2,16($ctx)

	mov	8(%rsp),%r15
.cfi_restore	%r15
	mov	16(%rsp),%r14
.cfi_restore	%r14
	mov	24(%rsp),%r13
.cfi_restore	%r13
	mov	32(%rsp),%r12
.cfi_restore	%r12
	mov	40(%rsp),%rbx
.cfi_restore	%rbx
	lea	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lno_data:
.Lblocks_epilogue:
	ret
.cfi_endproc
___
&end_function("poly1305_blocks_x86_64");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  380) 
# poly1305_emit_x86_64(ctx, mac, nonce): final reduction mod 2^130-5
# (branchlessly selecting h or h+5-2^130 via cmovnz), then add the
# 128-bit nonce and store the 16-byte tag.
&declare_function("poly1305_emit_x86_64", 32, 3);
$code.=<<___;
.Lemit:
	mov	0($ctx),%r8	# load hash value
	mov	8($ctx),%r9
	mov	16($ctx),%r10

	mov	%r8,%rax
	add	\$5,%r8		# compare to modulus
	mov	%r9,%rcx
	adc	\$0,%r9
	adc	\$0,%r10
	shr	\$2,%r10	# did 130-bit value overflow?
	cmovnz	%r8,%rax
	cmovnz	%r9,%rcx

	add	0($nonce),%rax	# accumulate nonce
	adc	8($nonce),%rcx
	mov	%rax,0($mac)	# write result
	mov	%rcx,8($mac)

	ret
___
&end_function("poly1305_emit_x86_64");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  405) if ($avx) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  406) 
########################################################################
# Layout of opaque area is following.
#
#	unsigned __int32 h[5];		# current hash value base 2^26
#	unsigned __int32 is_base2_26;
#	unsigned __int64 r[2];		# key value base 2^64
#	unsigned __int64 pad;
#	struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9];
#
# where r^n are base 2^26 digits of degrees of multiplier key. There are
# 5 digits, but last four are interleaved with multiples of 5, totalling
# in 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4.

# All 16 SSE registers: hash limbs H0-H4, input limbs T0-T4,
# accumulators D0-D4 and the 2^26-1 mask.
my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) =
    map("%xmm$_",(0..15));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  422) 
# __poly1305_block: internal helper wrapping one poly1305_iteration;
# saves/restores $ctx (%rdi) around the multiply since $d3 aliases it.
$code.=<<___;
.type	__poly1305_block,\@abi-omnipotent
.align	32
__poly1305_block:
	push $ctx
___
	&poly1305_iteration();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  430) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  431) 	pop $ctx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  432) 	ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  433) .size	__poly1305_block,.-__poly1305_block
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  434) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  435) .type	__poly1305_init_avx,\@abi-omnipotent
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  436) .align	32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  437) __poly1305_init_avx:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  438) 	push %rbp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  439) 	mov %rsp,%rbp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  440) 	mov	$r0,$h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  441) 	mov	$r1,$h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  442) 	xor	$h2,$h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  443) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  444) 	lea	48+64($ctx),$ctx	# size optimization
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  445) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  446) 	mov	$r1,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  447) 	call	__poly1305_block	# r^2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  448) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  449) 	mov	\$0x3ffffff,%eax	# save interleaved r^2 and r base 2^26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  450) 	mov	\$0x3ffffff,%edx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  451) 	mov	$h0,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  452) 	and	$h0#d,%eax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  453) 	mov	$r0,$d2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  454) 	and	$r0#d,%edx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  455) 	mov	%eax,`16*0+0-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  456) 	shr	\$26,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  457) 	mov	%edx,`16*0+4-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  458) 	shr	\$26,$d2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  459) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  460) 	mov	\$0x3ffffff,%eax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  461) 	mov	\$0x3ffffff,%edx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  462) 	and	$d1#d,%eax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  463) 	and	$d2#d,%edx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  464) 	mov	%eax,`16*1+0-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  465) 	lea	(%rax,%rax,4),%eax	# *5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  466) 	mov	%edx,`16*1+4-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  467) 	lea	(%rdx,%rdx,4),%edx	# *5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  468) 	mov	%eax,`16*2+0-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  469) 	shr	\$26,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  470) 	mov	%edx,`16*2+4-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  471) 	shr	\$26,$d2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  472) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  473) 	mov	$h1,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  474) 	mov	$r1,%rdx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  475) 	shl	\$12,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  476) 	shl	\$12,%rdx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  477) 	or	$d1,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  478) 	or	$d2,%rdx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  479) 	and	\$0x3ffffff,%eax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  480) 	and	\$0x3ffffff,%edx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  481) 	mov	%eax,`16*3+0-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  482) 	lea	(%rax,%rax,4),%eax	# *5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  483) 	mov	%edx,`16*3+4-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  484) 	lea	(%rdx,%rdx,4),%edx	# *5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  485) 	mov	%eax,`16*4+0-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  486) 	mov	$h1,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  487) 	mov	%edx,`16*4+4-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  488) 	mov	$r1,$d2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  489) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  490) 	mov	\$0x3ffffff,%eax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  491) 	mov	\$0x3ffffff,%edx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  492) 	shr	\$14,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  493) 	shr	\$14,$d2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  494) 	and	$d1#d,%eax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  495) 	and	$d2#d,%edx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  496) 	mov	%eax,`16*5+0-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  497) 	lea	(%rax,%rax,4),%eax	# *5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  498) 	mov	%edx,`16*5+4-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  499) 	lea	(%rdx,%rdx,4),%edx	# *5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  500) 	mov	%eax,`16*6+0-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  501) 	shr	\$26,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  502) 	mov	%edx,`16*6+4-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  503) 	shr	\$26,$d2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  504) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  505) 	mov	$h2,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  506) 	shl	\$24,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  507) 	or	%rax,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  508) 	mov	$d1#d,`16*7+0-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  509) 	lea	($d1,$d1,4),$d1		# *5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  510) 	mov	$d2#d,`16*7+4-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  511) 	lea	($d2,$d2,4),$d2		# *5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  512) 	mov	$d1#d,`16*8+0-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  513) 	mov	$d2#d,`16*8+4-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  514) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  515) 	mov	$r1,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  516) 	call	__poly1305_block	# r^3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  517) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  518) 	mov	\$0x3ffffff,%eax	# save r^3 base 2^26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  519) 	mov	$h0,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  520) 	and	$h0#d,%eax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  521) 	shr	\$26,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  522) 	mov	%eax,`16*0+12-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  523) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  524) 	mov	\$0x3ffffff,%edx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  525) 	and	$d1#d,%edx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  526) 	mov	%edx,`16*1+12-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  527) 	lea	(%rdx,%rdx,4),%edx	# *5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  528) 	shr	\$26,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  529) 	mov	%edx,`16*2+12-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  530) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  531) 	mov	$h1,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  532) 	shl	\$12,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  533) 	or	$d1,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  534) 	and	\$0x3ffffff,%eax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  535) 	mov	%eax,`16*3+12-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  536) 	lea	(%rax,%rax,4),%eax	# *5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  537) 	mov	$h1,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  538) 	mov	%eax,`16*4+12-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  539) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  540) 	mov	\$0x3ffffff,%edx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  541) 	shr	\$14,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  542) 	and	$d1#d,%edx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  543) 	mov	%edx,`16*5+12-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  544) 	lea	(%rdx,%rdx,4),%edx	# *5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  545) 	shr	\$26,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  546) 	mov	%edx,`16*6+12-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  547) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  548) 	mov	$h2,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  549) 	shl	\$24,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  550) 	or	%rax,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  551) 	mov	$d1#d,`16*7+12-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  552) 	lea	($d1,$d1,4),$d1		# *5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  553) 	mov	$d1#d,`16*8+12-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  554) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  555) 	mov	$r1,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  556) 	call	__poly1305_block	# r^4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  557) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  558) 	mov	\$0x3ffffff,%eax	# save r^4 base 2^26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  559) 	mov	$h0,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  560) 	and	$h0#d,%eax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  561) 	shr	\$26,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  562) 	mov	%eax,`16*0+8-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  563) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  564) 	mov	\$0x3ffffff,%edx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  565) 	and	$d1#d,%edx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  566) 	mov	%edx,`16*1+8-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  567) 	lea	(%rdx,%rdx,4),%edx	# *5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  568) 	shr	\$26,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  569) 	mov	%edx,`16*2+8-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  570) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  571) 	mov	$h1,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  572) 	shl	\$12,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  573) 	or	$d1,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  574) 	and	\$0x3ffffff,%eax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  575) 	mov	%eax,`16*3+8-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  576) 	lea	(%rax,%rax,4),%eax	# *5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  577) 	mov	$h1,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  578) 	mov	%eax,`16*4+8-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  579) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  580) 	mov	\$0x3ffffff,%edx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  581) 	shr	\$14,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  582) 	and	$d1#d,%edx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  583) 	mov	%edx,`16*5+8-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  584) 	lea	(%rdx,%rdx,4),%edx	# *5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  585) 	shr	\$26,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  586) 	mov	%edx,`16*6+8-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  587) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  588) 	mov	$h2,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  589) 	shl	\$24,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  590) 	or	%rax,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  591) 	mov	$d1#d,`16*7+8-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  592) 	lea	($d1,$d1,4),$d1		# *5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  593) 	mov	$d1#d,`16*8+8-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  594) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  595) 	lea	-48-64($ctx),$ctx	# size [de-]optimization
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  596) 	pop %rbp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  597) 	ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  598) .size	__poly1305_init_avx,.-__poly1305_init_avx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  599) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  600) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  601) &declare_function("poly1305_blocks_avx", 32, 4);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  602) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  603) .cfi_startproc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  604) 	mov	20($ctx),%r8d		# is_base2_26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  605) 	cmp	\$128,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  606) 	jae	.Lblocks_avx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  607) 	test	%r8d,%r8d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  608) 	jz	.Lblocks
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  609) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  610) .Lblocks_avx:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  611) 	and	\$-16,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  612) 	jz	.Lno_data_avx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  613) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  614) 	vzeroupper
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  615) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  616) 	test	%r8d,%r8d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  617) 	jz	.Lbase2_64_avx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  618) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  619) 	test	\$31,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  620) 	jz	.Leven_avx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  621) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  622) 	push	%rbp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  623) .cfi_push	%rbp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  624) 	mov 	%rsp,%rbp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  625) 	push	%rbx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  626) .cfi_push	%rbx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  627) 	push	%r12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  628) .cfi_push	%r12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  629) 	push	%r13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  630) .cfi_push	%r13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  631) 	push	%r14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  632) .cfi_push	%r14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  633) 	push	%r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  634) .cfi_push	%r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  635) .Lblocks_avx_body:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  636) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  637) 	mov	$len,%r15		# reassign $len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  638) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  639) 	mov	0($ctx),$d1		# load hash value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  640) 	mov	8($ctx),$d2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  641) 	mov	16($ctx),$h2#d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  642) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  643) 	mov	24($ctx),$r0		# load r
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  644) 	mov	32($ctx),$s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  645) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  646) 	################################# base 2^26 -> base 2^64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  647) 	mov	$d1#d,$h0#d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  648) 	and	\$`-1*(1<<31)`,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  649) 	mov	$d2,$r1			# borrow $r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  650) 	mov	$d2#d,$h1#d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  651) 	and	\$`-1*(1<<31)`,$d2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  652) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  653) 	shr	\$6,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  654) 	shl	\$52,$r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  655) 	add	$d1,$h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  656) 	shr	\$12,$h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  657) 	shr	\$18,$d2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  658) 	add	$r1,$h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  659) 	adc	$d2,$h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  660) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  661) 	mov	$h2,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  662) 	shl	\$40,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  663) 	shr	\$24,$h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  664) 	add	$d1,$h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  665) 	adc	\$0,$h2			# can be partially reduced...
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  666) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  667) 	mov	\$-4,$d2		# ... so reduce
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  668) 	mov	$h2,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  669) 	and	$h2,$d2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  670) 	shr	\$2,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  671) 	and	\$3,$h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  672) 	add	$d2,$d1			# =*5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  673) 	add	$d1,$h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  674) 	adc	\$0,$h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  675) 	adc	\$0,$h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  676) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  677) 	mov	$s1,$r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  678) 	mov	$s1,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  679) 	shr	\$2,$s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  680) 	add	$r1,$s1			# s1 = r1 + (r1 >> 2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  681) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  682) 	add	0($inp),$h0		# accumulate input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  683) 	adc	8($inp),$h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  684) 	lea	16($inp),$inp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  685) 	adc	$padbit,$h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  686) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  687) 	call	__poly1305_block
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  688) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  689) 	test	$padbit,$padbit		# if $padbit is zero,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  690) 	jz	.Lstore_base2_64_avx	# store hash in base 2^64 format
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  691) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  692) 	################################# base 2^64 -> base 2^26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  693) 	mov	$h0,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  694) 	mov	$h0,%rdx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  695) 	shr	\$52,$h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  696) 	mov	$h1,$r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  697) 	mov	$h1,$r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  698) 	shr	\$26,%rdx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  699) 	and	\$0x3ffffff,%rax	# h[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  700) 	shl	\$12,$r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  701) 	and	\$0x3ffffff,%rdx	# h[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  702) 	shr	\$14,$h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  703) 	or	$r0,$h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  704) 	shl	\$24,$h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  705) 	and	\$0x3ffffff,$h0		# h[2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  706) 	shr	\$40,$r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  707) 	and	\$0x3ffffff,$h1		# h[3]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  708) 	or	$r1,$h2			# h[4]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  709) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  710) 	sub	\$16,%r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  711) 	jz	.Lstore_base2_26_avx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  712) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  713) 	vmovd	%rax#d,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  714) 	vmovd	%rdx#d,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  715) 	vmovd	$h0#d,$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  716) 	vmovd	$h1#d,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  717) 	vmovd	$h2#d,$H4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  718) 	jmp	.Lproceed_avx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  719) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  720) .align	32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  721) .Lstore_base2_64_avx:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  722) 	mov	$h0,0($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  723) 	mov	$h1,8($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  724) 	mov	$h2,16($ctx)		# note that is_base2_26 is zeroed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  725) 	jmp	.Ldone_avx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  726) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  727) .align	16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  728) .Lstore_base2_26_avx:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  729) 	mov	%rax#d,0($ctx)		# store hash value base 2^26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  730) 	mov	%rdx#d,4($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  731) 	mov	$h0#d,8($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  732) 	mov	$h1#d,12($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  733) 	mov	$h2#d,16($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  734) .align	16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  735) .Ldone_avx:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  736) 	pop 		%r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  737) .cfi_restore	%r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  738) 	pop 		%r14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  739) .cfi_restore	%r14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  740) 	pop 		%r13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  741) .cfi_restore	%r13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  742) 	pop 		%r12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  743) .cfi_restore	%r12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  744) 	pop 		%rbx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  745) .cfi_restore	%rbx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  746) 	pop 		%rbp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  747) .cfi_restore	%rbp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  748) .Lno_data_avx:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  749) .Lblocks_avx_epilogue:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  750) 	ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  751) .cfi_endproc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  752) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  753) .align	32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  754) .Lbase2_64_avx:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  755) .cfi_startproc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  756) 	push	%rbp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  757) .cfi_push	%rbp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  758) 	mov 	%rsp,%rbp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  759) 	push	%rbx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  760) .cfi_push	%rbx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  761) 	push	%r12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  762) .cfi_push	%r12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  763) 	push	%r13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  764) .cfi_push	%r13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  765) 	push	%r14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  766) .cfi_push	%r14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  767) 	push	%r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  768) .cfi_push	%r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  769) .Lbase2_64_avx_body:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  770) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  771) 	mov	$len,%r15		# reassign $len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  772) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  773) 	mov	24($ctx),$r0		# load r
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  774) 	mov	32($ctx),$s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  775) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  776) 	mov	0($ctx),$h0		# load hash value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  777) 	mov	8($ctx),$h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  778) 	mov	16($ctx),$h2#d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  779) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  780) 	mov	$s1,$r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  781) 	mov	$s1,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  782) 	shr	\$2,$s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  783) 	add	$r1,$s1			# s1 = r1 + (r1 >> 2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  784) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  785) 	test	\$31,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  786) 	jz	.Linit_avx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  787) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  788) 	add	0($inp),$h0		# accumulate input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  789) 	adc	8($inp),$h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  790) 	lea	16($inp),$inp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  791) 	adc	$padbit,$h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  792) 	sub	\$16,%r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  793) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  794) 	call	__poly1305_block
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  795) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  796) .Linit_avx:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  797) 	################################# base 2^64 -> base 2^26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  798) 	mov	$h0,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  799) 	mov	$h0,%rdx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  800) 	shr	\$52,$h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  801) 	mov	$h1,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  802) 	mov	$h1,$d2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  803) 	shr	\$26,%rdx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  804) 	and	\$0x3ffffff,%rax	# h[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  805) 	shl	\$12,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  806) 	and	\$0x3ffffff,%rdx	# h[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  807) 	shr	\$14,$h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  808) 	or	$d1,$h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  809) 	shl	\$24,$h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  810) 	and	\$0x3ffffff,$h0		# h[2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  811) 	shr	\$40,$d2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  812) 	and	\$0x3ffffff,$h1		# h[3]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  813) 	or	$d2,$h2			# h[4]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  814) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  815) 	vmovd	%rax#d,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  816) 	vmovd	%rdx#d,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  817) 	vmovd	$h0#d,$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  818) 	vmovd	$h1#d,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  819) 	vmovd	$h2#d,$H4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  820) 	movl	\$1,20($ctx)		# set is_base2_26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  821) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  822) 	call	__poly1305_init_avx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  823) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  824) .Lproceed_avx:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  825) 	mov	%r15,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  826) 	pop 		%r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  827) .cfi_restore	%r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  828) 	pop 		%r14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  829) .cfi_restore	%r14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  830) 	pop 		%r13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  831) .cfi_restore	%r13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  832) 	pop 		%r12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  833) .cfi_restore	%r12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  834) 	pop 		%rbx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  835) .cfi_restore	%rbx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  836) 	pop 		%rbp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  837) .cfi_restore	%rbp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  838) .Lbase2_64_avx_epilogue:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  839) 	jmp	.Ldo_avx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  840) .cfi_endproc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  841) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  842) .align	32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  843) .Leven_avx:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  844) .cfi_startproc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  845) 	vmovd		4*0($ctx),$H0		# load hash value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  846) 	vmovd		4*1($ctx),$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  847) 	vmovd		4*2($ctx),$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  848) 	vmovd		4*3($ctx),$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  849) 	vmovd		4*4($ctx),$H4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  850) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  851) .Ldo_avx:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  852) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  853) $code.=<<___	if (!$win64);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  854) 	lea		8(%rsp),%r10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  855) .cfi_def_cfa_register	%r10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  856) 	and		\$-32,%rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  857) 	sub		\$-8,%rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  858) 	lea		-0x58(%rsp),%r11
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  859) 	sub		\$0x178,%rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  860) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  861) $code.=<<___	if ($win64);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  862) 	lea		-0xf8(%rsp),%r11
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  863) 	sub		\$0x218,%rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  864) 	vmovdqa		%xmm6,0x50(%r11)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  865) 	vmovdqa		%xmm7,0x60(%r11)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  866) 	vmovdqa		%xmm8,0x70(%r11)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  867) 	vmovdqa		%xmm9,0x80(%r11)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  868) 	vmovdqa		%xmm10,0x90(%r11)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  869) 	vmovdqa		%xmm11,0xa0(%r11)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  870) 	vmovdqa		%xmm12,0xb0(%r11)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  871) 	vmovdqa		%xmm13,0xc0(%r11)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  872) 	vmovdqa		%xmm14,0xd0(%r11)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  873) 	vmovdqa		%xmm15,0xe0(%r11)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  874) .Ldo_avx_body:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  875) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  876) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  877) 	sub		\$64,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  878) 	lea		-32($inp),%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  879) 	cmovc		%rax,$inp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  880) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  881) 	vmovdqu		`16*3`($ctx),$D4	# preload r0^2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  882) 	lea		`16*3+64`($ctx),$ctx	# size optimization
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  883) 	lea		.Lconst(%rip),%rcx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  884) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  885) 	################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  886) 	# load input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  887) 	vmovdqu		16*2($inp),$T0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  888) 	vmovdqu		16*3($inp),$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  889) 	vmovdqa		64(%rcx),$MASK		# .Lmask26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  890) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  891) 	vpsrldq		\$6,$T0,$T2		# splat input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  892) 	vpsrldq		\$6,$T1,$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  893) 	vpunpckhqdq	$T1,$T0,$T4		# 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  894) 	vpunpcklqdq	$T1,$T0,$T0		# 0:1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  895) 	vpunpcklqdq	$T3,$T2,$T3		# 2:3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  896) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  897) 	vpsrlq		\$40,$T4,$T4		# 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  898) 	vpsrlq		\$26,$T0,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  899) 	vpand		$MASK,$T0,$T0		# 0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  900) 	vpsrlq		\$4,$T3,$T2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  901) 	vpand		$MASK,$T1,$T1		# 1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  902) 	vpsrlq		\$30,$T3,$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  903) 	vpand		$MASK,$T2,$T2		# 2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  904) 	vpand		$MASK,$T3,$T3		# 3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  905) 	vpor		32(%rcx),$T4,$T4	# padbit, yes, always
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  906) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  907) 	jbe		.Lskip_loop_avx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  908) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  909) 	# expand and copy pre-calculated table to stack
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  910) 	vmovdqu		`16*1-64`($ctx),$D1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  911) 	vmovdqu		`16*2-64`($ctx),$D2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  912) 	vpshufd		\$0xEE,$D4,$D3		# 34xx -> 3434
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  913) 	vpshufd		\$0x44,$D4,$D0		# xx12 -> 1212
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  914) 	vmovdqa		$D3,-0x90(%r11)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  915) 	vmovdqa		$D0,0x00(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  916) 	vpshufd		\$0xEE,$D1,$D4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  917) 	vmovdqu		`16*3-64`($ctx),$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  918) 	vpshufd		\$0x44,$D1,$D1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  919) 	vmovdqa		$D4,-0x80(%r11)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  920) 	vmovdqa		$D1,0x10(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  921) 	vpshufd		\$0xEE,$D2,$D3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  922) 	vmovdqu		`16*4-64`($ctx),$D1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  923) 	vpshufd		\$0x44,$D2,$D2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  924) 	vmovdqa		$D3,-0x70(%r11)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  925) 	vmovdqa		$D2,0x20(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  926) 	vpshufd		\$0xEE,$D0,$D4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  927) 	vmovdqu		`16*5-64`($ctx),$D2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  928) 	vpshufd		\$0x44,$D0,$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  929) 	vmovdqa		$D4,-0x60(%r11)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  930) 	vmovdqa		$D0,0x30(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  931) 	vpshufd		\$0xEE,$D1,$D3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  932) 	vmovdqu		`16*6-64`($ctx),$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  933) 	vpshufd		\$0x44,$D1,$D1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  934) 	vmovdqa		$D3,-0x50(%r11)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  935) 	vmovdqa		$D1,0x40(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  936) 	vpshufd		\$0xEE,$D2,$D4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  937) 	vmovdqu		`16*7-64`($ctx),$D1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  938) 	vpshufd		\$0x44,$D2,$D2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  939) 	vmovdqa		$D4,-0x40(%r11)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  940) 	vmovdqa		$D2,0x50(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  941) 	vpshufd		\$0xEE,$D0,$D3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  942) 	vmovdqu		`16*8-64`($ctx),$D2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  943) 	vpshufd		\$0x44,$D0,$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  944) 	vmovdqa		$D3,-0x30(%r11)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  945) 	vmovdqa		$D0,0x60(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  946) 	vpshufd		\$0xEE,$D1,$D4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  947) 	vpshufd		\$0x44,$D1,$D1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  948) 	vmovdqa		$D4,-0x20(%r11)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  949) 	vmovdqa		$D1,0x70(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  950) 	vpshufd		\$0xEE,$D2,$D3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  951) 	 vmovdqa	0x00(%rsp),$D4		# preload r0^2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  952) 	vpshufd		\$0x44,$D2,$D2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  953) 	vmovdqa		$D3,-0x10(%r11)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  954) 	vmovdqa		$D2,0x80(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  955) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  956) 	jmp		.Loop_avx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  957) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  958) .align	32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  959) .Loop_avx:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  960) 	################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  961) 	# ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  962) 	# ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  963) 	#   \___________________/
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  964) 	# ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  965) 	# ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  966) 	#   \___________________/ \____________________/
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  967) 	#
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  968) 	# Note that we start with inp[2:3]*r^2. This is because it
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  969) 	# doesn't depend on reduction in previous iteration.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  970) 	################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  971) 	# d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  972) 	# d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  973) 	# d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  974) 	# d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  975) 	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  976) 	#
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  977) 	# though note that $Tx and $Hx are "reversed" in this section,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  978) 	# and $D4 is preloaded with r0^2...
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  979) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  980) 	vpmuludq	$T0,$D4,$D0		# d0 = h0*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  981) 	vpmuludq	$T1,$D4,$D1		# d1 = h1*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  982) 	  vmovdqa	$H2,0x20(%r11)				# offload hash
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  983) 	vpmuludq	$T2,$D4,$D2		# d3 = h2*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  984) 	 vmovdqa	0x10(%rsp),$H2		# r1^2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  985) 	vpmuludq	$T3,$D4,$D3		# d3 = h3*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  986) 	vpmuludq	$T4,$D4,$D4		# d4 = h4*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  987) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  988) 	  vmovdqa	$H0,0x00(%r11)				#
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  989) 	vpmuludq	0x20(%rsp),$T4,$H0	# h4*s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  990) 	  vmovdqa	$H1,0x10(%r11)				#
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  991) 	vpmuludq	$T3,$H2,$H1		# h3*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  992) 	vpaddq		$H0,$D0,$D0		# d0 += h4*s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  993) 	vpaddq		$H1,$D4,$D4		# d4 += h3*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  994) 	  vmovdqa	$H3,0x30(%r11)				#
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  995) 	vpmuludq	$T2,$H2,$H0		# h2*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  996) 	vpmuludq	$T1,$H2,$H1		# h1*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  997) 	vpaddq		$H0,$D3,$D3		# d3 += h2*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  998) 	 vmovdqa	0x30(%rsp),$H3		# r2^2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  999) 	vpaddq		$H1,$D2,$D2		# d2 += h1*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1000) 	  vmovdqa	$H4,0x40(%r11)				#
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1001) 	vpmuludq	$T0,$H2,$H2		# h0*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1002) 	 vpmuludq	$T2,$H3,$H0		# h2*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1003) 	vpaddq		$H2,$D1,$D1		# d1 += h0*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1004) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1005) 	 vmovdqa	0x40(%rsp),$H4		# s2^2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1006) 	vpaddq		$H0,$D4,$D4		# d4 += h2*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1007) 	vpmuludq	$T1,$H3,$H1		# h1*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1008) 	vpmuludq	$T0,$H3,$H3		# h0*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1009) 	vpaddq		$H1,$D3,$D3		# d3 += h1*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1010) 	 vmovdqa	0x50(%rsp),$H2		# r3^2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1011) 	vpaddq		$H3,$D2,$D2		# d2 += h0*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1012) 	vpmuludq	$T4,$H4,$H0		# h4*s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1013) 	vpmuludq	$T3,$H4,$H4		# h3*s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1014) 	vpaddq		$H0,$D1,$D1		# d1 += h4*s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1015) 	 vmovdqa	0x60(%rsp),$H3		# s3^2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1016) 	vpaddq		$H4,$D0,$D0		# d0 += h3*s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1017) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1018) 	 vmovdqa	0x80(%rsp),$H4		# s4^2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1019) 	vpmuludq	$T1,$H2,$H1		# h1*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1020) 	vpmuludq	$T0,$H2,$H2		# h0*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1021) 	vpaddq		$H1,$D4,$D4		# d4 += h1*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1022) 	vpaddq		$H2,$D3,$D3		# d3 += h0*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1023) 	vpmuludq	$T4,$H3,$H0		# h4*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1024) 	vpmuludq	$T3,$H3,$H1		# h3*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1025) 	vpaddq		$H0,$D2,$D2		# d2 += h4*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1026) 	 vmovdqu	16*0($inp),$H0				# load input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1027) 	vpaddq		$H1,$D1,$D1		# d1 += h3*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1028) 	vpmuludq	$T2,$H3,$H3		# h2*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1029) 	 vpmuludq	$T2,$H4,$T2		# h2*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1030) 	vpaddq		$H3,$D0,$D0		# d0 += h2*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1031) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1032) 	 vmovdqu	16*1($inp),$H1				#
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1033) 	vpaddq		$T2,$D1,$D1		# d1 += h2*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1034) 	vpmuludq	$T3,$H4,$T3		# h3*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1035) 	vpmuludq	$T4,$H4,$T4		# h4*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1036) 	 vpsrldq	\$6,$H0,$H2				# splat input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1037) 	vpaddq		$T3,$D2,$D2		# d2 += h3*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1038) 	vpaddq		$T4,$D3,$D3		# d3 += h4*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1039) 	 vpsrldq	\$6,$H1,$H3				#
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1040) 	vpmuludq	0x70(%rsp),$T0,$T4	# h0*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1041) 	vpmuludq	$T1,$H4,$T0		# h1*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1042) 	 vpunpckhqdq	$H1,$H0,$H4		# 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1043) 	vpaddq		$T4,$D4,$D4		# d4 += h0*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1044) 	 vmovdqa	-0x90(%r11),$T4		# r0^4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1045) 	vpaddq		$T0,$D0,$D0		# d0 += h1*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1046) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1047) 	vpunpcklqdq	$H1,$H0,$H0		# 0:1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1048) 	vpunpcklqdq	$H3,$H2,$H3		# 2:3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1049) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1050) 	#vpsrlq		\$40,$H4,$H4		# 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1051) 	vpsrldq		\$`40/8`,$H4,$H4	# 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1052) 	vpsrlq		\$26,$H0,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1053) 	vpand		$MASK,$H0,$H0		# 0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1054) 	vpsrlq		\$4,$H3,$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1055) 	vpand		$MASK,$H1,$H1		# 1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1056) 	vpand		0(%rcx),$H4,$H4		# .Lmask24
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1057) 	vpsrlq		\$30,$H3,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1058) 	vpand		$MASK,$H2,$H2		# 2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1059) 	vpand		$MASK,$H3,$H3		# 3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1060) 	vpor		32(%rcx),$H4,$H4	# padbit, yes, always
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1061) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1062) 	vpaddq		0x00(%r11),$H0,$H0	# add hash value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1063) 	vpaddq		0x10(%r11),$H1,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1064) 	vpaddq		0x20(%r11),$H2,$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1065) 	vpaddq		0x30(%r11),$H3,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1066) 	vpaddq		0x40(%r11),$H4,$H4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1067) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1068) 	lea		16*2($inp),%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1069) 	lea		16*4($inp),$inp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1070) 	sub		\$64,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1071) 	cmovc		%rax,$inp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1072) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1073) 	################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1074) 	# Now we accumulate (inp[0:1]+hash)*r^4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1075) 	################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1076) 	# d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1077) 	# d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1078) 	# d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1079) 	# d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1080) 	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1081) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1082) 	vpmuludq	$H0,$T4,$T0		# h0*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1083) 	vpmuludq	$H1,$T4,$T1		# h1*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1084) 	vpaddq		$T0,$D0,$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1085) 	vpaddq		$T1,$D1,$D1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1086) 	 vmovdqa	-0x80(%r11),$T2		# r1^4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1087) 	vpmuludq	$H2,$T4,$T0		# h2*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1088) 	vpmuludq	$H3,$T4,$T1		# h3*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1089) 	vpaddq		$T0,$D2,$D2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1090) 	vpaddq		$T1,$D3,$D3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1091) 	vpmuludq	$H4,$T4,$T4		# h4*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1092) 	 vpmuludq	-0x70(%r11),$H4,$T0	# h4*s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1093) 	vpaddq		$T4,$D4,$D4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1094) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1095) 	vpaddq		$T0,$D0,$D0		# d0 += h4*s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1096) 	vpmuludq	$H2,$T2,$T1		# h2*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1097) 	vpmuludq	$H3,$T2,$T0		# h3*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1098) 	vpaddq		$T1,$D3,$D3		# d3 += h2*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1099) 	 vmovdqa	-0x60(%r11),$T3		# r2^4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1100) 	vpaddq		$T0,$D4,$D4		# d4 += h3*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1101) 	vpmuludq	$H1,$T2,$T1		# h1*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1102) 	vpmuludq	$H0,$T2,$T2		# h0*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1103) 	vpaddq		$T1,$D2,$D2		# d2 += h1*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1104) 	vpaddq		$T2,$D1,$D1		# d1 += h0*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1105) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1106) 	 vmovdqa	-0x50(%r11),$T4		# s2^4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1107) 	vpmuludq	$H2,$T3,$T0		# h2*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1108) 	vpmuludq	$H1,$T3,$T1		# h1*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1109) 	vpaddq		$T0,$D4,$D4		# d4 += h2*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1110) 	vpaddq		$T1,$D3,$D3		# d3 += h1*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1111) 	 vmovdqa	-0x40(%r11),$T2		# r3^4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1112) 	vpmuludq	$H0,$T3,$T3		# h0*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1113) 	vpmuludq	$H4,$T4,$T0		# h4*s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1114) 	vpaddq		$T3,$D2,$D2		# d2 += h0*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1115) 	vpaddq		$T0,$D1,$D1		# d1 += h4*s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1116) 	 vmovdqa	-0x30(%r11),$T3		# s3^4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1117) 	vpmuludq	$H3,$T4,$T4		# h3*s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1118) 	 vpmuludq	$H1,$T2,$T1		# h1*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1119) 	vpaddq		$T4,$D0,$D0		# d0 += h3*s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1120) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1121) 	 vmovdqa	-0x10(%r11),$T4		# s4^4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1122) 	vpaddq		$T1,$D4,$D4		# d4 += h1*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1123) 	vpmuludq	$H0,$T2,$T2		# h0*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1124) 	vpmuludq	$H4,$T3,$T0		# h4*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1125) 	vpaddq		$T2,$D3,$D3		# d3 += h0*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1126) 	vpaddq		$T0,$D2,$D2		# d2 += h4*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1127) 	 vmovdqu	16*2($inp),$T0				# load input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1128) 	vpmuludq	$H3,$T3,$T2		# h3*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1129) 	vpmuludq	$H2,$T3,$T3		# h2*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1130) 	vpaddq		$T2,$D1,$D1		# d1 += h3*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1131) 	 vmovdqu	16*3($inp),$T1				#
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1132) 	vpaddq		$T3,$D0,$D0		# d0 += h2*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1133) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1134) 	vpmuludq	$H2,$T4,$H2		# h2*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1135) 	vpmuludq	$H3,$T4,$H3		# h3*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1136) 	 vpsrldq	\$6,$T0,$T2				# splat input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1137) 	vpaddq		$H2,$D1,$D1		# d1 += h2*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1138) 	vpmuludq	$H4,$T4,$H4		# h4*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1139) 	 vpsrldq	\$6,$T1,$T3				#
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1140) 	vpaddq		$H3,$D2,$H2		# h2 = d2 + h3*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1141) 	vpaddq		$H4,$D3,$H3		# h3 = d3 + h4*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1142) 	vpmuludq	-0x20(%r11),$H0,$H4	# h0*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1143) 	vpmuludq	$H1,$T4,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1144) 	 vpunpckhqdq	$T1,$T0,$T4		# 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1145) 	vpaddq		$H4,$D4,$H4		# h4 = d4 + h0*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1146) 	vpaddq		$H0,$D0,$H0		# h0 = d0 + h1*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1147) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1148) 	vpunpcklqdq	$T1,$T0,$T0		# 0:1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1149) 	vpunpcklqdq	$T3,$T2,$T3		# 2:3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1150) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1151) 	#vpsrlq		\$40,$T4,$T4		# 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1152) 	vpsrldq		\$`40/8`,$T4,$T4	# 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1153) 	vpsrlq		\$26,$T0,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1154) 	 vmovdqa	0x00(%rsp),$D4		# preload r0^2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1155) 	vpand		$MASK,$T0,$T0		# 0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1156) 	vpsrlq		\$4,$T3,$T2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1157) 	vpand		$MASK,$T1,$T1		# 1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1158) 	vpand		0(%rcx),$T4,$T4		# .Lmask24
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1159) 	vpsrlq		\$30,$T3,$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1160) 	vpand		$MASK,$T2,$T2		# 2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1161) 	vpand		$MASK,$T3,$T3		# 3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1162) 	vpor		32(%rcx),$T4,$T4	# padbit, yes, always
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1163) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1164) 	################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1165) 	# lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1166) 	# and P. Schwabe
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1167) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1168) 	vpsrlq		\$26,$H3,$D3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1169) 	vpand		$MASK,$H3,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1170) 	vpaddq		$D3,$H4,$H4		# h3 -> h4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1171) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1172) 	vpsrlq		\$26,$H0,$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1173) 	vpand		$MASK,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1174) 	vpaddq		$D0,$D1,$H1		# h0 -> h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1175) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1176) 	vpsrlq		\$26,$H4,$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1177) 	vpand		$MASK,$H4,$H4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1178) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1179) 	vpsrlq		\$26,$H1,$D1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1180) 	vpand		$MASK,$H1,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1181) 	vpaddq		$D1,$H2,$H2		# h1 -> h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1182) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1183) 	vpaddq		$D0,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1184) 	vpsllq		\$2,$D0,$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1185) 	vpaddq		$D0,$H0,$H0		# h4 -> h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1186) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1187) 	vpsrlq		\$26,$H2,$D2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1188) 	vpand		$MASK,$H2,$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1189) 	vpaddq		$D2,$H3,$H3		# h2 -> h3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1190) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1191) 	vpsrlq		\$26,$H0,$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1192) 	vpand		$MASK,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1193) 	vpaddq		$D0,$H1,$H1		# h0 -> h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1194) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1195) 	vpsrlq		\$26,$H3,$D3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1196) 	vpand		$MASK,$H3,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1197) 	vpaddq		$D3,$H4,$H4		# h3 -> h4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1198) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1199) 	ja		.Loop_avx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1200) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1201) .Lskip_loop_avx:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1202) 	################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1203) 	# multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1204) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1205) 	vpshufd		\$0x10,$D4,$D4		# r0^n, xx12 -> x1x2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1206) 	add		\$32,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1207) 	jnz		.Long_tail_avx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1208) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1209) 	vpaddq		$H2,$T2,$T2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1210) 	vpaddq		$H0,$T0,$T0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1211) 	vpaddq		$H1,$T1,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1212) 	vpaddq		$H3,$T3,$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1213) 	vpaddq		$H4,$T4,$T4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1214) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1215) .Long_tail_avx:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1216) 	vmovdqa		$H2,0x20(%r11)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1217) 	vmovdqa		$H0,0x00(%r11)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1218) 	vmovdqa		$H1,0x10(%r11)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1219) 	vmovdqa		$H3,0x30(%r11)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1220) 	vmovdqa		$H4,0x40(%r11)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1221) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1222) 	# d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1223) 	# d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1224) 	# d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1225) 	# d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1226) 	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1227) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1228) 	vpmuludq	$T2,$D4,$D2		# d2 = h2*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1229) 	vpmuludq	$T0,$D4,$D0		# d0 = h0*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1230) 	 vpshufd	\$0x10,`16*1-64`($ctx),$H2		# r1^n
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1231) 	vpmuludq	$T1,$D4,$D1		# d1 = h1*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1232) 	vpmuludq	$T3,$D4,$D3		# d3 = h3*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1233) 	vpmuludq	$T4,$D4,$D4		# d4 = h4*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1234) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1235) 	vpmuludq	$T3,$H2,$H0		# h3*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1236) 	vpaddq		$H0,$D4,$D4		# d4 += h3*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1237) 	 vpshufd	\$0x10,`16*2-64`($ctx),$H3		# s1^n
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1238) 	vpmuludq	$T2,$H2,$H1		# h2*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1239) 	vpaddq		$H1,$D3,$D3		# d3 += h2*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1240) 	 vpshufd	\$0x10,`16*3-64`($ctx),$H4		# r2^n
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1241) 	vpmuludq	$T1,$H2,$H0		# h1*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1242) 	vpaddq		$H0,$D2,$D2		# d2 += h1*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1243) 	vpmuludq	$T0,$H2,$H2		# h0*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1244) 	vpaddq		$H2,$D1,$D1		# d1 += h0*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1245) 	vpmuludq	$T4,$H3,$H3		# h4*s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1246) 	vpaddq		$H3,$D0,$D0		# d0 += h4*s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1247) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1248) 	 vpshufd	\$0x10,`16*4-64`($ctx),$H2		# s2^n
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1249) 	vpmuludq	$T2,$H4,$H1		# h2*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1250) 	vpaddq		$H1,$D4,$D4		# d4 += h2*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1251) 	vpmuludq	$T1,$H4,$H0		# h1*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1252) 	vpaddq		$H0,$D3,$D3		# d3 += h1*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1253) 	 vpshufd	\$0x10,`16*5-64`($ctx),$H3		# r3^n
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1254) 	vpmuludq	$T0,$H4,$H4		# h0*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1255) 	vpaddq		$H4,$D2,$D2		# d2 += h0*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1256) 	vpmuludq	$T4,$H2,$H1		# h4*s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1257) 	vpaddq		$H1,$D1,$D1		# d1 += h4*s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1258) 	 vpshufd	\$0x10,`16*6-64`($ctx),$H4		# s3^n
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1259) 	vpmuludq	$T3,$H2,$H2		# h3*s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1260) 	vpaddq		$H2,$D0,$D0		# d0 += h3*s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1261) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1262) 	vpmuludq	$T1,$H3,$H0		# h1*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1263) 	vpaddq		$H0,$D4,$D4		# d4 += h1*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1264) 	vpmuludq	$T0,$H3,$H3		# h0*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1265) 	vpaddq		$H3,$D3,$D3		# d3 += h0*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1266) 	 vpshufd	\$0x10,`16*7-64`($ctx),$H2		# r4^n
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1267) 	vpmuludq	$T4,$H4,$H1		# h4*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1268) 	vpaddq		$H1,$D2,$D2		# d2 += h4*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1269) 	 vpshufd	\$0x10,`16*8-64`($ctx),$H3		# s4^n
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1270) 	vpmuludq	$T3,$H4,$H0		# h3*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1271) 	vpaddq		$H0,$D1,$D1		# d1 += h3*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1272) 	vpmuludq	$T2,$H4,$H4		# h2*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1273) 	vpaddq		$H4,$D0,$D0		# d0 += h2*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1274) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1275) 	vpmuludq	$T0,$H2,$H2		# h0*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1276) 	vpaddq		$H2,$D4,$D4		# h4 = d4 + h0*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1277) 	vpmuludq	$T4,$H3,$H1		# h4*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1278) 	vpaddq		$H1,$D3,$D3		# h3 = d3 + h4*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1279) 	vpmuludq	$T3,$H3,$H0		# h3*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1280) 	vpaddq		$H0,$D2,$D2		# h2 = d2 + h3*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1281) 	vpmuludq	$T2,$H3,$H1		# h2*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1282) 	vpaddq		$H1,$D1,$D1		# h1 = d1 + h2*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1283) 	vpmuludq	$T1,$H3,$H3		# h1*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1284) 	vpaddq		$H3,$D0,$D0		# h0 = d0 + h1*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1285) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1286) 	jz		.Lshort_tail_avx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1287) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1288) 	vmovdqu		16*0($inp),$H0		# load input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1289) 	vmovdqu		16*1($inp),$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1290) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1291) 	vpsrldq		\$6,$H0,$H2		# splat input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1292) 	vpsrldq		\$6,$H1,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1293) 	vpunpckhqdq	$H1,$H0,$H4		# 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1294) 	vpunpcklqdq	$H1,$H0,$H0		# 0:1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1295) 	vpunpcklqdq	$H3,$H2,$H3		# 2:3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1296) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1297) 	vpsrlq		\$40,$H4,$H4		# 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1298) 	vpsrlq		\$26,$H0,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1299) 	vpand		$MASK,$H0,$H0		# 0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1300) 	vpsrlq		\$4,$H3,$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1301) 	vpand		$MASK,$H1,$H1		# 1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1302) 	vpsrlq		\$30,$H3,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1303) 	vpand		$MASK,$H2,$H2		# 2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1304) 	vpand		$MASK,$H3,$H3		# 3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1305) 	vpor		32(%rcx),$H4,$H4	# padbit, yes, always
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1306) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1307) 	vpshufd		\$0x32,`16*0-64`($ctx),$T4	# r0^n, 34xx -> x3x4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1308) 	vpaddq		0x00(%r11),$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1309) 	vpaddq		0x10(%r11),$H1,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1310) 	vpaddq		0x20(%r11),$H2,$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1311) 	vpaddq		0x30(%r11),$H3,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1312) 	vpaddq		0x40(%r11),$H4,$H4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1313) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1314) 	################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1315) 	# multiply (inp[0:1]+hash) by r^4:r^3 and accumulate
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1316) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1317) 	vpmuludq	$H0,$T4,$T0		# h0*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1318) 	vpaddq		$T0,$D0,$D0		# d0 += h0*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1319) 	vpmuludq	$H1,$T4,$T1		# h1*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1320) 	vpaddq		$T1,$D1,$D1		# d1 += h1*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1321) 	vpmuludq	$H2,$T4,$T0		# h2*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1322) 	vpaddq		$T0,$D2,$D2		# d2 += h2*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1323) 	 vpshufd	\$0x32,`16*1-64`($ctx),$T2		# r1^n
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1324) 	vpmuludq	$H3,$T4,$T1		# h3*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1325) 	vpaddq		$T1,$D3,$D3		# d3 += h3*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1326) 	vpmuludq	$H4,$T4,$T4		# h4*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1327) 	vpaddq		$T4,$D4,$D4		# d4 += h4*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1328) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1329) 	vpmuludq	$H3,$T2,$T0		# h3*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1330) 	vpaddq		$T0,$D4,$D4		# d4 += h3*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1331) 	 vpshufd	\$0x32,`16*2-64`($ctx),$T3		# s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1332) 	vpmuludq	$H2,$T2,$T1		# h2*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1333) 	vpaddq		$T1,$D3,$D3		# d3 += h2*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1334) 	 vpshufd	\$0x32,`16*3-64`($ctx),$T4		# r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1335) 	vpmuludq	$H1,$T2,$T0		# h1*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1336) 	vpaddq		$T0,$D2,$D2		# d2 += h1*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1337) 	vpmuludq	$H0,$T2,$T2		# h0*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1338) 	vpaddq		$T2,$D1,$D1		# d1 += h0*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1339) 	vpmuludq	$H4,$T3,$T3		# h4*s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1340) 	vpaddq		$T3,$D0,$D0		# d0 += h4*s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1341) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1342) 	 vpshufd	\$0x32,`16*4-64`($ctx),$T2		# s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1343) 	vpmuludq	$H2,$T4,$T1		# h2*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1344) 	vpaddq		$T1,$D4,$D4		# d4 += h2*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1345) 	vpmuludq	$H1,$T4,$T0		# h1*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1346) 	vpaddq		$T0,$D3,$D3		# d3 += h1*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1347) 	 vpshufd	\$0x32,`16*5-64`($ctx),$T3		# r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1348) 	vpmuludq	$H0,$T4,$T4		# h0*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1349) 	vpaddq		$T4,$D2,$D2		# d2 += h0*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1350) 	vpmuludq	$H4,$T2,$T1		# h4*s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1351) 	vpaddq		$T1,$D1,$D1		# d1 += h4*s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1352) 	 vpshufd	\$0x32,`16*6-64`($ctx),$T4		# s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1353) 	vpmuludq	$H3,$T2,$T2		# h3*s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1354) 	vpaddq		$T2,$D0,$D0		# d0 += h3*s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1355) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1356) 	vpmuludq	$H1,$T3,$T0		# h1*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1357) 	vpaddq		$T0,$D4,$D4		# d4 += h1*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1358) 	vpmuludq	$H0,$T3,$T3		# h0*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1359) 	vpaddq		$T3,$D3,$D3		# d3 += h0*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1360) 	 vpshufd	\$0x32,`16*7-64`($ctx),$T2		# r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1361) 	vpmuludq	$H4,$T4,$T1		# h4*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1362) 	vpaddq		$T1,$D2,$D2		# d2 += h4*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1363) 	 vpshufd	\$0x32,`16*8-64`($ctx),$T3		# s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1364) 	vpmuludq	$H3,$T4,$T0		# h3*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1365) 	vpaddq		$T0,$D1,$D1		# d1 += h3*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1366) 	vpmuludq	$H2,$T4,$T4		# h2*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1367) 	vpaddq		$T4,$D0,$D0		# d0 += h2*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1368) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1369) 	vpmuludq	$H0,$T2,$T2		# h0*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1370) 	vpaddq		$T2,$D4,$D4		# d4 += h0*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1371) 	vpmuludq	$H4,$T3,$T1		# h4*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1372) 	vpaddq		$T1,$D3,$D3		# d3 += h4*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1373) 	vpmuludq	$H3,$T3,$T0		# h3*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1374) 	vpaddq		$T0,$D2,$D2		# d2 += h3*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1375) 	vpmuludq	$H2,$T3,$T1		# h2*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1376) 	vpaddq		$T1,$D1,$D1		# d1 += h2*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1377) 	vpmuludq	$H1,$T3,$T3		# h1*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1378) 	vpaddq		$T3,$D0,$D0		# d0 += h1*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1379) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1380) .Lshort_tail_avx:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1381) 	################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1382) 	# horizontal addition
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1383) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1384) 	vpsrldq		\$8,$D4,$T4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1385) 	vpsrldq		\$8,$D3,$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1386) 	vpsrldq		\$8,$D1,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1387) 	vpsrldq		\$8,$D0,$T0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1388) 	vpsrldq		\$8,$D2,$T2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1389) 	vpaddq		$T3,$D3,$D3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1390) 	vpaddq		$T4,$D4,$D4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1391) 	vpaddq		$T0,$D0,$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1392) 	vpaddq		$T1,$D1,$D1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1393) 	vpaddq		$T2,$D2,$D2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1394) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1395) 	################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1396) 	# lazy reduction
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1397) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1398) 	vpsrlq		\$26,$D3,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1399) 	vpand		$MASK,$D3,$D3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1400) 	vpaddq		$H3,$D4,$D4		# h3 -> h4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1401) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1402) 	vpsrlq		\$26,$D0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1403) 	vpand		$MASK,$D0,$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1404) 	vpaddq		$H0,$D1,$D1		# h0 -> h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1405) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1406) 	vpsrlq		\$26,$D4,$H4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1407) 	vpand		$MASK,$D4,$D4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1408) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1409) 	vpsrlq		\$26,$D1,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1410) 	vpand		$MASK,$D1,$D1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1411) 	vpaddq		$H1,$D2,$D2		# h1 -> h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1412) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1413) 	vpaddq		$H4,$D0,$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1414) 	vpsllq		\$2,$H4,$H4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1415) 	vpaddq		$H4,$D0,$D0		# h4 -> h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1416) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1417) 	vpsrlq		\$26,$D2,$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1418) 	vpand		$MASK,$D2,$D2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1419) 	vpaddq		$H2,$D3,$D3		# h2 -> h3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1420) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1421) 	vpsrlq		\$26,$D0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1422) 	vpand		$MASK,$D0,$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1423) 	vpaddq		$H0,$D1,$D1		# h0 -> h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1424) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1425) 	vpsrlq		\$26,$D3,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1426) 	vpand		$MASK,$D3,$D3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1427) 	vpaddq		$H3,$D4,$D4		# h3 -> h4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1428) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1429) 	vmovd		$D0,`4*0-48-64`($ctx)	# save partially reduced
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1430) 	vmovd		$D1,`4*1-48-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1431) 	vmovd		$D2,`4*2-48-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1432) 	vmovd		$D3,`4*3-48-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1433) 	vmovd		$D4,`4*4-48-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1434) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1435) $code.=<<___	if ($win64);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1436) 	vmovdqa		0x50(%r11),%xmm6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1437) 	vmovdqa		0x60(%r11),%xmm7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1438) 	vmovdqa		0x70(%r11),%xmm8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1439) 	vmovdqa		0x80(%r11),%xmm9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1440) 	vmovdqa		0x90(%r11),%xmm10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1441) 	vmovdqa		0xa0(%r11),%xmm11
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1442) 	vmovdqa		0xb0(%r11),%xmm12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1443) 	vmovdqa		0xc0(%r11),%xmm13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1444) 	vmovdqa		0xd0(%r11),%xmm14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1445) 	vmovdqa		0xe0(%r11),%xmm15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1446) 	lea		0xf8(%r11),%rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1447) .Ldo_avx_epilogue:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1448) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1449) $code.=<<___	if (!$win64);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1450) 	lea		-8(%r10),%rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1451) .cfi_def_cfa_register	%rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1452) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1453) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1454) 	vzeroupper
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1455) 	ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1456) .cfi_endproc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1457) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1458) &end_function("poly1305_blocks_avx");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1459) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1460) &declare_function("poly1305_emit_avx", 32, 3);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1461) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1462) 	cmpl	\$0,20($ctx)	# is_base2_26?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1463) 	je	.Lemit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1464) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1465) 	mov	0($ctx),%eax	# load hash value base 2^26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1466) 	mov	4($ctx),%ecx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1467) 	mov	8($ctx),%r8d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1468) 	mov	12($ctx),%r11d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1469) 	mov	16($ctx),%r10d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1470) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1471) 	shl	\$26,%rcx	# base 2^26 -> base 2^64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1472) 	mov	%r8,%r9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1473) 	shl	\$52,%r8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1474) 	add	%rcx,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1475) 	shr	\$12,%r9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1476) 	add	%rax,%r8	# h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1477) 	adc	\$0,%r9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1478) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1479) 	shl	\$14,%r11
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1480) 	mov	%r10,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1481) 	shr	\$24,%r10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1482) 	add	%r11,%r9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1483) 	shl	\$40,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1484) 	add	%rax,%r9	# h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1485) 	adc	\$0,%r10	# h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1486) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1487) 	mov	%r10,%rax	# could be partially reduced, so reduce
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1488) 	mov	%r10,%rcx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1489) 	and	\$3,%r10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1490) 	shr	\$2,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1491) 	and	\$-4,%rcx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1492) 	add	%rcx,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1493) 	add	%rax,%r8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1494) 	adc	\$0,%r9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1495) 	adc	\$0,%r10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1496) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1497) 	mov	%r8,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1498) 	add	\$5,%r8		# compare to modulus
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1499) 	mov	%r9,%rcx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1500) 	adc	\$0,%r9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1501) 	adc	\$0,%r10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1502) 	shr	\$2,%r10	# did 130-bit value overflow?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1503) 	cmovnz	%r8,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1504) 	cmovnz	%r9,%rcx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1505) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1506) 	add	0($nonce),%rax	# accumulate nonce
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1507) 	adc	8($nonce),%rcx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1508) 	mov	%rax,0($mac)	# write result
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1509) 	mov	%rcx,8($mac)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1510) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1511) 	ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1512) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1513) &end_function("poly1305_emit_avx");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1514) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1515) if ($avx>1) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1516) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1517) my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1518)     map("%ymm$_",(0..15));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1519) my $S4=$MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1520) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1521) sub poly1305_blocks_avxN {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1522) 	my ($avx512) = @_;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1523) 	my $suffix = $avx512 ? "_avx512" : "";
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1524) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1525) .cfi_startproc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1526) 	mov	20($ctx),%r8d		# is_base2_26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1527) 	cmp	\$128,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1528) 	jae	.Lblocks_avx2$suffix
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1529) 	test	%r8d,%r8d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1530) 	jz	.Lblocks
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1531) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1532) .Lblocks_avx2$suffix:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1533) 	and	\$-16,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1534) 	jz	.Lno_data_avx2$suffix
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1535) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1536) 	vzeroupper
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1537) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1538) 	test	%r8d,%r8d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1539) 	jz	.Lbase2_64_avx2$suffix
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1540) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1541) 	test	\$63,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1542) 	jz	.Leven_avx2$suffix
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1543) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1544) 	push	%rbp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1545) .cfi_push	%rbp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1546) 	mov 	%rsp,%rbp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1547) 	push	%rbx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1548) .cfi_push	%rbx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1549) 	push	%r12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1550) .cfi_push	%r12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1551) 	push	%r13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1552) .cfi_push	%r13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1553) 	push	%r14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1554) .cfi_push	%r14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1555) 	push	%r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1556) .cfi_push	%r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1557) .Lblocks_avx2_body$suffix:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1558) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1559) 	mov	$len,%r15		# reassign $len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1560) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1561) 	mov	0($ctx),$d1		# load hash value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1562) 	mov	8($ctx),$d2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1563) 	mov	16($ctx),$h2#d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1564) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1565) 	mov	24($ctx),$r0		# load r
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1566) 	mov	32($ctx),$s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1567) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1568) 	################################# base 2^26 -> base 2^64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1569) 	mov	$d1#d,$h0#d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1570) 	and	\$`-1*(1<<31)`,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1571) 	mov	$d2,$r1			# borrow $r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1572) 	mov	$d2#d,$h1#d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1573) 	and	\$`-1*(1<<31)`,$d2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1574) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1575) 	shr	\$6,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1576) 	shl	\$52,$r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1577) 	add	$d1,$h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1578) 	shr	\$12,$h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1579) 	shr	\$18,$d2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1580) 	add	$r1,$h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1581) 	adc	$d2,$h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1582) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1583) 	mov	$h2,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1584) 	shl	\$40,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1585) 	shr	\$24,$h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1586) 	add	$d1,$h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1587) 	adc	\$0,$h2			# can be partially reduced...
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1588) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1589) 	mov	\$-4,$d2		# ... so reduce
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1590) 	mov	$h2,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1591) 	and	$h2,$d2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1592) 	shr	\$2,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1593) 	and	\$3,$h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1594) 	add	$d2,$d1			# =*5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1595) 	add	$d1,$h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1596) 	adc	\$0,$h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1597) 	adc	\$0,$h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1598) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1599) 	mov	$s1,$r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1600) 	mov	$s1,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1601) 	shr	\$2,$s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1602) 	add	$r1,$s1			# s1 = r1 + (r1 >> 2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1603) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1604) .Lbase2_26_pre_avx2$suffix:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1605) 	add	0($inp),$h0		# accumulate input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1606) 	adc	8($inp),$h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1607) 	lea	16($inp),$inp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1608) 	adc	$padbit,$h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1609) 	sub	\$16,%r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1610) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1611) 	call	__poly1305_block
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1612) 	mov	$r1,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1613) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1614) 	test	\$63,%r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1615) 	jnz	.Lbase2_26_pre_avx2$suffix
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1616) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1617) 	test	$padbit,$padbit		# if $padbit is zero,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1618) 	jz	.Lstore_base2_64_avx2$suffix	# store hash in base 2^64 format
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1619) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1620) 	################################# base 2^64 -> base 2^26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1621) 	mov	$h0,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1622) 	mov	$h0,%rdx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1623) 	shr	\$52,$h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1624) 	mov	$h1,$r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1625) 	mov	$h1,$r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1626) 	shr	\$26,%rdx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1627) 	and	\$0x3ffffff,%rax	# h[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1628) 	shl	\$12,$r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1629) 	and	\$0x3ffffff,%rdx	# h[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1630) 	shr	\$14,$h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1631) 	or	$r0,$h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1632) 	shl	\$24,$h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1633) 	and	\$0x3ffffff,$h0		# h[2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1634) 	shr	\$40,$r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1635) 	and	\$0x3ffffff,$h1		# h[3]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1636) 	or	$r1,$h2			# h[4]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1637) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1638) 	test	%r15,%r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1639) 	jz	.Lstore_base2_26_avx2$suffix
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1640) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1641) 	vmovd	%rax#d,%x#$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1642) 	vmovd	%rdx#d,%x#$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1643) 	vmovd	$h0#d,%x#$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1644) 	vmovd	$h1#d,%x#$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1645) 	vmovd	$h2#d,%x#$H4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1646) 	jmp	.Lproceed_avx2$suffix
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1647) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1648) .align	32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1649) .Lstore_base2_64_avx2$suffix:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1650) 	mov	$h0,0($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1651) 	mov	$h1,8($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1652) 	mov	$h2,16($ctx)		# note that is_base2_26 is zeroed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1653) 	jmp	.Ldone_avx2$suffix
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1654) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1655) .align	16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1656) .Lstore_base2_26_avx2$suffix:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1657) 	mov	%rax#d,0($ctx)		# store hash value base 2^26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1658) 	mov	%rdx#d,4($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1659) 	mov	$h0#d,8($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1660) 	mov	$h1#d,12($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1661) 	mov	$h2#d,16($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1662) .align	16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1663) .Ldone_avx2$suffix:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1664) 	pop 		%r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1665) .cfi_restore	%r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1666) 	pop 		%r14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1667) .cfi_restore	%r14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1668) 	pop 		%r13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1669) .cfi_restore	%r13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1670) 	pop 		%r12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1671) .cfi_restore	%r12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1672) 	pop 		%rbx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1673) .cfi_restore	%rbx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1674) 	pop 		%rbp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1675) .cfi_restore 	%rbp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1676) .Lno_data_avx2$suffix:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1677) .Lblocks_avx2_epilogue$suffix:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1678) 	ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1679) .cfi_endproc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1680) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1681) .align	32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1682) .Lbase2_64_avx2$suffix:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1683) .cfi_startproc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1684) 	push	%rbp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1685) .cfi_push	%rbp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1686) 	mov 	%rsp,%rbp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1687) 	push	%rbx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1688) .cfi_push	%rbx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1689) 	push	%r12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1690) .cfi_push	%r12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1691) 	push	%r13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1692) .cfi_push	%r13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1693) 	push	%r14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1694) .cfi_push	%r14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1695) 	push	%r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1696) .cfi_push	%r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1697) .Lbase2_64_avx2_body$suffix:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1698) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1699) 	mov	$len,%r15		# reassign $len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1700) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1701) 	mov	24($ctx),$r0		# load r
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1702) 	mov	32($ctx),$s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1703) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1704) 	mov	0($ctx),$h0		# load hash value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1705) 	mov	8($ctx),$h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1706) 	mov	16($ctx),$h2#d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1707) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1708) 	mov	$s1,$r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1709) 	mov	$s1,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1710) 	shr	\$2,$s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1711) 	add	$r1,$s1			# s1 = r1 + (r1 >> 2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1712) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1713) 	test	\$63,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1714) 	jz	.Linit_avx2$suffix
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1715) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1716) .Lbase2_64_pre_avx2$suffix:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1717) 	add	0($inp),$h0		# accumulate input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1718) 	adc	8($inp),$h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1719) 	lea	16($inp),$inp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1720) 	adc	$padbit,$h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1721) 	sub	\$16,%r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1722) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1723) 	call	__poly1305_block
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1724) 	mov	$r1,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1725) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1726) 	test	\$63,%r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1727) 	jnz	.Lbase2_64_pre_avx2$suffix
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1728) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1729) .Linit_avx2$suffix:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1730) 	################################# base 2^64 -> base 2^26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1731) 	mov	$h0,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1732) 	mov	$h0,%rdx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1733) 	shr	\$52,$h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1734) 	mov	$h1,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1735) 	mov	$h1,$d2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1736) 	shr	\$26,%rdx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1737) 	and	\$0x3ffffff,%rax	# h[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1738) 	shl	\$12,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1739) 	and	\$0x3ffffff,%rdx	# h[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1740) 	shr	\$14,$h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1741) 	or	$d1,$h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1742) 	shl	\$24,$h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1743) 	and	\$0x3ffffff,$h0		# h[2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1744) 	shr	\$40,$d2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1745) 	and	\$0x3ffffff,$h1		# h[3]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1746) 	or	$d2,$h2			# h[4]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1747) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1748) 	vmovd	%rax#d,%x#$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1749) 	vmovd	%rdx#d,%x#$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1750) 	vmovd	$h0#d,%x#$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1751) 	vmovd	$h1#d,%x#$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1752) 	vmovd	$h2#d,%x#$H4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1753) 	movl	\$1,20($ctx)		# set is_base2_26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1754) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1755) 	call	__poly1305_init_avx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1756) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1757) .Lproceed_avx2$suffix:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1758) 	mov	%r15,$len			# restore $len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1759) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1760) $code.=<<___ if (!$kernel);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1761) 	mov	OPENSSL_ia32cap_P+8(%rip),%r9d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1762) 	mov	\$`(1<<31|1<<30|1<<16)`,%r11d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1763) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1764) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1765) 	pop 		%r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1766) .cfi_restore	%r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1767) 	pop 		%r14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1768) .cfi_restore	%r14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1769) 	pop 		%r13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1770) .cfi_restore	%r13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1771) 	pop 		%r12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1772) .cfi_restore	%r12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1773) 	pop 		%rbx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1774) .cfi_restore	%rbx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1775) 	pop 		%rbp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1776) .cfi_restore 	%rbp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1777) .Lbase2_64_avx2_epilogue$suffix:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1778) 	jmp	.Ldo_avx2$suffix
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1779) .cfi_endproc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1780) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1781) .align	32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1782) .Leven_avx2$suffix:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1783) .cfi_startproc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1784) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1785) $code.=<<___ if (!$kernel);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1786) 	mov		OPENSSL_ia32cap_P+8(%rip),%r9d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1787) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1788) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1789) 	vmovd		4*0($ctx),%x#$H0	# load hash value base 2^26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1790) 	vmovd		4*1($ctx),%x#$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1791) 	vmovd		4*2($ctx),%x#$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1792) 	vmovd		4*3($ctx),%x#$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1793) 	vmovd		4*4($ctx),%x#$H4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1794) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1795) .Ldo_avx2$suffix:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1796) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1797) $code.=<<___		if (!$kernel && $avx>2);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1798) 	cmp		\$512,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1799) 	jb		.Lskip_avx512
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1800) 	and		%r11d,%r9d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1801) 	test		\$`1<<16`,%r9d		# check for AVX512F
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1802) 	jnz		.Lblocks_avx512
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1803) .Lskip_avx512$suffix:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1804) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1805) $code.=<<___ if ($avx > 2 && $avx512 && $kernel);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1806) 	cmp		\$512,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1807) 	jae		.Lblocks_avx512
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1808) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1809) $code.=<<___	if (!$win64);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1810) 	lea		8(%rsp),%r10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1811) .cfi_def_cfa_register	%r10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1812) 	sub		\$0x128,%rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1813) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1814) $code.=<<___	if ($win64);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1815) 	lea		8(%rsp),%r10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1816) 	sub		\$0x1c8,%rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1817) 	vmovdqa		%xmm6,-0xb0(%r10)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1818) 	vmovdqa		%xmm7,-0xa0(%r10)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1819) 	vmovdqa		%xmm8,-0x90(%r10)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1820) 	vmovdqa		%xmm9,-0x80(%r10)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1821) 	vmovdqa		%xmm10,-0x70(%r10)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1822) 	vmovdqa		%xmm11,-0x60(%r10)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1823) 	vmovdqa		%xmm12,-0x50(%r10)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1824) 	vmovdqa		%xmm13,-0x40(%r10)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1825) 	vmovdqa		%xmm14,-0x30(%r10)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1826) 	vmovdqa		%xmm15,-0x20(%r10)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1827) .Ldo_avx2_body$suffix:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1828) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1829) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1830) 	lea		.Lconst(%rip),%rcx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1831) 	lea		48+64($ctx),$ctx	# size optimization
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1832) 	vmovdqa		96(%rcx),$T0		# .Lpermd_avx2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1833) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1834) 	# expand and copy pre-calculated table to stack
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1835) 	vmovdqu		`16*0-64`($ctx),%x#$T2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1836) 	and		\$-512,%rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1837) 	vmovdqu		`16*1-64`($ctx),%x#$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1838) 	vmovdqu		`16*2-64`($ctx),%x#$T4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1839) 	vmovdqu		`16*3-64`($ctx),%x#$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1840) 	vmovdqu		`16*4-64`($ctx),%x#$D1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1841) 	vmovdqu		`16*5-64`($ctx),%x#$D2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1842) 	lea		0x90(%rsp),%rax		# size optimization
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1843) 	vmovdqu		`16*6-64`($ctx),%x#$D3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1844) 	vpermd		$T2,$T0,$T2		# 00003412 -> 14243444
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1845) 	vmovdqu		`16*7-64`($ctx),%x#$D4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1846) 	vpermd		$T3,$T0,$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1847) 	vmovdqu		`16*8-64`($ctx),%x#$MASK
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1848) 	vpermd		$T4,$T0,$T4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1849) 	vmovdqa		$T2,0x00(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1850) 	vpermd		$D0,$T0,$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1851) 	vmovdqa		$T3,0x20-0x90(%rax)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1852) 	vpermd		$D1,$T0,$D1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1853) 	vmovdqa		$T4,0x40-0x90(%rax)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1854) 	vpermd		$D2,$T0,$D2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1855) 	vmovdqa		$D0,0x60-0x90(%rax)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1856) 	vpermd		$D3,$T0,$D3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1857) 	vmovdqa		$D1,0x80-0x90(%rax)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1858) 	vpermd		$D4,$T0,$D4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1859) 	vmovdqa		$D2,0xa0-0x90(%rax)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1860) 	vpermd		$MASK,$T0,$MASK
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1861) 	vmovdqa		$D3,0xc0-0x90(%rax)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1862) 	vmovdqa		$D4,0xe0-0x90(%rax)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1863) 	vmovdqa		$MASK,0x100-0x90(%rax)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1864) 	vmovdqa		64(%rcx),$MASK		# .Lmask26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1865) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1866) 	################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1867) 	# load input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1868) 	vmovdqu		16*0($inp),%x#$T0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1869) 	vmovdqu		16*1($inp),%x#$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1870) 	vinserti128	\$1,16*2($inp),$T0,$T0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1871) 	vinserti128	\$1,16*3($inp),$T1,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1872) 	lea		16*4($inp),$inp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1873) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1874) 	vpsrldq		\$6,$T0,$T2		# splat input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1875) 	vpsrldq		\$6,$T1,$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1876) 	vpunpckhqdq	$T1,$T0,$T4		# 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1877) 	vpunpcklqdq	$T3,$T2,$T2		# 2:3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1878) 	vpunpcklqdq	$T1,$T0,$T0		# 0:1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1879) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1880) 	vpsrlq		\$30,$T2,$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1881) 	vpsrlq		\$4,$T2,$T2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1882) 	vpsrlq		\$26,$T0,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1883) 	vpsrlq		\$40,$T4,$T4		# 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1884) 	vpand		$MASK,$T2,$T2		# 2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1885) 	vpand		$MASK,$T0,$T0		# 0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1886) 	vpand		$MASK,$T1,$T1		# 1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1887) 	vpand		$MASK,$T3,$T3		# 3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1888) 	vpor		32(%rcx),$T4,$T4	# padbit, yes, always
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1889) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1890) 	vpaddq		$H2,$T2,$H2		# accumulate input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1891) 	sub		\$64,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1892) 	jz		.Ltail_avx2$suffix
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1893) 	jmp		.Loop_avx2$suffix
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1894) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1895) .align	32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1896) .Loop_avx2$suffix:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1897) 	################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1898) 	# ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1899) 	# ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1900) 	# ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1901) 	# ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1902) 	#   \________/\__________/
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1903) 	################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1904) 	#vpaddq		$H2,$T2,$H2		# accumulate input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1905) 	vpaddq		$H0,$T0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1906) 	vmovdqa		`32*0`(%rsp),$T0	# r0^4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1907) 	vpaddq		$H1,$T1,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1908) 	vmovdqa		`32*1`(%rsp),$T1	# r1^4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1909) 	vpaddq		$H3,$T3,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1910) 	vmovdqa		`32*3`(%rsp),$T2	# r2^4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1911) 	vpaddq		$H4,$T4,$H4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1912) 	vmovdqa		`32*6-0x90`(%rax),$T3	# s3^4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1913) 	vmovdqa		`32*8-0x90`(%rax),$S4	# s4^4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1914) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1915) 	# d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1916) 	# d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1917) 	# d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1918) 	# d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1919) 	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1920) 	#
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1921) 	# however, as h2 is "chronologically" first one available pull
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1922) 	# corresponding operations up, so it's
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1923) 	#
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1924) 	# d4 = h2*r2   + h4*r0 + h3*r1             + h1*r3   + h0*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1925) 	# d3 = h2*r1   + h3*r0           + h1*r2   + h0*r3   + h4*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1926) 	# d2 = h2*r0           + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1927) 	# d1 = h2*5*r4 + h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1928) 	# d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2           + h1*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1929) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1930) 	vpmuludq	$H2,$T0,$D2		# d2 = h2*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1931) 	vpmuludq	$H2,$T1,$D3		# d3 = h2*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1932) 	vpmuludq	$H2,$T2,$D4		# d4 = h2*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1933) 	vpmuludq	$H2,$T3,$D0		# d0 = h2*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1934) 	vpmuludq	$H2,$S4,$D1		# d1 = h2*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1935) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1936) 	vpmuludq	$H0,$T1,$T4		# h0*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1937) 	vpmuludq	$H1,$T1,$H2		# h1*r1, borrow $H2 as temp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1938) 	vpaddq		$T4,$D1,$D1		# d1 += h0*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1939) 	vpaddq		$H2,$D2,$D2		# d2 += h1*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1940) 	vpmuludq	$H3,$T1,$T4		# h3*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1941) 	vpmuludq	`32*2`(%rsp),$H4,$H2	# h4*s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1942) 	vpaddq		$T4,$D4,$D4		# d4 += h3*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1943) 	vpaddq		$H2,$D0,$D0		# d0 += h4*s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1944) 	 vmovdqa	`32*4-0x90`(%rax),$T1	# s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1945) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1946) 	vpmuludq	$H0,$T0,$T4		# h0*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1947) 	vpmuludq	$H1,$T0,$H2		# h1*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1948) 	vpaddq		$T4,$D0,$D0		# d0 += h0*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1949) 	vpaddq		$H2,$D1,$D1		# d1 += h1*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1950) 	vpmuludq	$H3,$T0,$T4		# h3*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1951) 	vpmuludq	$H4,$T0,$H2		# h4*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1952) 	 vmovdqu	16*0($inp),%x#$T0	# load input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1953) 	vpaddq		$T4,$D3,$D3		# d3 += h3*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1954) 	vpaddq		$H2,$D4,$D4		# d4 += h4*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1955) 	 vinserti128	\$1,16*2($inp),$T0,$T0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1956) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1957) 	vpmuludq	$H3,$T1,$T4		# h3*s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1958) 	vpmuludq	$H4,$T1,$H2		# h4*s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1959) 	 vmovdqu	16*1($inp),%x#$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1960) 	vpaddq		$T4,$D0,$D0		# d0 += h3*s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1961) 	vpaddq		$H2,$D1,$D1		# d1 += h4*s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1962) 	 vmovdqa	`32*5-0x90`(%rax),$H2	# r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1963) 	vpmuludq	$H1,$T2,$T4		# h1*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1964) 	vpmuludq	$H0,$T2,$T2		# h0*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1965) 	vpaddq		$T4,$D3,$D3		# d3 += h1*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1966) 	vpaddq		$T2,$D2,$D2		# d2 += h0*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1967) 	 vinserti128	\$1,16*3($inp),$T1,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1968) 	 lea		16*4($inp),$inp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1969) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1970) 	vpmuludq	$H1,$H2,$T4		# h1*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1971) 	vpmuludq	$H0,$H2,$H2		# h0*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1972) 	 vpsrldq	\$6,$T0,$T2		# splat input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1973) 	vpaddq		$T4,$D4,$D4		# d4 += h1*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1974) 	vpaddq		$H2,$D3,$D3		# d3 += h0*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1975) 	vpmuludq	$H3,$T3,$T4		# h3*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1976) 	vpmuludq	$H4,$T3,$H2		# h4*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1977) 	 vpsrldq	\$6,$T1,$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1978) 	vpaddq		$T4,$D1,$D1		# d1 += h3*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1979) 	vpaddq		$H2,$D2,$D2		# d2 += h4*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1980) 	 vpunpckhqdq	$T1,$T0,$T4		# 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1981) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1982) 	vpmuludq	$H3,$S4,$H3		# h3*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1983) 	vpmuludq	$H4,$S4,$H4		# h4*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1984) 	 vpunpcklqdq	$T1,$T0,$T0		# 0:1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1985) 	vpaddq		$H3,$D2,$H2		# h2 = d2 + h3*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1986) 	vpaddq		$H4,$D3,$H3		# h3 = d3 + h4*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1987) 	 vpunpcklqdq	$T3,$T2,$T3		# 2:3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1988) 	vpmuludq	`32*7-0x90`(%rax),$H0,$H4	# h0*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1989) 	vpmuludq	$H1,$S4,$H0		# h1*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1990) 	vmovdqa		64(%rcx),$MASK		# .Lmask26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1991) 	vpaddq		$H4,$D4,$H4		# h4 = d4 + h0*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1992) 	vpaddq		$H0,$D0,$H0		# h0 = d0 + h1*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1993) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1994) 	################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1995) 	# lazy reduction (interleaved with tail of input splat)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1996) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1997) 	vpsrlq		\$26,$H3,$D3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1998) 	vpand		$MASK,$H3,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1999) 	vpaddq		$D3,$H4,$H4		# h3 -> h4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2000) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2001) 	vpsrlq		\$26,$H0,$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2002) 	vpand		$MASK,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2003) 	vpaddq		$D0,$D1,$H1		# h0 -> h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2004) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2005) 	vpsrlq		\$26,$H4,$D4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2006) 	vpand		$MASK,$H4,$H4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2007) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2008) 	 vpsrlq		\$4,$T3,$T2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2009) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2010) 	vpsrlq		\$26,$H1,$D1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2011) 	vpand		$MASK,$H1,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2012) 	vpaddq		$D1,$H2,$H2		# h1 -> h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2013) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2014) 	vpaddq		$D4,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2015) 	vpsllq		\$2,$D4,$D4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2016) 	vpaddq		$D4,$H0,$H0		# h4 -> h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2017) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2018) 	 vpand		$MASK,$T2,$T2		# 2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2019) 	 vpsrlq		\$26,$T0,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2020) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2021) 	vpsrlq		\$26,$H2,$D2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2022) 	vpand		$MASK,$H2,$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2023) 	vpaddq		$D2,$H3,$H3		# h2 -> h3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2024) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2025) 	 vpaddq		$T2,$H2,$H2		# modulo-scheduled
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2026) 	 vpsrlq		\$30,$T3,$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2027) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2028) 	vpsrlq		\$26,$H0,$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2029) 	vpand		$MASK,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2030) 	vpaddq		$D0,$H1,$H1		# h0 -> h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2031) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2032) 	 vpsrlq		\$40,$T4,$T4		# 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2033) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2034) 	vpsrlq		\$26,$H3,$D3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2035) 	vpand		$MASK,$H3,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2036) 	vpaddq		$D3,$H4,$H4		# h3 -> h4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2037) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2038) 	 vpand		$MASK,$T0,$T0		# 0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2039) 	 vpand		$MASK,$T1,$T1		# 1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2040) 	 vpand		$MASK,$T3,$T3		# 3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2041) 	 vpor		32(%rcx),$T4,$T4	# padbit, yes, always
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2042) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2043) 	sub		\$64,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2044) 	jnz		.Loop_avx2$suffix
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2045) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2046) 	.byte		0x66,0x90
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2047) .Ltail_avx2$suffix:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2048) 	################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2049) 	# while above multiplications were by r^4 in all lanes, in last
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2050) 	# iteration we multiply least significant lane by r^4 and most
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2051) 	# significant one by r, so copy of above except that references
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2052) 	# to the precomputed table are displaced by 4...
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2053) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2054) 	#vpaddq		$H2,$T2,$H2		# accumulate input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2055) 	vpaddq		$H0,$T0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2056) 	vmovdqu		`32*0+4`(%rsp),$T0	# r0^4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2057) 	vpaddq		$H1,$T1,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2058) 	vmovdqu		`32*1+4`(%rsp),$T1	# r1^4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2059) 	vpaddq		$H3,$T3,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2060) 	vmovdqu		`32*3+4`(%rsp),$T2	# r2^4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2061) 	vpaddq		$H4,$T4,$H4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2062) 	vmovdqu		`32*6+4-0x90`(%rax),$T3	# s3^4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2063) 	vmovdqu		`32*8+4-0x90`(%rax),$S4	# s4^4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2064) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2065) 	vpmuludq	$H2,$T0,$D2		# d2 = h2*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2066) 	vpmuludq	$H2,$T1,$D3		# d3 = h2*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2067) 	vpmuludq	$H2,$T2,$D4		# d4 = h2*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2068) 	vpmuludq	$H2,$T3,$D0		# d0 = h2*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2069) 	vpmuludq	$H2,$S4,$D1		# d1 = h2*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2070) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2071) 	vpmuludq	$H0,$T1,$T4		# h0*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2072) 	vpmuludq	$H1,$T1,$H2		# h1*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2073) 	vpaddq		$T4,$D1,$D1		# d1 += h0*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2074) 	vpaddq		$H2,$D2,$D2		# d2 += h1*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2075) 	vpmuludq	$H3,$T1,$T4		# h3*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2076) 	vpmuludq	`32*2+4`(%rsp),$H4,$H2	# h4*s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2077) 	vpaddq		$T4,$D4,$D4		# d4 += h3*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2078) 	vpaddq		$H2,$D0,$D0		# d0 += h4*s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2079) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2080) 	vpmuludq	$H0,$T0,$T4		# h0*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2081) 	vpmuludq	$H1,$T0,$H2		# h1*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2082) 	vpaddq		$T4,$D0,$D0		# d0 += h0*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2083) 	 vmovdqu	`32*4+4-0x90`(%rax),$T1	# s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2084) 	vpaddq		$H2,$D1,$D1		# d1 += h1*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2085) 	vpmuludq	$H3,$T0,$T4		# h3*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2086) 	vpmuludq	$H4,$T0,$H2		# h4*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2087) 	vpaddq		$T4,$D3,$D3		# d3 += h3*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2088) 	vpaddq		$H2,$D4,$D4		# d4 += h4*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2089) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2090) 	vpmuludq	$H3,$T1,$T4		# h3*s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2091) 	vpmuludq	$H4,$T1,$H2		# h4*s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2092) 	vpaddq		$T4,$D0,$D0		# d0 += h3*s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2093) 	vpaddq		$H2,$D1,$D1		# d1 += h4*s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2094) 	 vmovdqu	`32*5+4-0x90`(%rax),$H2	# r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2095) 	vpmuludq	$H1,$T2,$T4		# h1*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2096) 	vpmuludq	$H0,$T2,$T2		# h0*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2097) 	vpaddq		$T4,$D3,$D3		# d3 += h1*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2098) 	vpaddq		$T2,$D2,$D2		# d2 += h0*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2099) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2100) 	vpmuludq	$H1,$H2,$T4		# h1*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2101) 	vpmuludq	$H0,$H2,$H2		# h0*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2102) 	vpaddq		$T4,$D4,$D4		# d4 += h1*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2103) 	vpaddq		$H2,$D3,$D3		# d3 += h0*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2104) 	vpmuludq	$H3,$T3,$T4		# h3*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2105) 	vpmuludq	$H4,$T3,$H2		# h4*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2106) 	vpaddq		$T4,$D1,$D1		# d1 += h3*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2107) 	vpaddq		$H2,$D2,$D2		# d2 += h4*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2108) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2109) 	vpmuludq	$H3,$S4,$H3		# h3*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2110) 	vpmuludq	$H4,$S4,$H4		# h4*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2111) 	vpaddq		$H3,$D2,$H2		# h2 = d2 + h3*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2112) 	vpaddq		$H4,$D3,$H3		# h3 = d3 + h4*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2113) 	vpmuludq	`32*7+4-0x90`(%rax),$H0,$H4		# h0*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2114) 	vpmuludq	$H1,$S4,$H0		# h1*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2115) 	vmovdqa		64(%rcx),$MASK		# .Lmask26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2116) 	vpaddq		$H4,$D4,$H4		# h4 = d4 + h0*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2117) 	vpaddq		$H0,$D0,$H0		# h0 = d0 + h1*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2118) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2119) 	################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2120) 	# horizontal addition
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2121) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2122) 	vpsrldq		\$8,$D1,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2123) 	vpsrldq		\$8,$H2,$T2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2124) 	vpsrldq		\$8,$H3,$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2125) 	vpsrldq		\$8,$H4,$T4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2126) 	vpsrldq		\$8,$H0,$T0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2127) 	vpaddq		$T1,$D1,$D1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2128) 	vpaddq		$T2,$H2,$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2129) 	vpaddq		$T3,$H3,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2130) 	vpaddq		$T4,$H4,$H4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2131) 	vpaddq		$T0,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2132) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2133) 	vpermq		\$0x2,$H3,$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2134) 	vpermq		\$0x2,$H4,$T4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2135) 	vpermq		\$0x2,$H0,$T0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2136) 	vpermq		\$0x2,$D1,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2137) 	vpermq		\$0x2,$H2,$T2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2138) 	vpaddq		$T3,$H3,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2139) 	vpaddq		$T4,$H4,$H4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2140) 	vpaddq		$T0,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2141) 	vpaddq		$T1,$D1,$D1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2142) 	vpaddq		$T2,$H2,$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2143) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2144) 	################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2145) 	# lazy reduction
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2146) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2147) 	vpsrlq		\$26,$H3,$D3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2148) 	vpand		$MASK,$H3,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2149) 	vpaddq		$D3,$H4,$H4		# h3 -> h4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2150) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2151) 	vpsrlq		\$26,$H0,$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2152) 	vpand		$MASK,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2153) 	vpaddq		$D0,$D1,$H1		# h0 -> h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2154) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2155) 	vpsrlq		\$26,$H4,$D4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2156) 	vpand		$MASK,$H4,$H4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2157) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2158) 	vpsrlq		\$26,$H1,$D1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2159) 	vpand		$MASK,$H1,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2160) 	vpaddq		$D1,$H2,$H2		# h1 -> h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2161) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2162) 	vpaddq		$D4,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2163) 	vpsllq		\$2,$D4,$D4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2164) 	vpaddq		$D4,$H0,$H0		# h4 -> h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2165) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2166) 	vpsrlq		\$26,$H2,$D2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2167) 	vpand		$MASK,$H2,$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2168) 	vpaddq		$D2,$H3,$H3		# h2 -> h3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2169) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2170) 	vpsrlq		\$26,$H0,$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2171) 	vpand		$MASK,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2172) 	vpaddq		$D0,$H1,$H1		# h0 -> h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2173) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2174) 	vpsrlq		\$26,$H3,$D3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2175) 	vpand		$MASK,$H3,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2176) 	vpaddq		$D3,$H4,$H4		# h3 -> h4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2177) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2178) 	vmovd		%x#$H0,`4*0-48-64`($ctx)# save partially reduced
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2179) 	vmovd		%x#$H1,`4*1-48-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2180) 	vmovd		%x#$H2,`4*2-48-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2181) 	vmovd		%x#$H3,`4*3-48-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2182) 	vmovd		%x#$H4,`4*4-48-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2183) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2184) $code.=<<___	if ($win64);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2185) 	vmovdqa		-0xb0(%r10),%xmm6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2186) 	vmovdqa		-0xa0(%r10),%xmm7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2187) 	vmovdqa		-0x90(%r10),%xmm8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2188) 	vmovdqa		-0x80(%r10),%xmm9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2189) 	vmovdqa		-0x70(%r10),%xmm10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2190) 	vmovdqa		-0x60(%r10),%xmm11
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2191) 	vmovdqa		-0x50(%r10),%xmm12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2192) 	vmovdqa		-0x40(%r10),%xmm13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2193) 	vmovdqa		-0x30(%r10),%xmm14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2194) 	vmovdqa		-0x20(%r10),%xmm15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2195) 	lea		-8(%r10),%rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2196) .Ldo_avx2_epilogue$suffix:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2197) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2198) $code.=<<___	if (!$win64);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2199) 	lea		-8(%r10),%rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2200) .cfi_def_cfa_register	%rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2201) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2202) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2203) 	vzeroupper
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2204) 	ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2205) .cfi_endproc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2206) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2207) if($avx > 2 && $avx512) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2208) my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2209) my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2210) my $PADBIT="%zmm30";
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2211) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2212) map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3));		# switch to %zmm domain
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2213) map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2214) map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2215) map(s/%y/%z/,($MASK));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2216) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2217) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2218) .cfi_startproc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2219) .Lblocks_avx512:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2220) 	mov		\$15,%eax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2221) 	kmovw		%eax,%k2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2222) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2223) $code.=<<___	if (!$win64);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2224) 	lea		8(%rsp),%r10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2225) .cfi_def_cfa_register	%r10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2226) 	sub		\$0x128,%rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2227) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2228) $code.=<<___	if ($win64);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2229) 	lea		8(%rsp),%r10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2230) 	sub		\$0x1c8,%rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2231) 	vmovdqa		%xmm6,-0xb0(%r10)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2232) 	vmovdqa		%xmm7,-0xa0(%r10)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2233) 	vmovdqa		%xmm8,-0x90(%r10)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2234) 	vmovdqa		%xmm9,-0x80(%r10)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2235) 	vmovdqa		%xmm10,-0x70(%r10)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2236) 	vmovdqa		%xmm11,-0x60(%r10)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2237) 	vmovdqa		%xmm12,-0x50(%r10)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2238) 	vmovdqa		%xmm13,-0x40(%r10)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2239) 	vmovdqa		%xmm14,-0x30(%r10)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2240) 	vmovdqa		%xmm15,-0x20(%r10)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2241) .Ldo_avx512_body:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2242) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2243) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2244) 	lea		.Lconst(%rip),%rcx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2245) 	lea		48+64($ctx),$ctx	# size optimization
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2246) 	vmovdqa		96(%rcx),%y#$T2		# .Lpermd_avx2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2247) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2248) 	# expand pre-calculated table
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2249) 	vmovdqu		`16*0-64`($ctx),%x#$D0	# will become expanded ${R0}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2250) 	and		\$-512,%rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2251) 	vmovdqu		`16*1-64`($ctx),%x#$D1	# will become ... ${R1}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2252) 	mov		\$0x20,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2253) 	vmovdqu		`16*2-64`($ctx),%x#$T0	# ... ${S1}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2254) 	vmovdqu		`16*3-64`($ctx),%x#$D2	# ... ${R2}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2255) 	vmovdqu		`16*4-64`($ctx),%x#$T1	# ... ${S2}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2256) 	vmovdqu		`16*5-64`($ctx),%x#$D3	# ... ${R3}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2257) 	vmovdqu		`16*6-64`($ctx),%x#$T3	# ... ${S3}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2258) 	vmovdqu		`16*7-64`($ctx),%x#$D4	# ... ${R4}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2259) 	vmovdqu		`16*8-64`($ctx),%x#$T4	# ... ${S4}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2260) 	vpermd		$D0,$T2,$R0		# 00003412 -> 14243444
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2261) 	vpbroadcastq	64(%rcx),$MASK		# .Lmask26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2262) 	vpermd		$D1,$T2,$R1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2263) 	vpermd		$T0,$T2,$S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2264) 	vpermd		$D2,$T2,$R2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2265) 	vmovdqa64	$R0,0x00(%rsp){%k2}	# save in case $len%128 != 0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2266) 	 vpsrlq		\$32,$R0,$T0		# 14243444 -> 01020304
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2267) 	vpermd		$T1,$T2,$S2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2268) 	vmovdqu64	$R1,0x00(%rsp,%rax){%k2}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2269) 	 vpsrlq		\$32,$R1,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2270) 	vpermd		$D3,$T2,$R3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2271) 	vmovdqa64	$S1,0x40(%rsp){%k2}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2272) 	vpermd		$T3,$T2,$S3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2273) 	vpermd		$D4,$T2,$R4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2274) 	vmovdqu64	$R2,0x40(%rsp,%rax){%k2}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2275) 	vpermd		$T4,$T2,$S4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2276) 	vmovdqa64	$S2,0x80(%rsp){%k2}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2277) 	vmovdqu64	$R3,0x80(%rsp,%rax){%k2}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2278) 	vmovdqa64	$S3,0xc0(%rsp){%k2}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2279) 	vmovdqu64	$R4,0xc0(%rsp,%rax){%k2}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2280) 	vmovdqa64	$S4,0x100(%rsp){%k2}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2281) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2282) 	################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2283) 	# calculate 5th through 8th powers of the key
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2284) 	#
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2285) 	# d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2286) 	# d1 = r0'*r1 + r1'*r0   + r2'*5*r4 + r3'*5*r3 + r4'*5*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2287) 	# d2 = r0'*r2 + r1'*r1   + r2'*r0   + r3'*5*r4 + r4'*5*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2288) 	# d3 = r0'*r3 + r1'*r2   + r2'*r1   + r3'*r0   + r4'*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2289) 	# d4 = r0'*r4 + r1'*r3   + r2'*r2   + r3'*r1   + r4'*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2290) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2291) 	vpmuludq	$T0,$R0,$D0		# d0 = r0'*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2292) 	vpmuludq	$T0,$R1,$D1		# d1 = r0'*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2293) 	vpmuludq	$T0,$R2,$D2		# d2 = r0'*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2294) 	vpmuludq	$T0,$R3,$D3		# d3 = r0'*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2295) 	vpmuludq	$T0,$R4,$D4		# d4 = r0'*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2296) 	 vpsrlq		\$32,$R2,$T2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2297) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2298) 	vpmuludq	$T1,$S4,$M0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2299) 	vpmuludq	$T1,$R0,$M1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2300) 	vpmuludq	$T1,$R1,$M2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2301) 	vpmuludq	$T1,$R2,$M3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2302) 	vpmuludq	$T1,$R3,$M4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2303) 	 vpsrlq		\$32,$R3,$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2304) 	vpaddq		$M0,$D0,$D0		# d0 += r1'*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2305) 	vpaddq		$M1,$D1,$D1		# d1 += r1'*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2306) 	vpaddq		$M2,$D2,$D2		# d2 += r1'*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2307) 	vpaddq		$M3,$D3,$D3		# d3 += r1'*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2308) 	vpaddq		$M4,$D4,$D4		# d4 += r1'*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2309) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2310) 	vpmuludq	$T2,$S3,$M0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2311) 	vpmuludq	$T2,$S4,$M1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2312) 	vpmuludq	$T2,$R1,$M3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2313) 	vpmuludq	$T2,$R2,$M4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2314) 	vpmuludq	$T2,$R0,$M2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2315) 	 vpsrlq		\$32,$R4,$T4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2316) 	vpaddq		$M0,$D0,$D0		# d0 += r2'*5*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2317) 	vpaddq		$M1,$D1,$D1		# d1 += r2'*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2318) 	vpaddq		$M3,$D3,$D3		# d3 += r2'*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2319) 	vpaddq		$M4,$D4,$D4		# d4 += r2'*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2320) 	vpaddq		$M2,$D2,$D2		# d2 += r2'*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2321) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2322) 	vpmuludq	$T3,$S2,$M0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2323) 	vpmuludq	$T3,$R0,$M3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2324) 	vpmuludq	$T3,$R1,$M4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2325) 	vpmuludq	$T3,$S3,$M1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2326) 	vpmuludq	$T3,$S4,$M2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2327) 	vpaddq		$M0,$D0,$D0		# d0 += r3'*5*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2328) 	vpaddq		$M3,$D3,$D3		# d3 += r3'*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2329) 	vpaddq		$M4,$D4,$D4		# d4 += r3'*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2330) 	vpaddq		$M1,$D1,$D1		# d1 += r3'*5*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2331) 	vpaddq		$M2,$D2,$D2		# d2 += r3'*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2332) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2333) 	vpmuludq	$T4,$S4,$M3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2334) 	vpmuludq	$T4,$R0,$M4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2335) 	vpmuludq	$T4,$S1,$M0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2336) 	vpmuludq	$T4,$S2,$M1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2337) 	vpmuludq	$T4,$S3,$M2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2338) 	vpaddq		$M3,$D3,$D3		# d3 += r2'*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2339) 	vpaddq		$M4,$D4,$D4		# d4 += r2'*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2340) 	vpaddq		$M0,$D0,$D0		# d0 += r2'*5*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2341) 	vpaddq		$M1,$D1,$D1		# d1 += r2'*5*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2342) 	vpaddq		$M2,$D2,$D2		# d2 += r2'*5*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2343) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2344) 	################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2345) 	# load input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2346) 	vmovdqu64	16*0($inp),%z#$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2347) 	vmovdqu64	16*4($inp),%z#$T4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2348) 	lea		16*8($inp),$inp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2349) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2350) 	################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2351) 	# lazy reduction
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2352) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2353) 	vpsrlq		\$26,$D3,$M3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2354) 	vpandq		$MASK,$D3,$D3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2355) 	vpaddq		$M3,$D4,$D4		# d3 -> d4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2356) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2357) 	vpsrlq		\$26,$D0,$M0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2358) 	vpandq		$MASK,$D0,$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2359) 	vpaddq		$M0,$D1,$D1		# d0 -> d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2360) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2361) 	vpsrlq		\$26,$D4,$M4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2362) 	vpandq		$MASK,$D4,$D4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2363) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2364) 	vpsrlq		\$26,$D1,$M1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2365) 	vpandq		$MASK,$D1,$D1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2366) 	vpaddq		$M1,$D2,$D2		# d1 -> d2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2367) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2368) 	vpaddq		$M4,$D0,$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2369) 	vpsllq		\$2,$M4,$M4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2370) 	vpaddq		$M4,$D0,$D0		# d4 -> d0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2371) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2372) 	vpsrlq		\$26,$D2,$M2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2373) 	vpandq		$MASK,$D2,$D2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2374) 	vpaddq		$M2,$D3,$D3		# d2 -> d3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2375) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2376) 	vpsrlq		\$26,$D0,$M0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2377) 	vpandq		$MASK,$D0,$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2378) 	vpaddq		$M0,$D1,$D1		# d0 -> d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2379) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2380) 	vpsrlq		\$26,$D3,$M3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2381) 	vpandq		$MASK,$D3,$D3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2382) 	vpaddq		$M3,$D4,$D4		# d3 -> d4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2383) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2384) 	################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2385) 	# at this point we have 14243444 in $R0-$S4 and 05060708 in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2386) 	# $D0-$D4, ...
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2387) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2388) 	vpunpcklqdq	$T4,$T3,$T0	# transpose input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2389) 	vpunpckhqdq	$T4,$T3,$T4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2390) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2391) 	# ... since input 64-bit lanes are ordered as 73625140, we could
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2392) 	# "vperm" it to 76543210 (here and in each loop iteration), *or*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2393) 	# we could just flow along, hence the goal for $R0-$S4 is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2394) 	# 1858286838784888 ...
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2395) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2396) 	vmovdqa32	128(%rcx),$M0		# .Lpermd_avx512:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2397) 	mov		\$0x7777,%eax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2398) 	kmovw		%eax,%k1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2399) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2400) 	vpermd		$R0,$M0,$R0		# 14243444 -> 1---2---3---4---
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2401) 	vpermd		$R1,$M0,$R1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2402) 	vpermd		$R2,$M0,$R2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2403) 	vpermd		$R3,$M0,$R3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2404) 	vpermd		$R4,$M0,$R4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2405) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2406) 	vpermd		$D0,$M0,${R0}{%k1}	# 05060708 -> 1858286838784888
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2407) 	vpermd		$D1,$M0,${R1}{%k1}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2408) 	vpermd		$D2,$M0,${R2}{%k1}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2409) 	vpermd		$D3,$M0,${R3}{%k1}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2410) 	vpermd		$D4,$M0,${R4}{%k1}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2411) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2412) 	vpslld		\$2,$R1,$S1		# *5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2413) 	vpslld		\$2,$R2,$S2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2414) 	vpslld		\$2,$R3,$S3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2415) 	vpslld		\$2,$R4,$S4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2416) 	vpaddd		$R1,$S1,$S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2417) 	vpaddd		$R2,$S2,$S2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2418) 	vpaddd		$R3,$S3,$S3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2419) 	vpaddd		$R4,$S4,$S4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2420) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2421) 	vpbroadcastq	32(%rcx),$PADBIT	# .L129
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2422) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2423) 	vpsrlq		\$52,$T0,$T2		# splat input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2424) 	vpsllq		\$12,$T4,$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2425) 	vporq		$T3,$T2,$T2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2426) 	vpsrlq		\$26,$T0,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2427) 	vpsrlq		\$14,$T4,$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2428) 	vpsrlq		\$40,$T4,$T4		# 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2429) 	vpandq		$MASK,$T2,$T2		# 2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2430) 	vpandq		$MASK,$T0,$T0		# 0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2431) 	#vpandq		$MASK,$T1,$T1		# 1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2432) 	#vpandq		$MASK,$T3,$T3		# 3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2433) 	#vporq		$PADBIT,$T4,$T4		# padbit, yes, always
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2434) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2435) 	vpaddq		$H2,$T2,$H2		# accumulate input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2436) 	sub		\$192,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2437) 	jbe		.Ltail_avx512
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2438) 	jmp		.Loop_avx512
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2439) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2440) .align	32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2441) .Loop_avx512:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2442) 	################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2443) 	# ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2444) 	# ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2445) 	# ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2446) 	# ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2447) 	# ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2448) 	# ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2449) 	# ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2450) 	# ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2451) 	#   \________/\___________/
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2452) 	################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2453) 	#vpaddq		$H2,$T2,$H2		# accumulate input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2454) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2455) 	# d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2456) 	# d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2457) 	# d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2458) 	# d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2459) 	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2460) 	#
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2461) 	# however, as h2 is "chronologically" first one available pull
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2462) 	# corresponding operations up, so it's
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2463) 	#
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2464) 	# d3 = h2*r1   + h0*r3 + h1*r2   + h3*r0 + h4*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2465) 	# d4 = h2*r2   + h0*r4 + h1*r3   + h3*r1 + h4*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2466) 	# d0 = h2*5*r3 + h0*r0 + h1*5*r4         + h3*5*r2 + h4*5*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2467) 	# d1 = h2*5*r4 + h0*r1           + h1*r0 + h3*5*r3 + h4*5*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2468) 	# d2 = h2*r0           + h0*r2   + h1*r1 + h3*5*r4 + h4*5*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2469) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2470) 	vpmuludq	$H2,$R1,$D3		# d3 = h2*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2471) 	 vpaddq		$H0,$T0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2472) 	vpmuludq	$H2,$R2,$D4		# d4 = h2*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2473) 	 vpandq		$MASK,$T1,$T1		# 1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2474) 	vpmuludq	$H2,$S3,$D0		# d0 = h2*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2475) 	 vpandq		$MASK,$T3,$T3		# 3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2476) 	vpmuludq	$H2,$S4,$D1		# d1 = h2*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2477) 	 vporq		$PADBIT,$T4,$T4		# padbit, yes, always
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2478) 	vpmuludq	$H2,$R0,$D2		# d2 = h2*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2479) 	 vpaddq		$H1,$T1,$H1		# accumulate input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2480) 	 vpaddq		$H3,$T3,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2481) 	 vpaddq		$H4,$T4,$H4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2482) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2483) 	  vmovdqu64	16*0($inp),$T3		# load input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2484) 	  vmovdqu64	16*4($inp),$T4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2485) 	  lea		16*8($inp),$inp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2486) 	vpmuludq	$H0,$R3,$M3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2487) 	vpmuludq	$H0,$R4,$M4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2488) 	vpmuludq	$H0,$R0,$M0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2489) 	vpmuludq	$H0,$R1,$M1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2490) 	vpaddq		$M3,$D3,$D3		# d3 += h0*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2491) 	vpaddq		$M4,$D4,$D4		# d4 += h0*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2492) 	vpaddq		$M0,$D0,$D0		# d0 += h0*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2493) 	vpaddq		$M1,$D1,$D1		# d1 += h0*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2494) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2495) 	vpmuludq	$H1,$R2,$M3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2496) 	vpmuludq	$H1,$R3,$M4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2497) 	vpmuludq	$H1,$S4,$M0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2498) 	vpmuludq	$H0,$R2,$M2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2499) 	vpaddq		$M3,$D3,$D3		# d3 += h1*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2500) 	vpaddq		$M4,$D4,$D4		# d4 += h1*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2501) 	vpaddq		$M0,$D0,$D0		# d0 += h1*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2502) 	vpaddq		$M2,$D2,$D2		# d2 += h0*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2503) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2504) 	  vpunpcklqdq	$T4,$T3,$T0		# transpose input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2505) 	  vpunpckhqdq	$T4,$T3,$T4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2506) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2507) 	vpmuludq	$H3,$R0,$M3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2508) 	vpmuludq	$H3,$R1,$M4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2509) 	vpmuludq	$H1,$R0,$M1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2510) 	vpmuludq	$H1,$R1,$M2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2511) 	vpaddq		$M3,$D3,$D3		# d3 += h3*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2512) 	vpaddq		$M4,$D4,$D4		# d4 += h3*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2513) 	vpaddq		$M1,$D1,$D1		# d1 += h1*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2514) 	vpaddq		$M2,$D2,$D2		# d2 += h1*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2515) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2516) 	vpmuludq	$H4,$S4,$M3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2517) 	vpmuludq	$H4,$R0,$M4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2518) 	vpmuludq	$H3,$S2,$M0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2519) 	vpmuludq	$H3,$S3,$M1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2520) 	vpaddq		$M3,$D3,$D3		# d3 += h4*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2521) 	vpmuludq	$H3,$S4,$M2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2522) 	vpaddq		$M4,$D4,$D4		# d4 += h4*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2523) 	vpaddq		$M0,$D0,$D0		# d0 += h3*s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2524) 	vpaddq		$M1,$D1,$D1		# d1 += h3*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2525) 	vpaddq		$M2,$D2,$D2		# d2 += h3*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2526) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2527) 	vpmuludq	$H4,$S1,$M0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2528) 	vpmuludq	$H4,$S2,$M1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2529) 	vpmuludq	$H4,$S3,$M2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2530) 	vpaddq		$M0,$D0,$H0		# h0 = d0 + h4*s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2531) 	vpaddq		$M1,$D1,$H1		# h1 = d2 + h4*s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2532) 	vpaddq		$M2,$D2,$H2		# h2 = d3 + h4*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2533) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2534) 	################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2535) 	# lazy reduction (interleaved with input splat)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2536) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2537) 	 vpsrlq		\$52,$T0,$T2		# splat input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2538) 	 vpsllq		\$12,$T4,$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2539) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2540) 	vpsrlq		\$26,$D3,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2541) 	vpandq		$MASK,$D3,$D3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2542) 	vpaddq		$H3,$D4,$H4		# h3 -> h4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2543) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2544) 	 vporq		$T3,$T2,$T2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2545) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2546) 	vpsrlq		\$26,$H0,$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2547) 	vpandq		$MASK,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2548) 	vpaddq		$D0,$H1,$H1		# h0 -> h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2549) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2550) 	 vpandq		$MASK,$T2,$T2		# 2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2551) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2552) 	vpsrlq		\$26,$H4,$D4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2553) 	vpandq		$MASK,$H4,$H4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2554) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2555) 	vpsrlq		\$26,$H1,$D1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2556) 	vpandq		$MASK,$H1,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2557) 	vpaddq		$D1,$H2,$H2		# h1 -> h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2558) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2559) 	vpaddq		$D4,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2560) 	vpsllq		\$2,$D4,$D4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2561) 	vpaddq		$D4,$H0,$H0		# h4 -> h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2562) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2563) 	 vpaddq		$T2,$H2,$H2		# modulo-scheduled
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2564) 	 vpsrlq		\$26,$T0,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2565) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2566) 	vpsrlq		\$26,$H2,$D2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2567) 	vpandq		$MASK,$H2,$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2568) 	vpaddq		$D2,$D3,$H3		# h2 -> h3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2569) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2570) 	 vpsrlq		\$14,$T4,$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2571) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2572) 	vpsrlq		\$26,$H0,$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2573) 	vpandq		$MASK,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2574) 	vpaddq		$D0,$H1,$H1		# h0 -> h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2575) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2576) 	 vpsrlq		\$40,$T4,$T4		# 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2577) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2578) 	vpsrlq		\$26,$H3,$D3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2579) 	vpandq		$MASK,$H3,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2580) 	vpaddq		$D3,$H4,$H4		# h3 -> h4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2581) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2582) 	 vpandq		$MASK,$T0,$T0		# 0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2583) 	 #vpandq	$MASK,$T1,$T1		# 1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2584) 	 #vpandq	$MASK,$T3,$T3		# 3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2585) 	 #vporq		$PADBIT,$T4,$T4		# padbit, yes, always
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2586) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2587) 	sub		\$128,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2588) 	ja		.Loop_avx512
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2589) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2590) .Ltail_avx512:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2591) 	################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2592) 	# while above multiplications were by r^8 in all lanes, in last
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2593) 	# iteration we multiply least significant lane by r^8 and most
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2594) 	# significant one by r, that's why table gets shifted...
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2595) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2596) 	vpsrlq		\$32,$R0,$R0		# 0105020603070408
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2597) 	vpsrlq		\$32,$R1,$R1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2598) 	vpsrlq		\$32,$R2,$R2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2599) 	vpsrlq		\$32,$S3,$S3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2600) 	vpsrlq		\$32,$S4,$S4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2601) 	vpsrlq		\$32,$R3,$R3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2602) 	vpsrlq		\$32,$R4,$R4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2603) 	vpsrlq		\$32,$S1,$S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2604) 	vpsrlq		\$32,$S2,$S2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2605) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2606) 	################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2607) 	# load either next or last 64 byte of input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2608) 	lea		($inp,$len),$inp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2609) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2610) 	#vpaddq		$H2,$T2,$H2		# accumulate input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2611) 	vpaddq		$H0,$T0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2612) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2613) 	vpmuludq	$H2,$R1,$D3		# d3 = h2*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2614) 	vpmuludq	$H2,$R2,$D4		# d4 = h2*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2615) 	vpmuludq	$H2,$S3,$D0		# d0 = h2*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2616) 	 vpandq		$MASK,$T1,$T1		# 1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2617) 	vpmuludq	$H2,$S4,$D1		# d1 = h2*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2618) 	 vpandq		$MASK,$T3,$T3		# 3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2619) 	vpmuludq	$H2,$R0,$D2		# d2 = h2*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2620) 	 vporq		$PADBIT,$T4,$T4		# padbit, yes, always
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2621) 	 vpaddq		$H1,$T1,$H1		# accumulate input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2622) 	 vpaddq		$H3,$T3,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2623) 	 vpaddq		$H4,$T4,$H4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2624) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2625) 	  vmovdqu	16*0($inp),%x#$T0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2626) 	vpmuludq	$H0,$R3,$M3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2627) 	vpmuludq	$H0,$R4,$M4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2628) 	vpmuludq	$H0,$R0,$M0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2629) 	vpmuludq	$H0,$R1,$M1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2630) 	vpaddq		$M3,$D3,$D3		# d3 += h0*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2631) 	vpaddq		$M4,$D4,$D4		# d4 += h0*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2632) 	vpaddq		$M0,$D0,$D0		# d0 += h0*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2633) 	vpaddq		$M1,$D1,$D1		# d1 += h0*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2634) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2635) 	  vmovdqu	16*1($inp),%x#$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2636) 	vpmuludq	$H1,$R2,$M3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2637) 	vpmuludq	$H1,$R3,$M4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2638) 	vpmuludq	$H1,$S4,$M0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2639) 	vpmuludq	$H0,$R2,$M2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2640) 	vpaddq		$M3,$D3,$D3		# d3 += h1*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2641) 	vpaddq		$M4,$D4,$D4		# d4 += h1*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2642) 	vpaddq		$M0,$D0,$D0		# d0 += h1*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2643) 	vpaddq		$M2,$D2,$D2		# d2 += h0*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2644) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2645) 	  vinserti128	\$1,16*2($inp),%y#$T0,%y#$T0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2646) 	vpmuludq	$H3,$R0,$M3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2647) 	vpmuludq	$H3,$R1,$M4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2648) 	vpmuludq	$H1,$R0,$M1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2649) 	vpmuludq	$H1,$R1,$M2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2650) 	vpaddq		$M3,$D3,$D3		# d3 += h3*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2651) 	vpaddq		$M4,$D4,$D4		# d4 += h3*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2652) 	vpaddq		$M1,$D1,$D1		# d1 += h1*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2653) 	vpaddq		$M2,$D2,$D2		# d2 += h1*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2654) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2655) 	  vinserti128	\$1,16*3($inp),%y#$T1,%y#$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2656) 	vpmuludq	$H4,$S4,$M3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2657) 	vpmuludq	$H4,$R0,$M4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2658) 	vpmuludq	$H3,$S2,$M0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2659) 	vpmuludq	$H3,$S3,$M1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2660) 	vpmuludq	$H3,$S4,$M2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2661) 	vpaddq		$M3,$D3,$H3		# h3 = d3 + h4*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2662) 	vpaddq		$M4,$D4,$D4		# d4 += h4*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2663) 	vpaddq		$M0,$D0,$D0		# d0 += h3*s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2664) 	vpaddq		$M1,$D1,$D1		# d1 += h3*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2665) 	vpaddq		$M2,$D2,$D2		# d2 += h3*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2666) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2667) 	vpmuludq	$H4,$S1,$M0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2668) 	vpmuludq	$H4,$S2,$M1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2669) 	vpmuludq	$H4,$S3,$M2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2670) 	vpaddq		$M0,$D0,$H0		# h0 = d0 + h4*s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2671) 	vpaddq		$M1,$D1,$H1		# h1 = d2 + h4*s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2672) 	vpaddq		$M2,$D2,$H2		# h2 = d3 + h4*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2673) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2674) 	################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2675) 	# horizontal addition
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2676) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2677) 	mov		\$1,%eax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2678) 	vpermq		\$0xb1,$H3,$D3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2679) 	vpermq		\$0xb1,$D4,$H4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2680) 	vpermq		\$0xb1,$H0,$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2681) 	vpermq		\$0xb1,$H1,$D1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2682) 	vpermq		\$0xb1,$H2,$D2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2683) 	vpaddq		$D3,$H3,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2684) 	vpaddq		$D4,$H4,$H4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2685) 	vpaddq		$D0,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2686) 	vpaddq		$D1,$H1,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2687) 	vpaddq		$D2,$H2,$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2688) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2689) 	kmovw		%eax,%k3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2690) 	vpermq		\$0x2,$H3,$D3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2691) 	vpermq		\$0x2,$H4,$D4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2692) 	vpermq		\$0x2,$H0,$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2693) 	vpermq		\$0x2,$H1,$D1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2694) 	vpermq		\$0x2,$H2,$D2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2695) 	vpaddq		$D3,$H3,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2696) 	vpaddq		$D4,$H4,$H4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2697) 	vpaddq		$D0,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2698) 	vpaddq		$D1,$H1,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2699) 	vpaddq		$D2,$H2,$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2700) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2701) 	vextracti64x4	\$0x1,$H3,%y#$D3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2702) 	vextracti64x4	\$0x1,$H4,%y#$D4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2703) 	vextracti64x4	\$0x1,$H0,%y#$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2704) 	vextracti64x4	\$0x1,$H1,%y#$D1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2705) 	vextracti64x4	\$0x1,$H2,%y#$D2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2706) 	vpaddq		$D3,$H3,${H3}{%k3}{z}	# keep single qword in case
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2707) 	vpaddq		$D4,$H4,${H4}{%k3}{z}	# it's passed to .Ltail_avx2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2708) 	vpaddq		$D0,$H0,${H0}{%k3}{z}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2709) 	vpaddq		$D1,$H1,${H1}{%k3}{z}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2710) 	vpaddq		$D2,$H2,${H2}{%k3}{z}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2711) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2712) map(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2713) map(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2714) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2715) 	################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2716) 	# lazy reduction (interleaved with input splat)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2717) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2718) 	vpsrlq		\$26,$H3,$D3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2719) 	vpand		$MASK,$H3,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2720) 	 vpsrldq	\$6,$T0,$T2		# splat input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2721) 	 vpsrldq	\$6,$T1,$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2722) 	 vpunpckhqdq	$T1,$T0,$T4		# 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2723) 	vpaddq		$D3,$H4,$H4		# h3 -> h4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2724) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2725) 	vpsrlq		\$26,$H0,$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2726) 	vpand		$MASK,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2727) 	 vpunpcklqdq	$T3,$T2,$T2		# 2:3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2728) 	 vpunpcklqdq	$T1,$T0,$T0		# 0:1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2729) 	vpaddq		$D0,$H1,$H1		# h0 -> h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2730) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2731) 	vpsrlq		\$26,$H4,$D4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2732) 	vpand		$MASK,$H4,$H4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2733) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2734) 	vpsrlq		\$26,$H1,$D1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2735) 	vpand		$MASK,$H1,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2736) 	 vpsrlq		\$30,$T2,$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2737) 	 vpsrlq		\$4,$T2,$T2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2738) 	vpaddq		$D1,$H2,$H2		# h1 -> h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2739) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2740) 	vpaddq		$D4,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2741) 	vpsllq		\$2,$D4,$D4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2742) 	 vpsrlq		\$26,$T0,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2743) 	 vpsrlq		\$40,$T4,$T4		# 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2744) 	vpaddq		$D4,$H0,$H0		# h4 -> h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2745) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2746) 	vpsrlq		\$26,$H2,$D2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2747) 	vpand		$MASK,$H2,$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2748) 	 vpand		$MASK,$T2,$T2		# 2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2749) 	 vpand		$MASK,$T0,$T0		# 0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2750) 	vpaddq		$D2,$H3,$H3		# h2 -> h3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2751) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2752) 	vpsrlq		\$26,$H0,$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2753) 	vpand		$MASK,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2754) 	 vpaddq		$H2,$T2,$H2		# accumulate input for .Ltail_avx2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2755) 	 vpand		$MASK,$T1,$T1		# 1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2756) 	vpaddq		$D0,$H1,$H1		# h0 -> h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2757) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2758) 	vpsrlq		\$26,$H3,$D3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2759) 	vpand		$MASK,$H3,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2760) 	 vpand		$MASK,$T3,$T3		# 3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2761) 	 vpor		32(%rcx),$T4,$T4	# padbit, yes, always
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2762) 	vpaddq		$D3,$H4,$H4		# h3 -> h4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2763) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2764) 	lea		0x90(%rsp),%rax		# size optimization for .Ltail_avx2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2765) 	add		\$64,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2766) 	jnz		.Ltail_avx2$suffix
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2767) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2768) 	vpsubq		$T2,$H2,$H2		# undo input accumulation
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2769) 	vmovd		%x#$H0,`4*0-48-64`($ctx)# save partially reduced
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2770) 	vmovd		%x#$H1,`4*1-48-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2771) 	vmovd		%x#$H2,`4*2-48-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2772) 	vmovd		%x#$H3,`4*3-48-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2773) 	vmovd		%x#$H4,`4*4-48-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2774) 	vzeroall
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2775) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2776) $code.=<<___	if ($win64);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2777) 	movdqa		-0xb0(%r10),%xmm6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2778) 	movdqa		-0xa0(%r10),%xmm7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2779) 	movdqa		-0x90(%r10),%xmm8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2780) 	movdqa		-0x80(%r10),%xmm9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2781) 	movdqa		-0x70(%r10),%xmm10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2782) 	movdqa		-0x60(%r10),%xmm11
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2783) 	movdqa		-0x50(%r10),%xmm12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2784) 	movdqa		-0x40(%r10),%xmm13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2785) 	movdqa		-0x30(%r10),%xmm14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2786) 	movdqa		-0x20(%r10),%xmm15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2787) 	lea		-8(%r10),%rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2788) .Ldo_avx512_epilogue:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2789) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2790) $code.=<<___	if (!$win64);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2791) 	lea		-8(%r10),%rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2792) .cfi_def_cfa_register	%rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2793) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2794) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2795) 	ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2796) .cfi_endproc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2797) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2798) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2799) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2800) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2801) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2802) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2803) &declare_function("poly1305_blocks_avx2", 32, 4);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2804) poly1305_blocks_avxN(0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2805) &end_function("poly1305_blocks_avx2");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2806) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2807) #######################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2808) if ($avx>2) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2809) # On entry we have input length divisible by 64. But since inner loop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2810) # processes 128 bytes per iteration, cases when length is not divisible
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2811) # by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2812) # reason stack layout is kept identical to poly1305_blocks_avx2. If not
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2813) # for this tail, we wouldn't have to even allocate stack frame...
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2814) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2815) if($kernel) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2816) 	$code .= "#ifdef CONFIG_AS_AVX512\n";
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2817) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2818) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2819) &declare_function("poly1305_blocks_avx512", 32, 4);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2820) poly1305_blocks_avxN(1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2821) &end_function("poly1305_blocks_avx512");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2822) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2823) if ($kernel) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2824) 	$code .= "#endif\n";
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2825) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2826) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2827) if (!$kernel && $avx>3) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2828) ########################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2829) # VPMADD52 version using 2^44 radix.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2830) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2831) # One can argue that base 2^52 would be more natural. Well, even though
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2832) # some operations would be more natural, one has to recognize couple of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2833) # things. Base 2^52 doesn't provide advantage over base 2^44 if you look
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2834) # at amount of multiply-n-accumulate operations. Secondly, it makes it
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2835) # impossible to pre-compute multiples of 5 [referred to as s[]/sN in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2836) # reference implementations], which means that more such operations
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2837) # would have to be performed in inner loop, which in turn makes critical
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2838) # path longer. In other words, even though base 2^44 reduction might
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2839) # look less elegant, overall critical path is actually shorter...
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2840) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2841) ########################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2842) # Layout of opaque area is following.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2843) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2844) #	unsigned __int64 h[3];		# current hash value base 2^44
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2845) #	unsigned __int64 s[2];		# key value*20 base 2^44
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2846) #	unsigned __int64 r[3];		# key value base 2^44
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2847) #	struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2848) #					# r^n positions reflect
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2849) #					# placement in register, not
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2850) #					# memory, R[3] is R[1]*20
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2851) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2852) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2853) .type	poly1305_init_base2_44,\@function,3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2854) .align	32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2855) poly1305_init_base2_44:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2856) 	xor	%eax,%eax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2857) 	mov	%rax,0($ctx)		# initialize hash value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2858) 	mov	%rax,8($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2859) 	mov	%rax,16($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2860) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2861) .Linit_base2_44:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2862) 	lea	poly1305_blocks_vpmadd52(%rip),%r10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2863) 	lea	poly1305_emit_base2_44(%rip),%r11
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2864) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2865) 	mov	\$0x0ffffffc0fffffff,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2866) 	mov	\$0x0ffffffc0ffffffc,%rcx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2867) 	and	0($inp),%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2868) 	mov	\$0x00000fffffffffff,%r8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2869) 	and	8($inp),%rcx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2870) 	mov	\$0x00000fffffffffff,%r9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2871) 	and	%rax,%r8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2872) 	shrd	\$44,%rcx,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2873) 	mov	%r8,40($ctx)		# r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2874) 	and	%r9,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2875) 	shr	\$24,%rcx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2876) 	mov	%rax,48($ctx)		# r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2877) 	lea	(%rax,%rax,4),%rax	# *5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2878) 	mov	%rcx,56($ctx)		# r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2879) 	shl	\$2,%rax		# magic <<2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2880) 	lea	(%rcx,%rcx,4),%rcx	# *5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2881) 	shl	\$2,%rcx		# magic <<2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2882) 	mov	%rax,24($ctx)		# s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2883) 	mov	%rcx,32($ctx)		# s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2884) 	movq	\$-1,64($ctx)		# write impossible value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2885) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2886) $code.=<<___	if ($flavour !~ /elf32/);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2887) 	mov	%r10,0(%rdx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2888) 	mov	%r11,8(%rdx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2889) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2890) $code.=<<___	if ($flavour =~ /elf32/);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2891) 	mov	%r10d,0(%rdx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2892) 	mov	%r11d,4(%rdx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2893) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2894) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2895) 	mov	\$1,%eax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2896) 	ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2897) .size	poly1305_init_base2_44,.-poly1305_init_base2_44
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2898) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2899) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2900) my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2901) my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2902) my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2903) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2904) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2905) .type	poly1305_blocks_vpmadd52,\@function,4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2906) .align	32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2907) poly1305_blocks_vpmadd52:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2908) 	shr	\$4,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2909) 	jz	.Lno_data_vpmadd52		# too short
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2910) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2911) 	shl	\$40,$padbit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2912) 	mov	64($ctx),%r8			# peek on power of the key
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2913) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2914) 	# if powers of the key are not calculated yet, process up to 3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2915) 	# blocks with this single-block subroutine, otherwise ensure that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2916) 	# length is divisible by 2 blocks and pass the rest down to next
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2917) 	# subroutine...
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2918) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2919) 	mov	\$3,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2920) 	mov	\$1,%r10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2921) 	cmp	\$4,$len			# is input long
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2922) 	cmovae	%r10,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2923) 	test	%r8,%r8				# is power value impossible?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2924) 	cmovns	%r10,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2925) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2926) 	and	$len,%rax			# is input of favourable length?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2927) 	jz	.Lblocks_vpmadd52_4x
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2928) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2929) 	sub		%rax,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2930) 	mov		\$7,%r10d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2931) 	mov		\$1,%r11d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2932) 	kmovw		%r10d,%k7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2933) 	lea		.L2_44_inp_permd(%rip),%r10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2934) 	kmovw		%r11d,%k1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2935) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2936) 	vmovq		$padbit,%x#$PAD
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2937) 	vmovdqa64	0(%r10),$inp_permd	# .L2_44_inp_permd
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2938) 	vmovdqa64	32(%r10),$inp_shift	# .L2_44_inp_shift
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2939) 	vpermq		\$0xcf,$PAD,$PAD
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2940) 	vmovdqa64	64(%r10),$reduc_mask	# .L2_44_mask
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2941) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2942) 	vmovdqu64	0($ctx),${Dlo}{%k7}{z}		# load hash value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2943) 	vmovdqu64	40($ctx),${r2r1r0}{%k7}{z}	# load keys
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2944) 	vmovdqu64	32($ctx),${r1r0s2}{%k7}{z}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2945) 	vmovdqu64	24($ctx),${r0s2s1}{%k7}{z}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2946) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2947) 	vmovdqa64	96(%r10),$reduc_rght	# .L2_44_shift_rgt
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2948) 	vmovdqa64	128(%r10),$reduc_left	# .L2_44_shift_lft
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2949) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2950) 	jmp		.Loop_vpmadd52
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2951) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2952) .align	32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2953) .Loop_vpmadd52:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2954) 	vmovdqu32	0($inp),%x#$T0		# load input as ----3210
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2955) 	lea		16($inp),$inp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2956) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2957) 	vpermd		$T0,$inp_permd,$T0	# ----3210 -> --322110
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2958) 	vpsrlvq		$inp_shift,$T0,$T0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2959) 	vpandq		$reduc_mask,$T0,$T0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2960) 	vporq		$PAD,$T0,$T0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2961) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2962) 	vpaddq		$T0,$Dlo,$Dlo		# accumulate input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2963) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2964) 	vpermq		\$0,$Dlo,${H0}{%k7}{z}	# smash hash value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2965) 	vpermq		\$0b01010101,$Dlo,${H1}{%k7}{z}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2966) 	vpermq		\$0b10101010,$Dlo,${H2}{%k7}{z}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2967) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2968) 	vpxord		$Dlo,$Dlo,$Dlo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2969) 	vpxord		$Dhi,$Dhi,$Dhi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2970) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2971) 	vpmadd52luq	$r2r1r0,$H0,$Dlo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2972) 	vpmadd52huq	$r2r1r0,$H0,$Dhi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2973) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2974) 	vpmadd52luq	$r1r0s2,$H1,$Dlo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2975) 	vpmadd52huq	$r1r0s2,$H1,$Dhi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2976) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2977) 	vpmadd52luq	$r0s2s1,$H2,$Dlo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2978) 	vpmadd52huq	$r0s2s1,$H2,$Dhi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2979) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2980) 	vpsrlvq		$reduc_rght,$Dlo,$T0	# 0 in topmost qword
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2981) 	vpsllvq		$reduc_left,$Dhi,$Dhi	# 0 in topmost qword
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2982) 	vpandq		$reduc_mask,$Dlo,$Dlo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2983) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2984) 	vpaddq		$T0,$Dhi,$Dhi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2985) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2986) 	vpermq		\$0b10010011,$Dhi,$Dhi	# 0 in lowest qword
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2987) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2988) 	vpaddq		$Dhi,$Dlo,$Dlo		# note topmost qword :-)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2989) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2990) 	vpsrlvq		$reduc_rght,$Dlo,$T0	# 0 in topmost word
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2991) 	vpandq		$reduc_mask,$Dlo,$Dlo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2992) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2993) 	vpermq		\$0b10010011,$T0,$T0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2994) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2995) 	vpaddq		$T0,$Dlo,$Dlo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2996) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2997) 	vpermq		\$0b10010011,$Dlo,${T0}{%k1}{z}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2998) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2999) 	vpaddq		$T0,$Dlo,$Dlo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3000) 	vpsllq		\$2,$T0,$T0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3001) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3002) 	vpaddq		$T0,$Dlo,$Dlo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3003) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3004) 	dec		%rax			# len-=16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3005) 	jnz		.Loop_vpmadd52
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3006) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3007) 	vmovdqu64	$Dlo,0($ctx){%k7}	# store hash value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3008) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3009) 	test		$len,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3010) 	jnz		.Lblocks_vpmadd52_4x
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3011) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3012) .Lno_data_vpmadd52:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3013) 	ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3014) .size	poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3015) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3016) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3017) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3018) ########################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3019) # As implied by its name 4x subroutine processes 4 blocks in parallel
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3020) # (but handles even 4*n+2 blocks lengths). It takes up to 4th key power
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3021) # and is handled in 256-bit %ymm registers.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3022) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3023) my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3024) my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3025) my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3026) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3027) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3028) .type	poly1305_blocks_vpmadd52_4x,\@function,4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3029) .align	32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3030) poly1305_blocks_vpmadd52_4x:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3031) 	shr	\$4,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3032) 	jz	.Lno_data_vpmadd52_4x		# too short
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3033) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3034) 	shl	\$40,$padbit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3035) 	mov	64($ctx),%r8			# peek on power of the key
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3036) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3037) .Lblocks_vpmadd52_4x:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3038) 	vpbroadcastq	$padbit,$PAD
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3039) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3040) 	vmovdqa64	.Lx_mask44(%rip),$mask44
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3041) 	mov		\$5,%eax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3042) 	vmovdqa64	.Lx_mask42(%rip),$mask42
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3043) 	kmovw		%eax,%k1		# used in 2x path
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3044) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3045) 	test		%r8,%r8			# is power value impossible?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3046) 	js		.Linit_vpmadd52		# if it is, then init R[4]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3047) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3048) 	vmovq		0($ctx),%x#$H0		# load current hash value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3049) 	vmovq		8($ctx),%x#$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3050) 	vmovq		16($ctx),%x#$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3051) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3052) 	test		\$3,$len		# is length 4*n+2?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3053) 	jnz		.Lblocks_vpmadd52_2x_do
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3054) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3055) .Lblocks_vpmadd52_4x_do:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3056) 	vpbroadcastq	64($ctx),$R0		# load 4th power of the key
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3057) 	vpbroadcastq	96($ctx),$R1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3058) 	vpbroadcastq	128($ctx),$R2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3059) 	vpbroadcastq	160($ctx),$S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3060) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3061) .Lblocks_vpmadd52_4x_key_loaded:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3062) 	vpsllq		\$2,$R2,$S2		# S2 = R2*5*4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3063) 	vpaddq		$R2,$S2,$S2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3064) 	vpsllq		\$2,$S2,$S2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3065) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3066) 	test		\$7,$len		# is len 8*n?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3067) 	jz		.Lblocks_vpmadd52_8x
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3068) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3069) 	vmovdqu64	16*0($inp),$T2		# load data
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3070) 	vmovdqu64	16*2($inp),$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3071) 	lea		16*4($inp),$inp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3072) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3073) 	vpunpcklqdq	$T3,$T2,$T1		# transpose data
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3074) 	vpunpckhqdq	$T3,$T2,$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3075) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3076) 	# at this point 64-bit lanes are ordered as 3-1-2-0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3077) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3078) 	vpsrlq		\$24,$T3,$T2		# splat the data
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3079) 	vporq		$PAD,$T2,$T2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3080) 	 vpaddq		$T2,$H2,$H2		# accumulate input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3081) 	vpandq		$mask44,$T1,$T0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3082) 	vpsrlq		\$44,$T1,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3083) 	vpsllq		\$20,$T3,$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3084) 	vporq		$T3,$T1,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3085) 	vpandq		$mask44,$T1,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3086) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3087) 	sub		\$4,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3088) 	jz		.Ltail_vpmadd52_4x
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3089) 	jmp		.Loop_vpmadd52_4x
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3090) 	ud2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3091) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3092) .align	32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3093) .Linit_vpmadd52:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3094) 	vmovq		24($ctx),%x#$S1		# load key
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3095) 	vmovq		56($ctx),%x#$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3096) 	vmovq		32($ctx),%x#$S2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3097) 	vmovq		40($ctx),%x#$R0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3098) 	vmovq		48($ctx),%x#$R1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3099) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3100) 	vmovdqa		$R0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3101) 	vmovdqa		$R1,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3102) 	vmovdqa		$H2,$R2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3103) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3104) 	mov		\$2,%eax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3105) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3106) .Lmul_init_vpmadd52:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3107) 	vpxorq		$D0lo,$D0lo,$D0lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3108) 	vpmadd52luq	$H2,$S1,$D0lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3109) 	vpxorq		$D0hi,$D0hi,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3110) 	vpmadd52huq	$H2,$S1,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3111) 	vpxorq		$D1lo,$D1lo,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3112) 	vpmadd52luq	$H2,$S2,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3113) 	vpxorq		$D1hi,$D1hi,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3114) 	vpmadd52huq	$H2,$S2,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3115) 	vpxorq		$D2lo,$D2lo,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3116) 	vpmadd52luq	$H2,$R0,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3117) 	vpxorq		$D2hi,$D2hi,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3118) 	vpmadd52huq	$H2,$R0,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3119) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3120) 	vpmadd52luq	$H0,$R0,$D0lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3121) 	vpmadd52huq	$H0,$R0,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3122) 	vpmadd52luq	$H0,$R1,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3123) 	vpmadd52huq	$H0,$R1,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3124) 	vpmadd52luq	$H0,$R2,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3125) 	vpmadd52huq	$H0,$R2,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3126) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3127) 	vpmadd52luq	$H1,$S2,$D0lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3128) 	vpmadd52huq	$H1,$S2,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3129) 	vpmadd52luq	$H1,$R0,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3130) 	vpmadd52huq	$H1,$R0,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3131) 	vpmadd52luq	$H1,$R1,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3132) 	vpmadd52huq	$H1,$R1,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3133) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3134) 	################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3135) 	# partial reduction
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3136) 	vpsrlq		\$44,$D0lo,$tmp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3137) 	vpsllq		\$8,$D0hi,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3138) 	vpandq		$mask44,$D0lo,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3139) 	vpaddq		$tmp,$D0hi,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3140) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3141) 	vpaddq		$D0hi,$D1lo,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3142) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3143) 	vpsrlq		\$44,$D1lo,$tmp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3144) 	vpsllq		\$8,$D1hi,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3145) 	vpandq		$mask44,$D1lo,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3146) 	vpaddq		$tmp,$D1hi,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3147) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3148) 	vpaddq		$D1hi,$D2lo,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3149) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3150) 	vpsrlq		\$42,$D2lo,$tmp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3151) 	vpsllq		\$10,$D2hi,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3152) 	vpandq		$mask42,$D2lo,$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3153) 	vpaddq		$tmp,$D2hi,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3154) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3155) 	vpaddq		$D2hi,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3156) 	vpsllq		\$2,$D2hi,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3157) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3158) 	vpaddq		$D2hi,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3159) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3160) 	vpsrlq		\$44,$H0,$tmp		# additional step
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3161) 	vpandq		$mask44,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3162) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3163) 	vpaddq		$tmp,$H1,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3164) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3165) 	dec		%eax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3166) 	jz		.Ldone_init_vpmadd52
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3167) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3168) 	vpunpcklqdq	$R1,$H1,$R1		# 1,2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3169) 	vpbroadcastq	%x#$H1,%x#$H1		# 2,2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3170) 	vpunpcklqdq	$R2,$H2,$R2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3171) 	vpbroadcastq	%x#$H2,%x#$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3172) 	vpunpcklqdq	$R0,$H0,$R0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3173) 	vpbroadcastq	%x#$H0,%x#$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3174) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3175) 	vpsllq		\$2,$R1,$S1		# S1 = R1*5*4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3176) 	vpsllq		\$2,$R2,$S2		# S2 = R2*5*4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3177) 	vpaddq		$R1,$S1,$S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3178) 	vpaddq		$R2,$S2,$S2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3179) 	vpsllq		\$2,$S1,$S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3180) 	vpsllq		\$2,$S2,$S2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3181) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3182) 	jmp		.Lmul_init_vpmadd52
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3183) 	ud2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3184) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3185) .align	32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3186) .Ldone_init_vpmadd52:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3187) 	vinserti128	\$1,%x#$R1,$H1,$R1	# 1,2,3,4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3188) 	vinserti128	\$1,%x#$R2,$H2,$R2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3189) 	vinserti128	\$1,%x#$R0,$H0,$R0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3190) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3191) 	vpermq		\$0b11011000,$R1,$R1	# 1,3,2,4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3192) 	vpermq		\$0b11011000,$R2,$R2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3193) 	vpermq		\$0b11011000,$R0,$R0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3194) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3195) 	vpsllq		\$2,$R1,$S1		# S1 = R1*5*4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3196) 	vpaddq		$R1,$S1,$S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3197) 	vpsllq		\$2,$S1,$S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3198) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3199) 	vmovq		0($ctx),%x#$H0		# load current hash value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3200) 	vmovq		8($ctx),%x#$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3201) 	vmovq		16($ctx),%x#$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3202) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3203) 	test		\$3,$len		# is length 4*n+2?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3204) 	jnz		.Ldone_init_vpmadd52_2x
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3205) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3206) 	vmovdqu64	$R0,64($ctx)		# save key powers
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3207) 	vpbroadcastq	%x#$R0,$R0		# broadcast 4th power
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3208) 	vmovdqu64	$R1,96($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3209) 	vpbroadcastq	%x#$R1,$R1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3210) 	vmovdqu64	$R2,128($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3211) 	vpbroadcastq	%x#$R2,$R2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3212) 	vmovdqu64	$S1,160($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3213) 	vpbroadcastq	%x#$S1,$S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3214) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3215) 	jmp		.Lblocks_vpmadd52_4x_key_loaded
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3216) 	ud2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3217) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3218) .align	32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3219) .Ldone_init_vpmadd52_2x:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3220) 	vmovdqu64	$R0,64($ctx)		# save key powers
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3221) 	vpsrldq		\$8,$R0,$R0		# 0-1-0-2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3222) 	vmovdqu64	$R1,96($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3223) 	vpsrldq		\$8,$R1,$R1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3224) 	vmovdqu64	$R2,128($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3225) 	vpsrldq		\$8,$R2,$R2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3226) 	vmovdqu64	$S1,160($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3227) 	vpsrldq		\$8,$S1,$S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3228) 	jmp		.Lblocks_vpmadd52_2x_key_loaded
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3229) 	ud2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3230) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3231) .align	32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3232) .Lblocks_vpmadd52_2x_do:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3233) 	vmovdqu64	128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3234) 	vmovdqu64	160+8($ctx),${S1}{%k1}{z}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3235) 	vmovdqu64	64+8($ctx),${R0}{%k1}{z}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3236) 	vmovdqu64	96+8($ctx),${R1}{%k1}{z}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3237) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3238) .Lblocks_vpmadd52_2x_key_loaded:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3239) 	vmovdqu64	16*0($inp),$T2		# load data
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3240) 	vpxorq		$T3,$T3,$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3241) 	lea		16*2($inp),$inp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3242) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3243) 	vpunpcklqdq	$T3,$T2,$T1		# transpose data
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3244) 	vpunpckhqdq	$T3,$T2,$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3245) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3246) 	# at this point 64-bit lanes are ordered as x-1-x-0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3247) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3248) 	vpsrlq		\$24,$T3,$T2		# splat the data
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3249) 	vporq		$PAD,$T2,$T2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3250) 	 vpaddq		$T2,$H2,$H2		# accumulate input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3251) 	vpandq		$mask44,$T1,$T0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3252) 	vpsrlq		\$44,$T1,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3253) 	vpsllq		\$20,$T3,$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3254) 	vporq		$T3,$T1,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3255) 	vpandq		$mask44,$T1,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3256) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3257) 	jmp		.Ltail_vpmadd52_2x
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3258) 	ud2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3259) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3260) .align	32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3261) .Loop_vpmadd52_4x:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3262) 	#vpaddq		$T2,$H2,$H2		# accumulate input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3263) 	vpaddq		$T0,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3264) 	vpaddq		$T1,$H1,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3265) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3266) 	vpxorq		$D0lo,$D0lo,$D0lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3267) 	vpmadd52luq	$H2,$S1,$D0lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3268) 	vpxorq		$D0hi,$D0hi,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3269) 	vpmadd52huq	$H2,$S1,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3270) 	vpxorq		$D1lo,$D1lo,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3271) 	vpmadd52luq	$H2,$S2,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3272) 	vpxorq		$D1hi,$D1hi,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3273) 	vpmadd52huq	$H2,$S2,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3274) 	vpxorq		$D2lo,$D2lo,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3275) 	vpmadd52luq	$H2,$R0,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3276) 	vpxorq		$D2hi,$D2hi,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3277) 	vpmadd52huq	$H2,$R0,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3278) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3279) 	 vmovdqu64	16*0($inp),$T2		# load data
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3280) 	 vmovdqu64	16*2($inp),$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3281) 	 lea		16*4($inp),$inp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3282) 	vpmadd52luq	$H0,$R0,$D0lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3283) 	vpmadd52huq	$H0,$R0,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3284) 	vpmadd52luq	$H0,$R1,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3285) 	vpmadd52huq	$H0,$R1,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3286) 	vpmadd52luq	$H0,$R2,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3287) 	vpmadd52huq	$H0,$R2,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3288) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3289) 	 vpunpcklqdq	$T3,$T2,$T1		# transpose data
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3290) 	 vpunpckhqdq	$T3,$T2,$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3291) 	vpmadd52luq	$H1,$S2,$D0lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3292) 	vpmadd52huq	$H1,$S2,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3293) 	vpmadd52luq	$H1,$R0,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3294) 	vpmadd52huq	$H1,$R0,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3295) 	vpmadd52luq	$H1,$R1,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3296) 	vpmadd52huq	$H1,$R1,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3297) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3298) 	################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3299) 	# partial reduction (interleaved with data splat)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3300) 	vpsrlq		\$44,$D0lo,$tmp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3301) 	vpsllq		\$8,$D0hi,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3302) 	vpandq		$mask44,$D0lo,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3303) 	vpaddq		$tmp,$D0hi,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3304) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3305) 	 vpsrlq		\$24,$T3,$T2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3306) 	 vporq		$PAD,$T2,$T2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3307) 	vpaddq		$D0hi,$D1lo,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3308) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3309) 	vpsrlq		\$44,$D1lo,$tmp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3310) 	vpsllq		\$8,$D1hi,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3311) 	vpandq		$mask44,$D1lo,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3312) 	vpaddq		$tmp,$D1hi,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3313) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3314) 	 vpandq		$mask44,$T1,$T0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3315) 	 vpsrlq		\$44,$T1,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3316) 	 vpsllq		\$20,$T3,$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3317) 	vpaddq		$D1hi,$D2lo,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3318) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3319) 	vpsrlq		\$42,$D2lo,$tmp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3320) 	vpsllq		\$10,$D2hi,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3321) 	vpandq		$mask42,$D2lo,$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3322) 	vpaddq		$tmp,$D2hi,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3323) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3324) 	  vpaddq	$T2,$H2,$H2		# accumulate input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3325) 	vpaddq		$D2hi,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3326) 	vpsllq		\$2,$D2hi,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3327) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3328) 	vpaddq		$D2hi,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3329) 	 vporq		$T3,$T1,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3330) 	 vpandq		$mask44,$T1,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3331) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3332) 	vpsrlq		\$44,$H0,$tmp		# additional step
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3333) 	vpandq		$mask44,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3334) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3335) 	vpaddq		$tmp,$H1,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3336) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3337) 	sub		\$4,$len		# len-=64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3338) 	jnz		.Loop_vpmadd52_4x
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3339) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3340) .Ltail_vpmadd52_4x:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3341) 	vmovdqu64	128($ctx),$R2		# load all key powers
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3342) 	vmovdqu64	160($ctx),$S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3343) 	vmovdqu64	64($ctx),$R0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3344) 	vmovdqu64	96($ctx),$R1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3345) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3346) .Ltail_vpmadd52_2x:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3347) 	vpsllq		\$2,$R2,$S2		# S2 = R2*5*4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3348) 	vpaddq		$R2,$S2,$S2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3349) 	vpsllq		\$2,$S2,$S2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3350) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3351) 	#vpaddq		$T2,$H2,$H2		# accumulate input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3352) 	vpaddq		$T0,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3353) 	vpaddq		$T1,$H1,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3354) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3355) 	vpxorq		$D0lo,$D0lo,$D0lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3356) 	vpmadd52luq	$H2,$S1,$D0lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3357) 	vpxorq		$D0hi,$D0hi,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3358) 	vpmadd52huq	$H2,$S1,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3359) 	vpxorq		$D1lo,$D1lo,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3360) 	vpmadd52luq	$H2,$S2,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3361) 	vpxorq		$D1hi,$D1hi,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3362) 	vpmadd52huq	$H2,$S2,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3363) 	vpxorq		$D2lo,$D2lo,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3364) 	vpmadd52luq	$H2,$R0,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3365) 	vpxorq		$D2hi,$D2hi,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3366) 	vpmadd52huq	$H2,$R0,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3367) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3368) 	vpmadd52luq	$H0,$R0,$D0lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3369) 	vpmadd52huq	$H0,$R0,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3370) 	vpmadd52luq	$H0,$R1,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3371) 	vpmadd52huq	$H0,$R1,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3372) 	vpmadd52luq	$H0,$R2,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3373) 	vpmadd52huq	$H0,$R2,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3374) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3375) 	vpmadd52luq	$H1,$S2,$D0lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3376) 	vpmadd52huq	$H1,$S2,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3377) 	vpmadd52luq	$H1,$R0,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3378) 	vpmadd52huq	$H1,$R0,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3379) 	vpmadd52luq	$H1,$R1,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3380) 	vpmadd52huq	$H1,$R1,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3381) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3382) 	################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3383) 	# horizontal addition
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3384) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3385) 	mov		\$1,%eax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3386) 	kmovw		%eax,%k1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3387) 	vpsrldq		\$8,$D0lo,$T0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3388) 	vpsrldq		\$8,$D0hi,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3389) 	vpsrldq		\$8,$D1lo,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3390) 	vpsrldq		\$8,$D1hi,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3391) 	vpaddq		$T0,$D0lo,$D0lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3392) 	vpaddq		$H0,$D0hi,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3393) 	vpsrldq		\$8,$D2lo,$T2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3394) 	vpsrldq		\$8,$D2hi,$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3395) 	vpaddq		$T1,$D1lo,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3396) 	vpaddq		$H1,$D1hi,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3397) 	 vpermq		\$0x2,$D0lo,$T0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3398) 	 vpermq		\$0x2,$D0hi,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3399) 	vpaddq		$T2,$D2lo,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3400) 	vpaddq		$H2,$D2hi,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3401) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3402) 	vpermq		\$0x2,$D1lo,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3403) 	vpermq		\$0x2,$D1hi,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3404) 	vpaddq		$T0,$D0lo,${D0lo}{%k1}{z}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3405) 	vpaddq		$H0,$D0hi,${D0hi}{%k1}{z}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3406) 	vpermq		\$0x2,$D2lo,$T2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3407) 	vpermq		\$0x2,$D2hi,$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3408) 	vpaddq		$T1,$D1lo,${D1lo}{%k1}{z}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3409) 	vpaddq		$H1,$D1hi,${D1hi}{%k1}{z}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3410) 	vpaddq		$T2,$D2lo,${D2lo}{%k1}{z}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3411) 	vpaddq		$H2,$D2hi,${D2hi}{%k1}{z}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3412) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3413) 	################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3414) 	# partial reduction
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3415) 	vpsrlq		\$44,$D0lo,$tmp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3416) 	vpsllq		\$8,$D0hi,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3417) 	vpandq		$mask44,$D0lo,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3418) 	vpaddq		$tmp,$D0hi,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3419) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3420) 	vpaddq		$D0hi,$D1lo,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3421) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3422) 	vpsrlq		\$44,$D1lo,$tmp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3423) 	vpsllq		\$8,$D1hi,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3424) 	vpandq		$mask44,$D1lo,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3425) 	vpaddq		$tmp,$D1hi,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3426) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3427) 	vpaddq		$D1hi,$D2lo,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3428) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3429) 	vpsrlq		\$42,$D2lo,$tmp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3430) 	vpsllq		\$10,$D2hi,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3431) 	vpandq		$mask42,$D2lo,$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3432) 	vpaddq		$tmp,$D2hi,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3433) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3434) 	vpaddq		$D2hi,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3435) 	vpsllq		\$2,$D2hi,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3436) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3437) 	vpaddq		$D2hi,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3438) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3439) 	vpsrlq		\$44,$H0,$tmp		# additional step
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3440) 	vpandq		$mask44,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3441) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3442) 	vpaddq		$tmp,$H1,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3443) 						# at this point $len is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3444) 						# either 4*n+2 or 0...
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3445) 	sub		\$2,$len		# len-=32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3446) 	ja		.Lblocks_vpmadd52_4x_do
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3447) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3448) 	vmovq		%x#$H0,0($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3449) 	vmovq		%x#$H1,8($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3450) 	vmovq		%x#$H2,16($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3451) 	vzeroall
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3452) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3453) .Lno_data_vpmadd52_4x:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3454) 	ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3455) .size	poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3456) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3457) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3458) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3459) ########################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3460) # As implied by its name 8x subroutine processes 8 blocks in parallel...
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3461) # This is intermediate version, as it's used only in cases when input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3462) # length is either 8*n, 8*n+1 or 8*n+2...
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3463) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3464) my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3465) my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3466) my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3467) my ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3468) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3469) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3470) .type	poly1305_blocks_vpmadd52_8x,\@function,4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3471) .align	32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3472) poly1305_blocks_vpmadd52_8x:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3473) 	shr	\$4,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3474) 	jz	.Lno_data_vpmadd52_8x		# too short
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3475) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3476) 	shl	\$40,$padbit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3477) 	mov	64($ctx),%r8			# peek on power of the key
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3478) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3479) 	vmovdqa64	.Lx_mask44(%rip),$mask44
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3480) 	vmovdqa64	.Lx_mask42(%rip),$mask42
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3481) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3482) 	test	%r8,%r8				# is power value impossible?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3483) 	js	.Linit_vpmadd52			# if it is, then init R[4]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3484) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3485) 	vmovq	0($ctx),%x#$H0			# load current hash value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3486) 	vmovq	8($ctx),%x#$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3487) 	vmovq	16($ctx),%x#$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3488) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3489) .Lblocks_vpmadd52_8x:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3490) 	################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3491) 	# fist we calculate more key powers
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3492) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3493) 	vmovdqu64	128($ctx),$R2		# load 1-3-2-4 powers
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3494) 	vmovdqu64	160($ctx),$S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3495) 	vmovdqu64	64($ctx),$R0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3496) 	vmovdqu64	96($ctx),$R1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3497) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3498) 	vpsllq		\$2,$R2,$S2		# S2 = R2*5*4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3499) 	vpaddq		$R2,$S2,$S2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3500) 	vpsllq		\$2,$S2,$S2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3501) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3502) 	vpbroadcastq	%x#$R2,$RR2		# broadcast 4th power
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3503) 	vpbroadcastq	%x#$R0,$RR0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3504) 	vpbroadcastq	%x#$R1,$RR1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3505) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3506) 	vpxorq		$D0lo,$D0lo,$D0lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3507) 	vpmadd52luq	$RR2,$S1,$D0lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3508) 	vpxorq		$D0hi,$D0hi,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3509) 	vpmadd52huq	$RR2,$S1,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3510) 	vpxorq		$D1lo,$D1lo,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3511) 	vpmadd52luq	$RR2,$S2,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3512) 	vpxorq		$D1hi,$D1hi,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3513) 	vpmadd52huq	$RR2,$S2,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3514) 	vpxorq		$D2lo,$D2lo,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3515) 	vpmadd52luq	$RR2,$R0,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3516) 	vpxorq		$D2hi,$D2hi,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3517) 	vpmadd52huq	$RR2,$R0,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3518) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3519) 	vpmadd52luq	$RR0,$R0,$D0lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3520) 	vpmadd52huq	$RR0,$R0,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3521) 	vpmadd52luq	$RR0,$R1,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3522) 	vpmadd52huq	$RR0,$R1,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3523) 	vpmadd52luq	$RR0,$R2,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3524) 	vpmadd52huq	$RR0,$R2,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3525) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3526) 	vpmadd52luq	$RR1,$S2,$D0lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3527) 	vpmadd52huq	$RR1,$S2,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3528) 	vpmadd52luq	$RR1,$R0,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3529) 	vpmadd52huq	$RR1,$R0,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3530) 	vpmadd52luq	$RR1,$R1,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3531) 	vpmadd52huq	$RR1,$R1,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3532) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3533) 	################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3534) 	# partial reduction
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3535) 	vpsrlq		\$44,$D0lo,$tmp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3536) 	vpsllq		\$8,$D0hi,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3537) 	vpandq		$mask44,$D0lo,$RR0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3538) 	vpaddq		$tmp,$D0hi,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3539) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3540) 	vpaddq		$D0hi,$D1lo,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3541) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3542) 	vpsrlq		\$44,$D1lo,$tmp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3543) 	vpsllq		\$8,$D1hi,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3544) 	vpandq		$mask44,$D1lo,$RR1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3545) 	vpaddq		$tmp,$D1hi,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3546) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3547) 	vpaddq		$D1hi,$D2lo,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3548) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3549) 	vpsrlq		\$42,$D2lo,$tmp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3550) 	vpsllq		\$10,$D2hi,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3551) 	vpandq		$mask42,$D2lo,$RR2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3552) 	vpaddq		$tmp,$D2hi,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3553) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3554) 	vpaddq		$D2hi,$RR0,$RR0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3555) 	vpsllq		\$2,$D2hi,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3556) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3557) 	vpaddq		$D2hi,$RR0,$RR0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3558) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3559) 	vpsrlq		\$44,$RR0,$tmp		# additional step
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3560) 	vpandq		$mask44,$RR0,$RR0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3561) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3562) 	vpaddq		$tmp,$RR1,$RR1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3563) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3564) 	################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3565) 	# At this point Rx holds 1324 powers, RRx - 5768, and the goal
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3566) 	# is 15263748, which reflects how data is loaded...
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3567) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3568) 	vpunpcklqdq	$R2,$RR2,$T2		# 3748
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3569) 	vpunpckhqdq	$R2,$RR2,$R2		# 1526
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3570) 	vpunpcklqdq	$R0,$RR0,$T0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3571) 	vpunpckhqdq	$R0,$RR0,$R0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3572) 	vpunpcklqdq	$R1,$RR1,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3573) 	vpunpckhqdq	$R1,$RR1,$R1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3574) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3575) ######## switch to %zmm
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3576) map(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3577) map(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3578) map(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3579) map(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3580) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3581) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3582) 	vshufi64x2	\$0x44,$R2,$T2,$RR2	# 15263748
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3583) 	vshufi64x2	\$0x44,$R0,$T0,$RR0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3584) 	vshufi64x2	\$0x44,$R1,$T1,$RR1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3585) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3586) 	vmovdqu64	16*0($inp),$T2		# load data
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3587) 	vmovdqu64	16*4($inp),$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3588) 	lea		16*8($inp),$inp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3589) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3590) 	vpsllq		\$2,$RR2,$SS2		# S2 = R2*5*4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3591) 	vpsllq		\$2,$RR1,$SS1		# S1 = R1*5*4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3592) 	vpaddq		$RR2,$SS2,$SS2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3593) 	vpaddq		$RR1,$SS1,$SS1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3594) 	vpsllq		\$2,$SS2,$SS2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3595) 	vpsllq		\$2,$SS1,$SS1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3596) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3597) 	vpbroadcastq	$padbit,$PAD
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3598) 	vpbroadcastq	%x#$mask44,$mask44
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3599) 	vpbroadcastq	%x#$mask42,$mask42
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3600) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3601) 	vpbroadcastq	%x#$SS1,$S1		# broadcast 8th power
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3602) 	vpbroadcastq	%x#$SS2,$S2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3603) 	vpbroadcastq	%x#$RR0,$R0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3604) 	vpbroadcastq	%x#$RR1,$R1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3605) 	vpbroadcastq	%x#$RR2,$R2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3606) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3607) 	vpunpcklqdq	$T3,$T2,$T1		# transpose data
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3608) 	vpunpckhqdq	$T3,$T2,$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3609) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3610) 	# at this point 64-bit lanes are ordered as 73625140
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3611) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3612) 	vpsrlq		\$24,$T3,$T2		# splat the data
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3613) 	vporq		$PAD,$T2,$T2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3614) 	 vpaddq		$T2,$H2,$H2		# accumulate input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3615) 	vpandq		$mask44,$T1,$T0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3616) 	vpsrlq		\$44,$T1,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3617) 	vpsllq		\$20,$T3,$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3618) 	vporq		$T3,$T1,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3619) 	vpandq		$mask44,$T1,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3620) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3621) 	sub		\$8,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3622) 	jz		.Ltail_vpmadd52_8x
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3623) 	jmp		.Loop_vpmadd52_8x
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3624) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3625) .align	32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3626) .Loop_vpmadd52_8x:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3627) 	#vpaddq		$T2,$H2,$H2		# accumulate input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3628) 	vpaddq		$T0,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3629) 	vpaddq		$T1,$H1,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3630) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3631) 	vpxorq		$D0lo,$D0lo,$D0lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3632) 	vpmadd52luq	$H2,$S1,$D0lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3633) 	vpxorq		$D0hi,$D0hi,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3634) 	vpmadd52huq	$H2,$S1,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3635) 	vpxorq		$D1lo,$D1lo,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3636) 	vpmadd52luq	$H2,$S2,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3637) 	vpxorq		$D1hi,$D1hi,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3638) 	vpmadd52huq	$H2,$S2,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3639) 	vpxorq		$D2lo,$D2lo,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3640) 	vpmadd52luq	$H2,$R0,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3641) 	vpxorq		$D2hi,$D2hi,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3642) 	vpmadd52huq	$H2,$R0,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3643) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3644) 	 vmovdqu64	16*0($inp),$T2		# load data
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3645) 	 vmovdqu64	16*4($inp),$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3646) 	 lea		16*8($inp),$inp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3647) 	vpmadd52luq	$H0,$R0,$D0lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3648) 	vpmadd52huq	$H0,$R0,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3649) 	vpmadd52luq	$H0,$R1,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3650) 	vpmadd52huq	$H0,$R1,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3651) 	vpmadd52luq	$H0,$R2,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3652) 	vpmadd52huq	$H0,$R2,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3653) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3654) 	 vpunpcklqdq	$T3,$T2,$T1		# transpose data
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3655) 	 vpunpckhqdq	$T3,$T2,$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3656) 	vpmadd52luq	$H1,$S2,$D0lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3657) 	vpmadd52huq	$H1,$S2,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3658) 	vpmadd52luq	$H1,$R0,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3659) 	vpmadd52huq	$H1,$R0,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3660) 	vpmadd52luq	$H1,$R1,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3661) 	vpmadd52huq	$H1,$R1,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3662) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3663) 	################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3664) 	# partial reduction (interleaved with data splat)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3665) 	vpsrlq		\$44,$D0lo,$tmp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3666) 	vpsllq		\$8,$D0hi,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3667) 	vpandq		$mask44,$D0lo,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3668) 	vpaddq		$tmp,$D0hi,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3669) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3670) 	 vpsrlq		\$24,$T3,$T2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3671) 	 vporq		$PAD,$T2,$T2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3672) 	vpaddq		$D0hi,$D1lo,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3673) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3674) 	vpsrlq		\$44,$D1lo,$tmp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3675) 	vpsllq		\$8,$D1hi,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3676) 	vpandq		$mask44,$D1lo,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3677) 	vpaddq		$tmp,$D1hi,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3678) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3679) 	 vpandq		$mask44,$T1,$T0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3680) 	 vpsrlq		\$44,$T1,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3681) 	 vpsllq		\$20,$T3,$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3682) 	vpaddq		$D1hi,$D2lo,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3683) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3684) 	vpsrlq		\$42,$D2lo,$tmp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3685) 	vpsllq		\$10,$D2hi,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3686) 	vpandq		$mask42,$D2lo,$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3687) 	vpaddq		$tmp,$D2hi,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3688) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3689) 	  vpaddq	$T2,$H2,$H2		# accumulate input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3690) 	vpaddq		$D2hi,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3691) 	vpsllq		\$2,$D2hi,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3692) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3693) 	vpaddq		$D2hi,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3694) 	 vporq		$T3,$T1,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3695) 	 vpandq		$mask44,$T1,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3696) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3697) 	vpsrlq		\$44,$H0,$tmp		# additional step
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3698) 	vpandq		$mask44,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3699) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3700) 	vpaddq		$tmp,$H1,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3701) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3702) 	sub		\$8,$len		# len-=128
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3703) 	jnz		.Loop_vpmadd52_8x
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3704) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3705) .Ltail_vpmadd52_8x:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3706) 	#vpaddq		$T2,$H2,$H2		# accumulate input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3707) 	vpaddq		$T0,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3708) 	vpaddq		$T1,$H1,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3709) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3710) 	vpxorq		$D0lo,$D0lo,$D0lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3711) 	vpmadd52luq	$H2,$SS1,$D0lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3712) 	vpxorq		$D0hi,$D0hi,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3713) 	vpmadd52huq	$H2,$SS1,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3714) 	vpxorq		$D1lo,$D1lo,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3715) 	vpmadd52luq	$H2,$SS2,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3716) 	vpxorq		$D1hi,$D1hi,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3717) 	vpmadd52huq	$H2,$SS2,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3718) 	vpxorq		$D2lo,$D2lo,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3719) 	vpmadd52luq	$H2,$RR0,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3720) 	vpxorq		$D2hi,$D2hi,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3721) 	vpmadd52huq	$H2,$RR0,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3722) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3723) 	vpmadd52luq	$H0,$RR0,$D0lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3724) 	vpmadd52huq	$H0,$RR0,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3725) 	vpmadd52luq	$H0,$RR1,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3726) 	vpmadd52huq	$H0,$RR1,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3727) 	vpmadd52luq	$H0,$RR2,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3728) 	vpmadd52huq	$H0,$RR2,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3729) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3730) 	vpmadd52luq	$H1,$SS2,$D0lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3731) 	vpmadd52huq	$H1,$SS2,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3732) 	vpmadd52luq	$H1,$RR0,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3733) 	vpmadd52huq	$H1,$RR0,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3734) 	vpmadd52luq	$H1,$RR1,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3735) 	vpmadd52huq	$H1,$RR1,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3736) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3737) 	################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3738) 	# horizontal addition
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3739) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3740) 	mov		\$1,%eax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3741) 	kmovw		%eax,%k1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3742) 	vpsrldq		\$8,$D0lo,$T0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3743) 	vpsrldq		\$8,$D0hi,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3744) 	vpsrldq		\$8,$D1lo,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3745) 	vpsrldq		\$8,$D1hi,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3746) 	vpaddq		$T0,$D0lo,$D0lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3747) 	vpaddq		$H0,$D0hi,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3748) 	vpsrldq		\$8,$D2lo,$T2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3749) 	vpsrldq		\$8,$D2hi,$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3750) 	vpaddq		$T1,$D1lo,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3751) 	vpaddq		$H1,$D1hi,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3752) 	 vpermq		\$0x2,$D0lo,$T0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3753) 	 vpermq		\$0x2,$D0hi,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3754) 	vpaddq		$T2,$D2lo,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3755) 	vpaddq		$H2,$D2hi,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3756) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3757) 	vpermq		\$0x2,$D1lo,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3758) 	vpermq		\$0x2,$D1hi,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3759) 	vpaddq		$T0,$D0lo,$D0lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3760) 	vpaddq		$H0,$D0hi,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3761) 	vpermq		\$0x2,$D2lo,$T2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3762) 	vpermq		\$0x2,$D2hi,$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3763) 	vpaddq		$T1,$D1lo,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3764) 	vpaddq		$H1,$D1hi,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3765) 	 vextracti64x4	\$1,$D0lo,%y#$T0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3766) 	 vextracti64x4	\$1,$D0hi,%y#$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3767) 	vpaddq		$T2,$D2lo,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3768) 	vpaddq		$H2,$D2hi,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3769) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3770) 	vextracti64x4	\$1,$D1lo,%y#$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3771) 	vextracti64x4	\$1,$D1hi,%y#$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3772) 	vextracti64x4	\$1,$D2lo,%y#$T2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3773) 	vextracti64x4	\$1,$D2hi,%y#$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3774) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3775) ######## switch back to %ymm
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3776) map(s/%z/%y/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3777) map(s/%z/%y/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3778) map(s/%z/%y/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3779) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3780) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3781) 	vpaddq		$T0,$D0lo,${D0lo}{%k1}{z}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3782) 	vpaddq		$H0,$D0hi,${D0hi}{%k1}{z}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3783) 	vpaddq		$T1,$D1lo,${D1lo}{%k1}{z}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3784) 	vpaddq		$H1,$D1hi,${D1hi}{%k1}{z}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3785) 	vpaddq		$T2,$D2lo,${D2lo}{%k1}{z}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3786) 	vpaddq		$H2,$D2hi,${D2hi}{%k1}{z}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3787) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3788) 	################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3789) 	# partial reduction
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3790) 	vpsrlq		\$44,$D0lo,$tmp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3791) 	vpsllq		\$8,$D0hi,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3792) 	vpandq		$mask44,$D0lo,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3793) 	vpaddq		$tmp,$D0hi,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3794) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3795) 	vpaddq		$D0hi,$D1lo,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3796) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3797) 	vpsrlq		\$44,$D1lo,$tmp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3798) 	vpsllq		\$8,$D1hi,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3799) 	vpandq		$mask44,$D1lo,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3800) 	vpaddq		$tmp,$D1hi,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3801) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3802) 	vpaddq		$D1hi,$D2lo,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3803) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3804) 	vpsrlq		\$42,$D2lo,$tmp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3805) 	vpsllq		\$10,$D2hi,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3806) 	vpandq		$mask42,$D2lo,$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3807) 	vpaddq		$tmp,$D2hi,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3808) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3809) 	vpaddq		$D2hi,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3810) 	vpsllq		\$2,$D2hi,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3811) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3812) 	vpaddq		$D2hi,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3813) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3814) 	vpsrlq		\$44,$H0,$tmp		# additional step
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3815) 	vpandq		$mask44,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3816) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3817) 	vpaddq		$tmp,$H1,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3818) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3819) 	################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3820) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3821) 	vmovq		%x#$H0,0($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3822) 	vmovq		%x#$H1,8($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3823) 	vmovq		%x#$H2,16($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3824) 	vzeroall
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3825) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3826) .Lno_data_vpmadd52_8x:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3827) 	ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3828) .size	poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3829) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3830) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3831) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3832) .type	poly1305_emit_base2_44,\@function,3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3833) .align	32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3834) poly1305_emit_base2_44:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3835) 	mov	0($ctx),%r8	# load hash value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3836) 	mov	8($ctx),%r9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3837) 	mov	16($ctx),%r10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3838) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3839) 	mov	%r9,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3840) 	shr	\$20,%r9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3841) 	shl	\$44,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3842) 	mov	%r10,%rcx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3843) 	shr	\$40,%r10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3844) 	shl	\$24,%rcx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3845) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3846) 	add	%rax,%r8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3847) 	adc	%rcx,%r9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3848) 	adc	\$0,%r10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3849) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3850) 	mov	%r8,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3851) 	add	\$5,%r8		# compare to modulus
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3852) 	mov	%r9,%rcx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3853) 	adc	\$0,%r9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3854) 	adc	\$0,%r10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3855) 	shr	\$2,%r10	# did 130-bit value overflow?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3856) 	cmovnz	%r8,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3857) 	cmovnz	%r9,%rcx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3858) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3859) 	add	0($nonce),%rax	# accumulate nonce
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3860) 	adc	8($nonce),%rcx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3861) 	mov	%rax,0($mac)	# write result
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3862) 	mov	%rcx,8($mac)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3863) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3864) 	ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3865) .size	poly1305_emit_base2_44,.-poly1305_emit_base2_44
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3866) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3867) }	}	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3868) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3869) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3870) if (!$kernel)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3871) {	# chacha20-poly1305 helpers
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3872) my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") :  # Win64 order
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3873)                                   ("%rdi","%rsi","%rdx","%rcx");  # Unix order
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3874) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3875) .globl	xor128_encrypt_n_pad
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3876) .type	xor128_encrypt_n_pad,\@abi-omnipotent
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3877) .align	16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3878) xor128_encrypt_n_pad:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3879) 	sub	$otp,$inp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3880) 	sub	$otp,$out
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3881) 	mov	$len,%r10		# put len aside
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3882) 	shr	\$4,$len		# len / 16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3883) 	jz	.Ltail_enc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3884) 	nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3885) .Loop_enc_xmm:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3886) 	movdqu	($inp,$otp),%xmm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3887) 	pxor	($otp),%xmm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3888) 	movdqu	%xmm0,($out,$otp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3889) 	movdqa	%xmm0,($otp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3890) 	lea	16($otp),$otp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3891) 	dec	$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3892) 	jnz	.Loop_enc_xmm
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3893) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3894) 	and	\$15,%r10		# len % 16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3895) 	jz	.Ldone_enc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3896) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3897) .Ltail_enc:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3898) 	mov	\$16,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3899) 	sub	%r10,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3900) 	xor	%eax,%eax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3901) .Loop_enc_byte:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3902) 	mov	($inp,$otp),%al
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3903) 	xor	($otp),%al
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3904) 	mov	%al,($out,$otp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3905) 	mov	%al,($otp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3906) 	lea	1($otp),$otp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3907) 	dec	%r10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3908) 	jnz	.Loop_enc_byte
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3909) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3910) 	xor	%eax,%eax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3911) .Loop_enc_pad:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3912) 	mov	%al,($otp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3913) 	lea	1($otp),$otp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3914) 	dec	$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3915) 	jnz	.Loop_enc_pad
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3916) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3917) .Ldone_enc:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3918) 	mov	$otp,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3919) 	ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3920) .size	xor128_encrypt_n_pad,.-xor128_encrypt_n_pad
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3921) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3922) .globl	xor128_decrypt_n_pad
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3923) .type	xor128_decrypt_n_pad,\@abi-omnipotent
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3924) .align	16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3925) xor128_decrypt_n_pad:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3926) 	sub	$otp,$inp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3927) 	sub	$otp,$out
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3928) 	mov	$len,%r10		# put len aside
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3929) 	shr	\$4,$len		# len / 16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3930) 	jz	.Ltail_dec
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3931) 	nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3932) .Loop_dec_xmm:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3933) 	movdqu	($inp,$otp),%xmm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3934) 	movdqa	($otp),%xmm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3935) 	pxor	%xmm0,%xmm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3936) 	movdqu	%xmm1,($out,$otp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3937) 	movdqa	%xmm0,($otp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3938) 	lea	16($otp),$otp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3939) 	dec	$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3940) 	jnz	.Loop_dec_xmm
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3941) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3942) 	pxor	%xmm1,%xmm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3943) 	and	\$15,%r10		# len % 16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3944) 	jz	.Ldone_dec
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3945) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3946) .Ltail_dec:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3947) 	mov	\$16,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3948) 	sub	%r10,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3949) 	xor	%eax,%eax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3950) 	xor	%r11d,%r11d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3951) .Loop_dec_byte:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3952) 	mov	($inp,$otp),%r11b
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3953) 	mov	($otp),%al
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3954) 	xor	%r11b,%al
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3955) 	mov	%al,($out,$otp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3956) 	mov	%r11b,($otp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3957) 	lea	1($otp),$otp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3958) 	dec	%r10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3959) 	jnz	.Loop_dec_byte
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3960) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3961) 	xor	%eax,%eax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3962) .Loop_dec_pad:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3963) 	mov	%al,($otp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3964) 	lea	1($otp),$otp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3965) 	dec	$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3966) 	jnz	.Loop_dec_pad
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3967) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3968) .Ldone_dec:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3969) 	mov	$otp,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3970) 	ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3971) .size	xor128_decrypt_n_pad,.-xor128_decrypt_n_pad
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3972) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3973) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3974) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3975) # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3976) #		CONTEXT *context,DISPATCHER_CONTEXT *disp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3977) if ($win64) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3978) $rec="%rcx";
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3979) $frame="%rdx";
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3980) $context="%r8";
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3981) $disp="%r9";
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3982) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3983) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3984) .extern	__imp_RtlVirtualUnwind
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3985) .type	se_handler,\@abi-omnipotent
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3986) .align	16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3987) se_handler:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3988) 	push	%rsi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3989) 	push	%rdi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3990) 	push	%rbx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3991) 	push	%rbp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3992) 	push	%r12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3993) 	push	%r13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3994) 	push	%r14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3995) 	push	%r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3996) 	pushfq
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3997) 	sub	\$64,%rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3998) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3999) 	mov	120($context),%rax	# pull context->Rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4000) 	mov	248($context),%rbx	# pull context->Rip
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4001) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4002) 	mov	8($disp),%rsi		# disp->ImageBase
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4003) 	mov	56($disp),%r11		# disp->HandlerData
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4004) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4005) 	mov	0(%r11),%r10d		# HandlerData[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4006) 	lea	(%rsi,%r10),%r10	# prologue label
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4007) 	cmp	%r10,%rbx		# context->Rip<.Lprologue
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4008) 	jb	.Lcommon_seh_tail
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4009) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4010) 	mov	152($context),%rax	# pull context->Rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4011) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4012) 	mov	4(%r11),%r10d		# HandlerData[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4013) 	lea	(%rsi,%r10),%r10	# epilogue label
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4014) 	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4015) 	jae	.Lcommon_seh_tail
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4016) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4017) 	lea	48(%rax),%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4018) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4019) 	mov	-8(%rax),%rbx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4020) 	mov	-16(%rax),%rbp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4021) 	mov	-24(%rax),%r12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4022) 	mov	-32(%rax),%r13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4023) 	mov	-40(%rax),%r14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4024) 	mov	-48(%rax),%r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4025) 	mov	%rbx,144($context)	# restore context->Rbx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4026) 	mov	%rbp,160($context)	# restore context->Rbp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4027) 	mov	%r12,216($context)	# restore context->R12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4028) 	mov	%r13,224($context)	# restore context->R13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4029) 	mov	%r14,232($context)	# restore context->R14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4030) 	mov	%r15,240($context)	# restore context->R14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4031) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4032) 	jmp	.Lcommon_seh_tail
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4033) .size	se_handler,.-se_handler
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4034) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4035) .type	avx_handler,\@abi-omnipotent
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4036) .align	16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4037) avx_handler:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4038) 	push	%rsi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4039) 	push	%rdi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4040) 	push	%rbx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4041) 	push	%rbp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4042) 	push	%r12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4043) 	push	%r13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4044) 	push	%r14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4045) 	push	%r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4046) 	pushfq
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4047) 	sub	\$64,%rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4048) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4049) 	mov	120($context),%rax	# pull context->Rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4050) 	mov	248($context),%rbx	# pull context->Rip
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4051) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4052) 	mov	8($disp),%rsi		# disp->ImageBase
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4053) 	mov	56($disp),%r11		# disp->HandlerData
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4054) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4055) 	mov	0(%r11),%r10d		# HandlerData[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4056) 	lea	(%rsi,%r10),%r10	# prologue label
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4057) 	cmp	%r10,%rbx		# context->Rip<prologue label
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4058) 	jb	.Lcommon_seh_tail
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4059) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4060) 	mov	152($context),%rax	# pull context->Rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4061) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4062) 	mov	4(%r11),%r10d		# HandlerData[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4063) 	lea	(%rsi,%r10),%r10	# epilogue label
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4064) 	cmp	%r10,%rbx		# context->Rip>=epilogue label
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4065) 	jae	.Lcommon_seh_tail
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4066) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4067) 	mov	208($context),%rax	# pull context->R11
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4068) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4069) 	lea	0x50(%rax),%rsi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4070) 	lea	0xf8(%rax),%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4071) 	lea	512($context),%rdi	# &context.Xmm6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4072) 	mov	\$20,%ecx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4073) 	.long	0xa548f3fc		# cld; rep movsq
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4074) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4075) .Lcommon_seh_tail:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4076) 	mov	8(%rax),%rdi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4077) 	mov	16(%rax),%rsi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4078) 	mov	%rax,152($context)	# restore context->Rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4079) 	mov	%rsi,168($context)	# restore context->Rsi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4080) 	mov	%rdi,176($context)	# restore context->Rdi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4081) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4082) 	mov	40($disp),%rdi		# disp->ContextRecord
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4083) 	mov	$context,%rsi		# context
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4084) 	mov	\$154,%ecx		# sizeof(CONTEXT)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4085) 	.long	0xa548f3fc		# cld; rep movsq
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4086) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4087) 	mov	$disp,%rsi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4088) 	xor	%ecx,%ecx		# arg1, UNW_FLAG_NHANDLER
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4089) 	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4090) 	mov	0(%rsi),%r8		# arg3, disp->ControlPc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4091) 	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4092) 	mov	40(%rsi),%r10		# disp->ContextRecord
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4093) 	lea	56(%rsi),%r11		# &disp->HandlerData
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4094) 	lea	24(%rsi),%r12		# &disp->EstablisherFrame
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4095) 	mov	%r10,32(%rsp)		# arg5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4096) 	mov	%r11,40(%rsp)		# arg6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4097) 	mov	%r12,48(%rsp)		# arg7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4098) 	mov	%rcx,56(%rsp)		# arg8, (NULL)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4099) 	call	*__imp_RtlVirtualUnwind(%rip)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4100) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4101) 	mov	\$1,%eax		# ExceptionContinueSearch
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4102) 	add	\$64,%rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4103) 	popfq
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4104) 	pop	%r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4105) 	pop	%r14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4106) 	pop	%r13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4107) 	pop	%r12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4108) 	pop	%rbp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4109) 	pop	%rbx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4110) 	pop	%rdi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4111) 	pop	%rsi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4112) 	ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4113) .size	avx_handler,.-avx_handler
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4114) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4115) .section	.pdata
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4116) .align	4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4117) 	.rva	.LSEH_begin_poly1305_init_x86_64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4118) 	.rva	.LSEH_end_poly1305_init_x86_64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4119) 	.rva	.LSEH_info_poly1305_init_x86_64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4120) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4121) 	.rva	.LSEH_begin_poly1305_blocks_x86_64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4122) 	.rva	.LSEH_end_poly1305_blocks_x86_64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4123) 	.rva	.LSEH_info_poly1305_blocks_x86_64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4124) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4125) 	.rva	.LSEH_begin_poly1305_emit_x86_64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4126) 	.rva	.LSEH_end_poly1305_emit_x86_64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4127) 	.rva	.LSEH_info_poly1305_emit_x86_64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4128) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4129) $code.=<<___ if ($avx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4130) 	.rva	.LSEH_begin_poly1305_blocks_avx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4131) 	.rva	.Lbase2_64_avx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4132) 	.rva	.LSEH_info_poly1305_blocks_avx_1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4133) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4134) 	.rva	.Lbase2_64_avx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4135) 	.rva	.Leven_avx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4136) 	.rva	.LSEH_info_poly1305_blocks_avx_2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4137) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4138) 	.rva	.Leven_avx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4139) 	.rva	.LSEH_end_poly1305_blocks_avx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4140) 	.rva	.LSEH_info_poly1305_blocks_avx_3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4141) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4142) 	.rva	.LSEH_begin_poly1305_emit_avx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4143) 	.rva	.LSEH_end_poly1305_emit_avx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4144) 	.rva	.LSEH_info_poly1305_emit_avx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4145) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4146) $code.=<<___ if ($avx>1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4147) 	.rva	.LSEH_begin_poly1305_blocks_avx2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4148) 	.rva	.Lbase2_64_avx2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4149) 	.rva	.LSEH_info_poly1305_blocks_avx2_1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4150) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4151) 	.rva	.Lbase2_64_avx2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4152) 	.rva	.Leven_avx2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4153) 	.rva	.LSEH_info_poly1305_blocks_avx2_2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4154) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4155) 	.rva	.Leven_avx2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4156) 	.rva	.LSEH_end_poly1305_blocks_avx2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4157) 	.rva	.LSEH_info_poly1305_blocks_avx2_3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4158) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4159) $code.=<<___ if ($avx>2);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4160) 	.rva	.LSEH_begin_poly1305_blocks_avx512
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4161) 	.rva	.LSEH_end_poly1305_blocks_avx512
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4162) 	.rva	.LSEH_info_poly1305_blocks_avx512
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4163) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4164) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4165) .section	.xdata
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4166) .align	8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4167) .LSEH_info_poly1305_init_x86_64:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4168) 	.byte	9,0,0,0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4169) 	.rva	se_handler
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4170) 	.rva	.LSEH_begin_poly1305_init_x86_64,.LSEH_begin_poly1305_init_x86_64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4171) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4172) .LSEH_info_poly1305_blocks_x86_64:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4173) 	.byte	9,0,0,0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4174) 	.rva	se_handler
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4175) 	.rva	.Lblocks_body,.Lblocks_epilogue
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4176) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4177) .LSEH_info_poly1305_emit_x86_64:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4178) 	.byte	9,0,0,0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4179) 	.rva	se_handler
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4180) 	.rva	.LSEH_begin_poly1305_emit_x86_64,.LSEH_begin_poly1305_emit_x86_64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4181) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4182) $code.=<<___ if ($avx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4183) .LSEH_info_poly1305_blocks_avx_1:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4184) 	.byte	9,0,0,0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4185) 	.rva	se_handler
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4186) 	.rva	.Lblocks_avx_body,.Lblocks_avx_epilogue		# HandlerData[]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4187) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4188) .LSEH_info_poly1305_blocks_avx_2:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4189) 	.byte	9,0,0,0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4190) 	.rva	se_handler
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4191) 	.rva	.Lbase2_64_avx_body,.Lbase2_64_avx_epilogue	# HandlerData[]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4192) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4193) .LSEH_info_poly1305_blocks_avx_3:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4194) 	.byte	9,0,0,0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4195) 	.rva	avx_handler
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4196) 	.rva	.Ldo_avx_body,.Ldo_avx_epilogue			# HandlerData[]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4197) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4198) .LSEH_info_poly1305_emit_avx:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4199) 	.byte	9,0,0,0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4200) 	.rva	se_handler
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4201) 	.rva	.LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4202) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4203) $code.=<<___ if ($avx>1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4204) .LSEH_info_poly1305_blocks_avx2_1:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4205) 	.byte	9,0,0,0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4206) 	.rva	se_handler
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4207) 	.rva	.Lblocks_avx2_body,.Lblocks_avx2_epilogue	# HandlerData[]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4208) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4209) .LSEH_info_poly1305_blocks_avx2_2:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4210) 	.byte	9,0,0,0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4211) 	.rva	se_handler
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4212) 	.rva	.Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue	# HandlerData[]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4213) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4214) .LSEH_info_poly1305_blocks_avx2_3:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4215) 	.byte	9,0,0,0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4216) 	.rva	avx_handler
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4217) 	.rva	.Ldo_avx2_body,.Ldo_avx2_epilogue		# HandlerData[]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4218) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4219) $code.=<<___ if ($avx>2);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4220) .LSEH_info_poly1305_blocks_avx512:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4221) 	.byte	9,0,0,0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4222) 	.rva	avx_handler
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4223) 	.rva	.Ldo_avx512_body,.Ldo_avx512_epilogue		# HandlerData[]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4224) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4225) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4226) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4227) open SELF,$0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4228) while(<SELF>) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4229) 	next if (/^#!/);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4230) 	last if (!s/^#/\/\// and !/^$/);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4231) 	print;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4232) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4233) close SELF;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4234) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4235) foreach (split('\n',$code)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4236) 	s/\`([^\`]*)\`/eval($1)/ge;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4237) 	s/%r([a-z]+)#d/%e$1/g;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4238) 	s/%r([0-9]+)#d/%r$1d/g;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4239) 	s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4240) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4241) 	if ($kernel) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4242) 		s/(^\.type.*),[0-9]+$/\1/;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4243) 		s/(^\.type.*),\@abi-omnipotent+$/\1,\@function/;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4244) 		next if /^\.cfi.*/;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4245) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4246) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4247) 	print $_,"\n";
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4248) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4249) close STDOUT;