^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1) #!/usr/bin/env perl
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2) # SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4) # Copyright (C) 2017-2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5) # Copyright (C) 2017-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6) # Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8) # This code is taken from the OpenSSL project but the author, Andy Polyakov,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9) # has relicensed it under the licenses specified in the SPDX header above.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 10) # The original headers, including the original license headers, are
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 11) # included below for completeness.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 12) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 13) # ====================================================================
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 14) # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 15) # project. The module is, however, dual licensed under OpenSSL and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 16) # CRYPTOGAMS licenses depending on where you obtain it. For further
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 17) # details see http://www.openssl.org/~appro/cryptogams/.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 18) # ====================================================================
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 19) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 20) # This module implements Poly1305 hash for x86_64.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 21) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 22) # March 2015
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 23) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 24) # Initial release.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 25) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 26) # December 2016
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 27) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 28) # Add AVX512F+VL+BW code path.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 29) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 30) # November 2017
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 31) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 32) # Convert AVX512F+VL+BW code path to pure AVX512F, so that it can be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 33) # executed even on Knights Landing. Trigger for modification was
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 34) # observation that AVX512 code paths can negatively affect overall
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 35) # Skylake-X system performance. Since we are likely to suppress
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 36) # AVX512F capability flag [at least on Skylake-X], conversion serves
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 37) # as kind of "investment protection". Note that next *lake processor,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 38) # Cannonlake, has AVX512IFMA code path to execute...
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 39) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 40) # Numbers are cycles per processed byte with poly1305_blocks alone,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 41) # measured with rdtsc at fixed clock frequency.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 42) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 43) # IALU/gcc-4.8(*) AVX(**) AVX2 AVX-512
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 44) # P4 4.46/+120% -
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 45) # Core 2 2.41/+90% -
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 46) # Westmere 1.88/+120% -
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 47) # Sandy Bridge 1.39/+140% 1.10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 48) # Haswell 1.14/+175% 1.11 0.65
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 49) # Skylake[-X] 1.13/+120% 0.96 0.51 [0.35]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 50) # Silvermont 2.83/+95% -
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 51) # Knights L 3.60/? 1.65 1.10 0.41(***)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 52) # Goldmont 1.70/+180% -
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 53) # VIA Nano 1.82/+150% -
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 54) # Sledgehammer 1.38/+160% -
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 55) # Bulldozer 2.30/+130% 0.97
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 56) # Ryzen 1.15/+200% 1.08 1.18
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 57) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 58) # (*) improvement coefficients relative to clang are more modest and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 59) # are ~50% on most processors, in both cases we are comparing to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 60) # __int128 code;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 61) # (**) SSE2 implementation was attempted, but among non-AVX processors
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 62) # it was faster than integer-only code only on older Intel P4 and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 63) # Core processors, by 50-30% (less the newer the processor is), but
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 64) # slower on contemporary ones, for example almost 2x slower on Atom;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 65) # as the former are naturally disappearing, SSE2 is deemed unnecessary;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 66) # (***) strangely enough performance seems to vary from core to core,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 67) # listed result is best case;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 68)
# Command line: [flavour] [output].  A single argument that contains a dot
# is taken to be the output file name and the flavour is left undefined.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Windows assemblers are selected by flavour or by an ".asm" output name.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
# With neither a flavour nor an output name the script runs in "kernel"
# mode: no translator is spawned and no toolchain probing is done.
$kernel=0; $kernel=1 if (!$flavour && !$output);

# $avx is the highest vector level the assembler is trusted with.  Later
# code tests it as ($avx), ($avx>1) and ($avx>3) to gate the AVX, AVX2 and
# base 2^44 code paths respectively.
if (!$kernel) {
	# Look for the translator next to this script, then in ../../perlasm.
	$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
	( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
	( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
	die "can't locate x86_64-xlate.pl";

	# Everything written to STDOUT from here on is piped through the
	# translator.  Fail loudly if the pipe cannot be created instead of
	# silently producing no output.
	open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
	    or die "can't call $xlate: $!";
	*STDOUT=*OUT;

	# GNU assembler, probed through the compiler driver named by $ENV{CC}.
	# NOTE(review): versions are compared as decimal fractions, so a
	# hypothetical "2.9" would compare greater than "2.19".
	if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
	    =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
		$avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
	}

	# NASM (Windows only); 2.11.8 and later patch levels get one extra level.
	if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
		$avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
		$avx += 1 if ($1==2.11 && $2>=8);
	}

	# Microsoft ml64 (Windows only).
	if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
		$avx = ($1>=10) + ($1>=11);
	}

	# clang/LLVM integrated assembler.  The major version may have more
	# than one digit: the former pattern "[3-9]\.[0-9]+" could not match
	# "clang version 10.0.0" and later, leaving $avx at 0 for them.
	if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
		$avx = ($2>=3.0) + ($2>3.0);
	}
} else {
	$avx = 4; # The kernel uses ifdefs for this.
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 107)
# Append the assembler prologue of function $name to the global $code.
#
# Arguments:
#   $name  - symbol name of the function
#   $align - value emitted after ".align"
#   $nargs - argument count; used only in the non-kernel ".type" directive
#
# When the global $kernel is true the output is ".align $align",
# "SYM_FUNC_START($name)" and a local ".L$name:" label.  Otherwise it is
# ".globl $name", ".type $name,@function,$nargs", ".align $align" and a
# "$name:" label.
#
# The sub used to carry an empty prototype "()", which declares that it
# takes no arguments although it unpacks three.  That only worked because
# callers use the "&declare_function(...)" form, which bypasses prototypes.
# Without the prototype both call forms are valid.
sub declare_function {
	my ($name, $align, $nargs) = @_;
	if ($kernel) {
		$code .= ".align $align\n";
		$code .= "SYM_FUNC_START($name)\n";
		$code .= ".L$name:\n";
	} else {
		$code .= ".globl $name\n";
		$code .= ".type $name,\@function,$nargs\n";
		$code .= ".align $align\n";
		$code .= "$name:\n";
	}
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 121)
# Append the assembler epilogue marker of function $name to the global $code.
#
# Arguments:
#   $name - symbol name of the function being closed
#
# When the global $kernel is true this emits "SYM_FUNC_END($name)",
# otherwise a ".size $name,.-$name" directive.
#
# The sub used to carry an empty prototype "()" although it takes one
# argument, which only worked with the "&end_function(...)" call form
# (that form bypasses prototypes).  Without it both call forms are valid.
sub end_function {
	my ($name) = @_;
	if ($kernel) {
		$code .= "SYM_FUNC_END($name)\n";
	} else {
		$code .= ".size $name,.-$name\n";
	}
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 130)
# Kernel builds start the output with the linkage.h include.  The
# SYM_FUNC_START/SYM_FUNC_END macros emitted by declare_function and
# end_function presumably come from that header -- TODO confirm.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 131) $code.=<<___ if $kernel;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 132) #include <linux/linkage.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 133) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 134)
# Constant tables, emitted only when some vector level is enabled ($avx is
# non-zero).  Kernel builds place them in .rodata; otherwise no section
# directive is emitted here.
# Judging by the label names these are limb masks, permutation indices and
# shift counts for the AVX, AVX2 and AVX-512 paths; the code that consumes
# them lies outside this part of the file.
# NOTE(review): the backtick expressions such as `1<<24` are plain text
# inside the heredoc.  Presumably a later pass over $code evaluates them
# before the text is printed -- confirm.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 135) if ($avx) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 136) $code.=<<___ if $kernel;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 137) .section .rodata
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 138) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 139) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 140) .align 64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 141) .Lconst:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 142) .Lmask24:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 143) .long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 144) .L129:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 145) .long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 146) .Lmask26:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 147) .long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 148) .Lpermd_avx2:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 149) .long 2,2,2,3,2,0,2,1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 150) .Lpermd_avx512:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 151) .long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 152)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 153) .L2_44_inp_permd:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 154) .long 0,1,1,2,2,3,7,7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 155) .L2_44_inp_shift:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 156) .quad 0,12,24,64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 157) .L2_44_mask:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 158) .quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 159) .L2_44_shift_rgt:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 160) .quad 44,44,42,64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 161) .L2_44_shift_lft:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 162) .quad 8,8,10,64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 163)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 164) .align 64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 165) .Lx_mask44:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 166) .quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 167) .quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 168) .Lx_mask42:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 169) .quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 170) .quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 171) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 172) }
# Non-kernel builds additionally embed an identification string, followed
# by a 16-byte alignment directive.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 173) $code.=<<___ if (!$kernel);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 174) .asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 175) .align 16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 176) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 177)
# Symbolic register names interpolated into the assembly heredocs below.
#   $ctx,$inp,$len,$padbit - the four arguments of the *_blocks functions
#                            (presumably the System V argument registers
#                            -- TODO confirm against x86_64-xlate.pl)
#   $mac,$nonce            - aliases of $inp,$len used by the *_emit functions
#   $d1-$d3                - scratch registers of poly1305_iteration
#   $r0,$r1,$s1            - key words r0, r1 and s1 = r1 + (r1 >> 2)
#   $h0-$h2                - the three words of the hash accumulator
# Note that $d3 is %rdi, the same register as $ctx: any code that runs
# poly1305_iteration has to save and reload $ctx around it.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 178) my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 179) my ($mac,$nonce)=($inp,$len); # *_emit arguments
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 180) my ($d1,$d2,$d3, $r0,$r1,$s1)=("%r8","%r9","%rdi","%r11","%r12","%r13");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 181) my ($h0,$h1,$h2)=("%r14","%rbx","%r10");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 182)
# Append one multiply-and-reduce step to the global $code.  Takes no
# arguments; the register names come from the file-level lexicals
# ($h0-$h2, $r0, $r1, $s1, $d1-$d3).
#
# The emitted sequence forms the partial products h0*r1, h0*r0, h1*r0,
# h1*s1, h2*s1 and h2*r0, accumulates them into $h0, $d2 (moved to $h1)
# and $d3, and then performs the "last reduction step": the low two bits
# of $d3 become the new $h2, and the remaining bits are folded back into
# $h0 as (d3 & -4) + (d3 >> 2), with the carry propagated through $h1/$h2.
#
# Clobbers %rax, %rdx and $d1-$d3.  Since $d3 is %rdi this destroys $ctx,
# and %rax no longer holds $r1 on exit, so callers must reload both.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 183) sub poly1305_iteration {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 184) # input: copy of $r1 in %rax, $h0-$h2, $r0-$r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 185) # output: $h0-$h2 *= $r0-$r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 186) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 187) mulq $h0 # h0*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 188) mov %rax,$d2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 189) mov $r0,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 190) mov %rdx,$d3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 191)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 192) mulq $h0 # h0*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 193) mov %rax,$h0 # future $h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 194) mov $r0,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 195) mov %rdx,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 196)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 197) mulq $h1 # h1*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 198) add %rax,$d2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 199) mov $s1,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 200) adc %rdx,$d3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 201)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 202) mulq $h1 # h1*s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 203) mov $h2,$h1 # borrow $h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 204) add %rax,$h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 205) adc %rdx,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 206)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 207) imulq $s1,$h1 # h2*s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 208) add $h1,$d2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 209) mov $d1,$h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 210) adc \$0,$d3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 211)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 212) imulq $r0,$h2 # h2*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 213) add $d2,$h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 214) mov \$-4,%rax # mask value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 215) adc $h2,$d3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 216)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 217) and $d3,%rax # last reduction step
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 218) mov $d3,$h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 219) shr \$2,$d3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 220) and \$3,$h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 221) add $d3,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 222) add %rax,$h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 223) adc \$0,$h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 224) adc \$0,$h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 225) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 226) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 227)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 228) ########################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 229) # Layout of opaque area is following.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 230) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 231) # unsigned __int64 h[3]; # current hash value base 2^64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 232) # unsigned __int64 r[2]; # key value base 2^64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 233)
# The code below addresses this area through $ctx: h[] at byte offsets
# 0, 8 and 16, and r[] at byte offsets 24 and 32.
#
# Switch to the text section for the functions that follow.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 234) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 235) .text
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 236) ___
# Non-kernel builds reference the external OPENSSL_ia32cap_P symbol and
# mark the three scalar entry points as global but hidden.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 237) $code.=<<___ if (!$kernel);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 238) .extern OPENSSL_ia32cap_P
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 239)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 240) .globl poly1305_init_x86_64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 241) .hidden poly1305_init_x86_64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 242) .globl poly1305_blocks_x86_64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 243) .hidden poly1305_blocks_x86_64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 244) .globl poly1305_emit_x86_64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 245) .hidden poly1305_emit_x86_64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 246) ___
# poly1305_init_x86_64: arguments are $ctx, $inp (the key pointer) and, in
# non-kernel builds, a third pointer in %rdx that receives function
# addresses.  %eax is 0 on return when $inp is zero and 1 otherwise.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 247) &declare_function("poly1305_init_x86_64", 32, 3);
# Zero the three hash words, then leave early (with %eax still 0) if no
# key pointer was supplied.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 248) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 249) xor %eax,%eax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 250) mov %rax,0($ctx) # initialize hash value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 251) mov %rax,8($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 252) mov %rax,16($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 253)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 254) cmp \$0,$inp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 255) je .Lno_key
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 256) ___
# Non-kernel builds select the blocks/emit implementations at run time.
# %r10/%r11 start out as the scalar versions.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 257) $code.=<<___ if (!$kernel);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 258) lea poly1305_blocks_x86_64(%rip),%r10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 259) lea poly1305_emit_x86_64(%rip),%r11
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 260) ___
# If the assembler supports AVX, switch both pointers to the AVX versions
# when the capability bit tested by "bt" is set in the 64-bit word loaded
# from OPENSSL_ia32cap_P+4.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 261) $code.=<<___ if (!$kernel && $avx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 262) mov OPENSSL_ia32cap_P+4(%rip),%r9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 263) lea poly1305_blocks_avx(%rip),%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 264) lea poly1305_emit_avx(%rip),%rcx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 265) bt \$`60-32`,%r9 # AVX?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 266) cmovc %rax,%r10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 267) cmovc %rcx,%r11
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 268) ___
# With AVX2 assembler support only the blocks pointer is upgraded; the
# emit pointer is left as selected above.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 269) $code.=<<___ if (!$kernel && $avx>1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 270) lea poly1305_blocks_avx2(%rip),%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 271) bt \$`5+32`,%r9 # AVX2?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 272) cmovc %rax,%r10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 273) ___
# With $avx > 3, branch to .Linit_base2_44 (defined outside this part of
# the file) when bits 16, 21 and 31 are all set in the upper half of %r9.
# NOTE(review): presumably these are the AVX-512 feature bits required by
# the base 2^44 path -- confirm against the OPENSSL_ia32cap_P layout.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 274) $code.=<<___ if (!$kernel && $avx>3);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 275) mov \$`(1<<31|1<<21|1<<16)`,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 276) shr \$32,%r9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 277) and %rax,%r9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 278) cmp %rax,%r9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 279) je .Linit_base2_44
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 280) ___
# Mask the two 64-bit key words read from $inp (the Poly1305 clamp of r)
# and store them at offsets 24 and 32 of the context.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 281) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 282) mov \$0x0ffffffc0fffffff,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 283) mov \$0x0ffffffc0ffffffc,%rcx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 284) and 0($inp),%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 285) and 8($inp),%rcx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 286) mov %rax,24($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 287) mov %rcx,32($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 288) ___
# Hand the selected function addresses back through the pointer in %rdx:
# two 8-byte slots normally, two 4-byte slots for an elf32 flavour.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 289) $code.=<<___ if (!$kernel && $flavour !~ /elf32/);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 290) mov %r10,0(%rdx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 291) mov %r11,8(%rdx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 292) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 293) $code.=<<___ if (!$kernel && $flavour =~ /elf32/);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 294) mov %r10d,0(%rdx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 295) mov %r11d,4(%rdx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 296) ___
# Report success in %eax; the .Lno_key path arrives at the ret directly.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 297) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 298) mov \$1,%eax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 299) .Lno_key:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 300) ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 301) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 302) &end_function("poly1305_init_x86_64");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 303)
# poly1305_blocks_x86_64: arguments are $ctx, $inp, $len and $padbit.
# $len is shifted right by 4 to give the number of 16-byte blocks; when
# that is zero the function returns without touching the context.
# Otherwise it saves %rbx, %r12-%r15 and $ctx on the stack, loads r and
# the hash from the context, and runs one loop pass per block.
# NOTE(review): .cfi_push is not a plain GNU as directive; presumably
# x86_64-xlate.pl (or the kernel build) expands it -- confirm.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 304) &declare_function("poly1305_blocks_x86_64", 32, 4);
# Prologue and loop head.  The block count moves to %r15, presumably
# because %rdx ($len) is overwritten by the mulq instructions of the
# iteration.  Each pass adds 16 input bytes into $h0/$h1 and $padbit
# (with carry) into $h2.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 305) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 306) .cfi_startproc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 307) .Lblocks:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 308) shr \$4,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 309) jz .Lno_data # too short
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 310)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 311) push %rbx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 312) .cfi_push %rbx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 313) push %r12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 314) .cfi_push %r12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 315) push %r13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 316) .cfi_push %r13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 317) push %r14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 318) .cfi_push %r14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 319) push %r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 320) .cfi_push %r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 321) push $ctx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 322) .cfi_push $ctx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 323) .Lblocks_body:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 324)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 325) mov $len,%r15 # reassign $len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 326)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 327) mov 24($ctx),$r0 # load r
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 328) mov 32($ctx),$s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 329)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 330) mov 0($ctx),$h0 # load hash value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 331) mov 8($ctx),$h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 332) mov 16($ctx),$h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 333)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 334) mov $s1,$r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 335) shr \$2,$s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 336) mov $r1,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 337) add $r1,$s1 # s1 = r1 + (r1 >> 2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 338) jmp .Loop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 339)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 340) .align 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 341) .Loop:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 342) add 0($inp),$h0 # accumulate input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 343) adc 8($inp),$h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 344) lea 16($inp),$inp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 345) adc $padbit,$h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 346) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 347)
# Loop body: h = h * r with reduction.  This clobbers %rax and %rdi ($ctx).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 348) &poly1305_iteration();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 349)
# Loop tail and epilogue.  %rax is reloaded with $r1 for the next pass.
# After the last block $ctx is reloaded from the stack slot it was pushed
# to, the hash is stored back, the saved registers are restored with plain
# loads, and the 48 bytes of stack are released.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 350) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 351) mov $r1,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 352) dec %r15 # len-=16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 353) jnz .Loop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 354)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 355) mov 0(%rsp),$ctx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 356) .cfi_restore $ctx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 357)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 358) mov $h0,0($ctx) # store hash value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 359) mov $h1,8($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 360) mov $h2,16($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 361)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 362) mov 8(%rsp),%r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 363) .cfi_restore %r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 364) mov 16(%rsp),%r14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 365) .cfi_restore %r14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 366) mov 24(%rsp),%r13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 367) .cfi_restore %r13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 368) mov 32(%rsp),%r12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 369) .cfi_restore %r12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 370) mov 40(%rsp),%rbx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 371) .cfi_restore %rbx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 372) lea 48(%rsp),%rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 373) .cfi_adjust_cfa_offset -48
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 374) .Lno_data:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 375) .Lblocks_epilogue:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 376) ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 377) .cfi_endproc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 378) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 379) &end_function("poly1305_blocks_x86_64");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 380)
# poly1305_emit_x86_64: arguments are $ctx, $mac and $nonce.
# It loads the three hash words and computes h + 5 in %r8/%r9/%r10.  If
# that sum has any bit set at or above bit 2 of the top word (so that
# h + 5 >= 2^130) the low 128 bits of h + 5 are selected, otherwise the
# low 128 bits of h.  The 128-bit nonce is then added and the result is
# written to the 16 bytes at $mac.  The context is only read.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 381) &declare_function("poly1305_emit_x86_64", 32, 3);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 382) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 383) .Lemit:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 384) mov 0($ctx),%r8 # load hash value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 385) mov 8($ctx),%r9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 386) mov 16($ctx),%r10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 387)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 388) mov %r8,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 389) add \$5,%r8 # compare to modulus
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 390) mov %r9,%rcx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 391) adc \$0,%r9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 392) adc \$0,%r10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 393) shr \$2,%r10 # did 130-bit value overflow?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 394) cmovnz %r8,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 395) cmovnz %r9,%rcx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 396)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 397) add 0($nonce),%rax # accumulate nonce
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 398) adc 8($nonce),%rcx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 399) mov %rax,0($mac) # write result
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 400) mov %rcx,8($mac)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 401)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 402) ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 403) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 404) &end_function("poly1305_emit_x86_64");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 405) if ($avx) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 406)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 407) ########################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 408) # Layout of opaque area is following.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 409) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 410) # unsigned __int32 h[5]; # current hash value base 2^26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 411) # unsigned __int32 is_base2_26;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 412) # unsigned __int64 r[2]; # key value base 2^64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 413) # unsigned __int64 pad;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 414) # struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 415) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 416) # where r^n are base 2^26 digits of degrees of multiplier key. There are
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 417) # 5 digits, but last four are interleaved with multiples of 5, totalling
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 418) # in 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 419)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 420) my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 421) map("%xmm$_",(0..15));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 422)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 423) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 424) .type __poly1305_block,\@abi-omnipotent
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 425) .align 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 426) __poly1305_block:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 427) push $ctx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 428) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 429) &poly1305_iteration();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 430) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 431) pop $ctx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 432) ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 433) .size __poly1305_block,.-__poly1305_block
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 434)
# ----------------------------------------------------------------------
# __poly1305_init_avx
#
# Precomputes the table of key powers r, r^2, r^3 and r^4 that the
# vector code reads from the context.
#
# Inputs, as used below: r0/r1 hold the key r as two 64-bit words and
# ctx points at the context.  h is seeded with r, and each call to
# __poly1305_block is annotated below as producing the next power.
# NOTE(review): __poly1305_block is defined outside this view; the
# "rax = r1" setup before each call presumably matches its calling
# convention - confirm against its body.
#
# Every power is split into five 26-bit limbs and written as 32-bit
# words.  The table starts 48 bytes into the context and consists of
# nine 16-byte rows with four lanes each:
#
#   rows : 0=limb0  1=limb1  2=5*limb1  3=limb2  4=5*limb2
#          5=limb3  6=5*limb3  7=limb4  8=5*limb4
#   lanes: +0=r^2   +4=r       +8=r^4   +12=r^3
#
# Limb extraction from the words (h0,h1,h2), mask = 0x3ffffff:
#   limb0 = h0 & mask
#   limb1 = (h0 >> 26) & mask
#   limb2 = ((h0 >> 52) | (h1 << 12)) & mask
#   limb3 = (h1 >> 14) & mask
#   limb4 = (h1 >> 40) | (h2 << 24)      (stored without masking)
# For r itself only two words exist, so limb4 is just r1 >> 40.
#
# The context pointer is biased by 48+64 while the table is written and
# is restored before returning.  Overwrites rax, rdx, d1, d2 and h0-h2
# here, in addition to whatever __poly1305_block changes.
# ----------------------------------------------------------------------
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 435) .type __poly1305_init_avx,\@abi-omnipotent
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 436) .align 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 437) __poly1305_init_avx:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 438) push %rbp # conventional rbp frame
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 439) mov %rsp,%rbp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 440) mov $r0,$h0 # seed h with r so the first multiply yields r^2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 441) mov $r1,$h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 442) xor $h2,$h2 # third word of h starts at zero
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 443)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 444) lea 48+64($ctx),$ctx # size optimization: with this bias all table offsets lie in -64..+76
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 445)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 446) mov $r1,%rax # rax = r1 ahead of the call
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 447) call __poly1305_block # r^2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 448)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 449) mov \$0x3ffffff,%eax # save interleaved r^2 and r base 2^26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 450) mov \$0x3ffffff,%edx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 451) mov $h0,$d1 # d1 walks the limbs of r^2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 452) and $h0#d,%eax # limb 0 of r^2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 453) mov $r0,$d2 # d2 walks the limbs of r
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 454) and $r0#d,%edx # limb 0 of r
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 455) mov %eax,`16*0+0-64`($ctx) # row 0, lane +0 (r^2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 456) shr \$26,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 457) mov %edx,`16*0+4-64`($ctx) # row 0, lane +4 (r)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 458) shr \$26,$d2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 459)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 460) mov \$0x3ffffff,%eax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 461) mov \$0x3ffffff,%edx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 462) and $d1#d,%eax # limb 1 of r^2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 463) and $d2#d,%edx # limb 1 of r
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 464) mov %eax,`16*1+0-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 465) lea (%rax,%rax,4),%eax # *5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 466) mov %edx,`16*1+4-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 467) lea (%rdx,%rdx,4),%edx # *5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 468) mov %eax,`16*2+0-64`($ctx) # row 2 keeps 5*limb 1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 469) shr \$26,$d1 # d1 = h0 >> 52, the low 12 bits of limb 2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 470) mov %edx,`16*2+4-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 471) shr \$26,$d2 # d2 = r0 >> 52
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 472)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 473) mov $h1,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 474) mov $r1,%rdx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 475) shl \$12,%rax # make room for the 12 bits left in the low word
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 476) shl \$12,%rdx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 477) or $d1,%rax # limb 2 straddles the two 64-bit words
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 478) or $d2,%rdx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 479) and \$0x3ffffff,%eax # limb 2 of r^2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 480) and \$0x3ffffff,%edx # limb 2 of r
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 481) mov %eax,`16*3+0-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 482) lea (%rax,%rax,4),%eax # *5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 483) mov %edx,`16*3+4-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 484) lea (%rdx,%rdx,4),%edx # *5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 485) mov %eax,`16*4+0-64`($ctx) # row 4 keeps 5*limb 2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 486) mov $h1,$d1 # reload the high word for limbs 3 and 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 487) mov %edx,`16*4+4-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 488) mov $r1,$d2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 489)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 490) mov \$0x3ffffff,%eax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 491) mov \$0x3ffffff,%edx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 492) shr \$14,$d1 # drop the 14 bits already consumed by limb 2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 493) shr \$14,$d2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 494) and $d1#d,%eax # limb 3 of r^2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 495) and $d2#d,%edx # limb 3 of r
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 496) mov %eax,`16*5+0-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 497) lea (%rax,%rax,4),%eax # *5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 498) mov %edx,`16*5+4-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 499) lea (%rdx,%rdx,4),%edx # *5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 500) mov %eax,`16*6+0-64`($ctx) # row 6 keeps 5*limb 3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 501) shr \$26,$d1 # d1 = h1 >> 40
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 502) mov %edx,`16*6+4-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 503) shr \$26,$d2 # d2 = r1 >> 40, which is all of limb 4 of r
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 504)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 505) mov $h2,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 506) shl \$24,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 507) or %rax,$d1 # limb 4 of r^2 = h1 >> 40 | h2 << 24, stored unmasked
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 508) mov $d1#d,`16*7+0-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 509) lea ($d1,$d1,4),$d1 # *5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 510) mov $d2#d,`16*7+4-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 511) lea ($d2,$d2,4),$d2 # *5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 512) mov $d1#d,`16*8+0-64`($ctx) # row 8 keeps 5*limb 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 513) mov $d2#d,`16*8+4-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 514)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 515) mov $r1,%rax # rax = r1 again; rax was reused as scratch above
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 516) call __poly1305_block # r^3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 517)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 518) mov \$0x3ffffff,%eax # save r^3 base 2^26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 519) mov $h0,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 520) and $h0#d,%eax # limb 0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 521) shr \$26,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 522) mov %eax,`16*0+12-64`($ctx) # r^3 goes to lane +12 of every row
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 523)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 524) mov \$0x3ffffff,%edx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 525) and $d1#d,%edx # limb 1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 526) mov %edx,`16*1+12-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 527) lea (%rdx,%rdx,4),%edx # *5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 528) shr \$26,$d1 # d1 = h0 >> 52
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 529) mov %edx,`16*2+12-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 530)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 531) mov $h1,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 532) shl \$12,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 533) or $d1,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 534) and \$0x3ffffff,%eax # limb 2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 535) mov %eax,`16*3+12-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 536) lea (%rax,%rax,4),%eax # *5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 537) mov $h1,$d1 # reload the high word for limbs 3 and 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 538) mov %eax,`16*4+12-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 539)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 540) mov \$0x3ffffff,%edx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 541) shr \$14,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 542) and $d1#d,%edx # limb 3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 543) mov %edx,`16*5+12-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 544) lea (%rdx,%rdx,4),%edx # *5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 545) shr \$26,$d1 # d1 = h1 >> 40
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 546) mov %edx,`16*6+12-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 547)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 548) mov $h2,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 549) shl \$24,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 550) or %rax,$d1 # limb 4, stored unmasked
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 551) mov $d1#d,`16*7+12-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 552) lea ($d1,$d1,4),$d1 # *5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 553) mov $d1#d,`16*8+12-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 554)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 555) mov $r1,%rax # rax = r1 again before the last multiply
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 556) call __poly1305_block # r^4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 557)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 558) mov \$0x3ffffff,%eax # save r^4 base 2^26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 559) mov $h0,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 560) and $h0#d,%eax # limb 0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 561) shr \$26,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 562) mov %eax,`16*0+8-64`($ctx) # r^4 goes to lane +8 of every row
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 563)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 564) mov \$0x3ffffff,%edx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 565) and $d1#d,%edx # limb 1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 566) mov %edx,`16*1+8-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 567) lea (%rdx,%rdx,4),%edx # *5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 568) shr \$26,$d1 # d1 = h0 >> 52
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 569) mov %edx,`16*2+8-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 570)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 571) mov $h1,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 572) shl \$12,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 573) or $d1,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 574) and \$0x3ffffff,%eax # limb 2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 575) mov %eax,`16*3+8-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 576) lea (%rax,%rax,4),%eax # *5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 577) mov $h1,$d1 # reload the high word for limbs 3 and 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 578) mov %eax,`16*4+8-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 579)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 580) mov \$0x3ffffff,%edx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 581) shr \$14,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 582) and $d1#d,%edx # limb 3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 583) mov %edx,`16*5+8-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 584) lea (%rdx,%rdx,4),%edx # *5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 585) shr \$26,$d1 # d1 = h1 >> 40
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 586) mov %edx,`16*6+8-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 587)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 588) mov $h2,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 589) shl \$24,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 590) or %rax,$d1 # limb 4, stored unmasked
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 591) mov $d1#d,`16*7+8-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 592) lea ($d1,$d1,4),$d1 # *5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 593) mov $d1#d,`16*8+8-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 594)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 595) lea -48-64($ctx),$ctx # size [de-]optimization: undo the bias so ctx is unchanged for the caller
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 596) pop %rbp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 597) ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 598) .size __poly1305_init_avx,.-__poly1305_init_avx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 599) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 600)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 601) &declare_function("poly1305_blocks_avx", 32, 4);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 602) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 603) .cfi_startproc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 604) mov 20($ctx),%r8d # is_base2_26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 605) cmp \$128,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 606) jae .Lblocks_avx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 607) test %r8d,%r8d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 608) jz .Lblocks
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 609)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 610) .Lblocks_avx:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 611) and \$-16,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 612) jz .Lno_data_avx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 613)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 614) vzeroupper
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 615)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 616) test %r8d,%r8d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 617) jz .Lbase2_64_avx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 618)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 619) test \$31,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 620) jz .Leven_avx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 621)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 622) push %rbp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 623) .cfi_push %rbp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 624) mov %rsp,%rbp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 625) push %rbx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 626) .cfi_push %rbx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 627) push %r12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 628) .cfi_push %r12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 629) push %r13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 630) .cfi_push %r13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 631) push %r14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 632) .cfi_push %r14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 633) push %r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 634) .cfi_push %r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 635) .Lblocks_avx_body:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 636)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 637) mov $len,%r15 # reassign $len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 638)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 639) mov 0($ctx),$d1 # load hash value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 640) mov 8($ctx),$d2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 641) mov 16($ctx),$h2#d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 642)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 643) mov 24($ctx),$r0 # load r
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 644) mov 32($ctx),$s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 645)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 646) ################################# base 2^26 -> base 2^64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 647) mov $d1#d,$h0#d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 648) and \$`-1*(1<<31)`,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 649) mov $d2,$r1 # borrow $r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 650) mov $d2#d,$h1#d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 651) and \$`-1*(1<<31)`,$d2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 652)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 653) shr \$6,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 654) shl \$52,$r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 655) add $d1,$h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 656) shr \$12,$h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 657) shr \$18,$d2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 658) add $r1,$h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 659) adc $d2,$h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 660)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 661) mov $h2,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 662) shl \$40,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 663) shr \$24,$h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 664) add $d1,$h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 665) adc \$0,$h2 # can be partially reduced...
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 666)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 667) mov \$-4,$d2 # ... so reduce
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 668) mov $h2,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 669) and $h2,$d2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 670) shr \$2,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 671) and \$3,$h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 672) add $d2,$d1 # =*5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 673) add $d1,$h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 674) adc \$0,$h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 675) adc \$0,$h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 676)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 677) mov $s1,$r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 678) mov $s1,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 679) shr \$2,$s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 680) add $r1,$s1 # s1 = r1 + (r1 >> 2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 681)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 682) add 0($inp),$h0 # accumulate input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 683) adc 8($inp),$h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 684) lea 16($inp),$inp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 685) adc $padbit,$h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 686)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 687) call __poly1305_block
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 688)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 689) test $padbit,$padbit # if $padbit is zero,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 690) jz .Lstore_base2_64_avx # store hash in base 2^64 format
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 691)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 692) ################################# base 2^64 -> base 2^26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 693) mov $h0,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 694) mov $h0,%rdx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 695) shr \$52,$h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 696) mov $h1,$r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 697) mov $h1,$r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 698) shr \$26,%rdx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 699) and \$0x3ffffff,%rax # h[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 700) shl \$12,$r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 701) and \$0x3ffffff,%rdx # h[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 702) shr \$14,$h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 703) or $r0,$h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 704) shl \$24,$h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 705) and \$0x3ffffff,$h0 # h[2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 706) shr \$40,$r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 707) and \$0x3ffffff,$h1 # h[3]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 708) or $r1,$h2 # h[4]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 709)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 710) sub \$16,%r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 711) jz .Lstore_base2_26_avx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 712)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 713) vmovd %rax#d,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 714) vmovd %rdx#d,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 715) vmovd $h0#d,$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 716) vmovd $h1#d,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 717) vmovd $h2#d,$H4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 718) jmp .Lproceed_avx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 719)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 720) .align 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 721) .Lstore_base2_64_avx:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 722) mov $h0,0($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 723) mov $h1,8($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 724) mov $h2,16($ctx) # note that is_base2_26 is zeroed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 725) jmp .Ldone_avx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 726)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 727) .align 16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 728) .Lstore_base2_26_avx:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 729) mov %rax#d,0($ctx) # store hash value base 2^26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 730) mov %rdx#d,4($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 731) mov $h0#d,8($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 732) mov $h1#d,12($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 733) mov $h2#d,16($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 734) .align 16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 735) .Ldone_avx:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 736) pop %r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 737) .cfi_restore %r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 738) pop %r14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 739) .cfi_restore %r14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 740) pop %r13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 741) .cfi_restore %r13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 742) pop %r12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 743) .cfi_restore %r12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 744) pop %rbx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 745) .cfi_restore %rbx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 746) pop %rbp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 747) .cfi_restore %rbp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 748) .Lno_data_avx:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 749) .Lblocks_avx_epilogue:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 750) ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 751) .cfi_endproc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 752)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 753) .align 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 754) .Lbase2_64_avx:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 755) .cfi_startproc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 756) push %rbp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 757) .cfi_push %rbp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 758) mov %rsp,%rbp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 759) push %rbx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 760) .cfi_push %rbx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 761) push %r12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 762) .cfi_push %r12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 763) push %r13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 764) .cfi_push %r13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 765) push %r14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 766) .cfi_push %r14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 767) push %r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 768) .cfi_push %r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 769) .Lbase2_64_avx_body:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 770)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 771) mov $len,%r15 # reassign $len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 772)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 773) mov 24($ctx),$r0 # load r
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 774) mov 32($ctx),$s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 775)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 776) mov 0($ctx),$h0 # load hash value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 777) mov 8($ctx),$h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 778) mov 16($ctx),$h2#d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 779)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 780) mov $s1,$r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 781) mov $s1,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 782) shr \$2,$s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 783) add $r1,$s1 # s1 = r1 + (r1 >> 2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 784)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 785) test \$31,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 786) jz .Linit_avx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 787)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 788) add 0($inp),$h0 # accumulate input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 789) adc 8($inp),$h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 790) lea 16($inp),$inp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 791) adc $padbit,$h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 792) sub \$16,%r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 793)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 794) call __poly1305_block
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 795)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 796) .Linit_avx:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 797) ################################# base 2^64 -> base 2^26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 798) mov $h0,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 799) mov $h0,%rdx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 800) shr \$52,$h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 801) mov $h1,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 802) mov $h1,$d2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 803) shr \$26,%rdx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 804) and \$0x3ffffff,%rax # h[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 805) shl \$12,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 806) and \$0x3ffffff,%rdx # h[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 807) shr \$14,$h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 808) or $d1,$h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 809) shl \$24,$h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 810) and \$0x3ffffff,$h0 # h[2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 811) shr \$40,$d2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 812) and \$0x3ffffff,$h1 # h[3]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 813) or $d2,$h2 # h[4]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 814)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 815) vmovd %rax#d,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 816) vmovd %rdx#d,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 817) vmovd $h0#d,$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 818) vmovd $h1#d,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 819) vmovd $h2#d,$H4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 820) movl \$1,20($ctx) # set is_base2_26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 821)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 822) call __poly1305_init_avx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 823)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 824) .Lproceed_avx:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 825) mov %r15,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 826) pop %r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 827) .cfi_restore %r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 828) pop %r14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 829) .cfi_restore %r14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 830) pop %r13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 831) .cfi_restore %r13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 832) pop %r12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 833) .cfi_restore %r12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 834) pop %rbx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 835) .cfi_restore %rbx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 836) pop %rbp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 837) .cfi_restore %rbp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 838) .Lbase2_64_avx_epilogue:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 839) jmp .Ldo_avx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 840) .cfi_endproc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 841)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 842) .align 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 843) .Leven_avx:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 844) .cfi_startproc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 845) vmovd 4*0($ctx),$H0 # load hash value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 846) vmovd 4*1($ctx),$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 847) vmovd 4*2($ctx),$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 848) vmovd 4*3($ctx),$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 849) vmovd 4*4($ctx),$H4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 850)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 851) .Ldo_avx:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 852) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 853) $code.=<<___ if (!$win64);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 854) lea 8(%rsp),%r10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 855) .cfi_def_cfa_register %r10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 856) and \$-32,%rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 857) sub \$-8,%rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 858) lea -0x58(%rsp),%r11
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 859) sub \$0x178,%rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 860) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 861) $code.=<<___ if ($win64);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 862) lea -0xf8(%rsp),%r11
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 863) sub \$0x218,%rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 864) vmovdqa %xmm6,0x50(%r11)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 865) vmovdqa %xmm7,0x60(%r11)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 866) vmovdqa %xmm8,0x70(%r11)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 867) vmovdqa %xmm9,0x80(%r11)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 868) vmovdqa %xmm10,0x90(%r11)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 869) vmovdqa %xmm11,0xa0(%r11)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 870) vmovdqa %xmm12,0xb0(%r11)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 871) vmovdqa %xmm13,0xc0(%r11)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 872) vmovdqa %xmm14,0xd0(%r11)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 873) vmovdqa %xmm15,0xe0(%r11)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 874) .Ldo_avx_body:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 875) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 876) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 877) sub \$64,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 878) lea -32($inp),%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 879) cmovc %rax,$inp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 880)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 881) vmovdqu `16*3`($ctx),$D4 # preload r0^2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 882) lea `16*3+64`($ctx),$ctx # size optimization
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 883) lea .Lconst(%rip),%rcx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 884)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 885) ################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 886) # load input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 887) vmovdqu 16*2($inp),$T0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 888) vmovdqu 16*3($inp),$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 889) vmovdqa 64(%rcx),$MASK # .Lmask26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 890)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 891) vpsrldq \$6,$T0,$T2 # splat input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 892) vpsrldq \$6,$T1,$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 893) vpunpckhqdq $T1,$T0,$T4 # 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 894) vpunpcklqdq $T1,$T0,$T0 # 0:1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 895) vpunpcklqdq $T3,$T2,$T3 # 2:3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 896)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 897) vpsrlq \$40,$T4,$T4 # 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 898) vpsrlq \$26,$T0,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 899) vpand $MASK,$T0,$T0 # 0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 900) vpsrlq \$4,$T3,$T2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 901) vpand $MASK,$T1,$T1 # 1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 902) vpsrlq \$30,$T3,$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 903) vpand $MASK,$T2,$T2 # 2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 904) vpand $MASK,$T3,$T3 # 3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 905) vpor 32(%rcx),$T4,$T4 # padbit, yes, always
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 906)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 907) jbe .Lskip_loop_avx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 908)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 909) # expand and copy pre-calculated table to stack
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 910) vmovdqu `16*1-64`($ctx),$D1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 911) vmovdqu `16*2-64`($ctx),$D2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 912) vpshufd \$0xEE,$D4,$D3 # 34xx -> 3434
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 913) vpshufd \$0x44,$D4,$D0 # xx12 -> 1212
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 914) vmovdqa $D3,-0x90(%r11)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 915) vmovdqa $D0,0x00(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 916) vpshufd \$0xEE,$D1,$D4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 917) vmovdqu `16*3-64`($ctx),$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 918) vpshufd \$0x44,$D1,$D1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 919) vmovdqa $D4,-0x80(%r11)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 920) vmovdqa $D1,0x10(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 921) vpshufd \$0xEE,$D2,$D3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 922) vmovdqu `16*4-64`($ctx),$D1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 923) vpshufd \$0x44,$D2,$D2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 924) vmovdqa $D3,-0x70(%r11)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 925) vmovdqa $D2,0x20(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 926) vpshufd \$0xEE,$D0,$D4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 927) vmovdqu `16*5-64`($ctx),$D2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 928) vpshufd \$0x44,$D0,$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 929) vmovdqa $D4,-0x60(%r11)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 930) vmovdqa $D0,0x30(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 931) vpshufd \$0xEE,$D1,$D3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 932) vmovdqu `16*6-64`($ctx),$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 933) vpshufd \$0x44,$D1,$D1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 934) vmovdqa $D3,-0x50(%r11)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 935) vmovdqa $D1,0x40(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 936) vpshufd \$0xEE,$D2,$D4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 937) vmovdqu `16*7-64`($ctx),$D1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 938) vpshufd \$0x44,$D2,$D2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 939) vmovdqa $D4,-0x40(%r11)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 940) vmovdqa $D2,0x50(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 941) vpshufd \$0xEE,$D0,$D3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 942) vmovdqu `16*8-64`($ctx),$D2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 943) vpshufd \$0x44,$D0,$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 944) vmovdqa $D3,-0x30(%r11)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 945) vmovdqa $D0,0x60(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 946) vpshufd \$0xEE,$D1,$D4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 947) vpshufd \$0x44,$D1,$D1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 948) vmovdqa $D4,-0x20(%r11)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 949) vmovdqa $D1,0x70(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 950) vpshufd \$0xEE,$D2,$D3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 951) vmovdqa 0x00(%rsp),$D4 # preload r0^2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 952) vpshufd \$0x44,$D2,$D2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 953) vmovdqa $D3,-0x10(%r11)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 954) vmovdqa $D2,0x80(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 955)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 956) jmp .Loop_avx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 957)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 958) .align 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 959) .Loop_avx:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 960) ################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 961) # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 962) # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 963) # \___________________/
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 964) # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 965) # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 966) # \___________________/ \____________________/
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 967) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 968) # Note that we start with inp[2:3]*r^2. This is because it
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 969) # doesn't depend on reduction in previous iteration.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 970) ################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 971) # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 972) # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 973) # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 974) # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 975) # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 976) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 977) # though note that $Tx and $Hx are "reversed" in this section,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 978) # and $D4 is preloaded with r0^2...
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 979)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 980) vpmuludq $T0,$D4,$D0 # d0 = h0*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 981) vpmuludq $T1,$D4,$D1 # d1 = h1*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 982) vmovdqa $H2,0x20(%r11) # offload hash
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 983) vpmuludq $T2,$D4,$D2 # d2 = h2*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 984) vmovdqa 0x10(%rsp),$H2 # r1^2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 985) vpmuludq $T3,$D4,$D3 # d3 = h3*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 986) vpmuludq $T4,$D4,$D4 # d4 = h4*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 987)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 988) vmovdqa $H0,0x00(%r11) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 989) vpmuludq 0x20(%rsp),$T4,$H0 # h4*s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 990) vmovdqa $H1,0x10(%r11) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 991) vpmuludq $T3,$H2,$H1 # h3*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 992) vpaddq $H0,$D0,$D0 # d0 += h4*s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 993) vpaddq $H1,$D4,$D4 # d4 += h3*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 994) vmovdqa $H3,0x30(%r11) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 995) vpmuludq $T2,$H2,$H0 # h2*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 996) vpmuludq $T1,$H2,$H1 # h1*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 997) vpaddq $H0,$D3,$D3 # d3 += h2*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 998) vmovdqa 0x30(%rsp),$H3 # r2^2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 999) vpaddq $H1,$D2,$D2 # d2 += h1*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1000) vmovdqa $H4,0x40(%r11) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1001) vpmuludq $T0,$H2,$H2 # h0*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1002) vpmuludq $T2,$H3,$H0 # h2*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1003) vpaddq $H2,$D1,$D1 # d1 += h0*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1004)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1005) vmovdqa 0x40(%rsp),$H4 # s2^2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1006) vpaddq $H0,$D4,$D4 # d4 += h2*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1007) vpmuludq $T1,$H3,$H1 # h1*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1008) vpmuludq $T0,$H3,$H3 # h0*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1009) vpaddq $H1,$D3,$D3 # d3 += h1*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1010) vmovdqa 0x50(%rsp),$H2 # r3^2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1011) vpaddq $H3,$D2,$D2 # d2 += h0*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1012) vpmuludq $T4,$H4,$H0 # h4*s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1013) vpmuludq $T3,$H4,$H4 # h3*s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1014) vpaddq $H0,$D1,$D1 # d1 += h4*s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1015) vmovdqa 0x60(%rsp),$H3 # s3^2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1016) vpaddq $H4,$D0,$D0 # d0 += h3*s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1017)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1018) vmovdqa 0x80(%rsp),$H4 # s4^2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1019) vpmuludq $T1,$H2,$H1 # h1*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1020) vpmuludq $T0,$H2,$H2 # h0*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1021) vpaddq $H1,$D4,$D4 # d4 += h1*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1022) vpaddq $H2,$D3,$D3 # d3 += h0*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1023) vpmuludq $T4,$H3,$H0 # h4*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1024) vpmuludq $T3,$H3,$H1 # h3*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1025) vpaddq $H0,$D2,$D2 # d2 += h4*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1026) vmovdqu 16*0($inp),$H0 # load input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1027) vpaddq $H1,$D1,$D1 # d1 += h3*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1028) vpmuludq $T2,$H3,$H3 # h2*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1029) vpmuludq $T2,$H4,$T2 # h2*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1030) vpaddq $H3,$D0,$D0 # d0 += h2*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1031)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1032) vmovdqu 16*1($inp),$H1 #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1033) vpaddq $T2,$D1,$D1 # d1 += h2*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1034) vpmuludq $T3,$H4,$T3 # h3*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1035) vpmuludq $T4,$H4,$T4 # h4*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1036) vpsrldq \$6,$H0,$H2 # splat input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1037) vpaddq $T3,$D2,$D2 # d2 += h3*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1038) vpaddq $T4,$D3,$D3 # d3 += h4*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1039) vpsrldq \$6,$H1,$H3 #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1040) vpmuludq 0x70(%rsp),$T0,$T4 # h0*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1041) vpmuludq $T1,$H4,$T0 # h1*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1042) vpunpckhqdq $H1,$H0,$H4 # 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1043) vpaddq $T4,$D4,$D4 # d4 += h0*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1044) vmovdqa -0x90(%r11),$T4 # r0^4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1045) vpaddq $T0,$D0,$D0 # d0 += h1*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1046)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1047) vpunpcklqdq $H1,$H0,$H0 # 0:1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1048) vpunpcklqdq $H3,$H2,$H3 # 2:3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1049)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1050) #vpsrlq \$40,$H4,$H4 # 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1051) vpsrldq \$`40/8`,$H4,$H4 # 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1052) vpsrlq \$26,$H0,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1053) vpand $MASK,$H0,$H0 # 0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1054) vpsrlq \$4,$H3,$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1055) vpand $MASK,$H1,$H1 # 1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1056) vpand 0(%rcx),$H4,$H4 # .Lmask24
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1057) vpsrlq \$30,$H3,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1058) vpand $MASK,$H2,$H2 # 2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1059) vpand $MASK,$H3,$H3 # 3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1060) vpor 32(%rcx),$H4,$H4 # padbit, yes, always
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1061)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1062) vpaddq 0x00(%r11),$H0,$H0 # add hash value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1063) vpaddq 0x10(%r11),$H1,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1064) vpaddq 0x20(%r11),$H2,$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1065) vpaddq 0x30(%r11),$H3,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1066) vpaddq 0x40(%r11),$H4,$H4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1067)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1068) lea 16*2($inp),%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1069) lea 16*4($inp),$inp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1070) sub \$64,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1071) cmovc %rax,$inp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1072)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1073) ################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1074) # Now we accumulate (inp[0:1]+hash)*r^4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1075) ################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1076) # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1077) # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1078) # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1079) # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1080) # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1081)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1082) vpmuludq $H0,$T4,$T0 # h0*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1083) vpmuludq $H1,$T4,$T1 # h1*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1084) vpaddq $T0,$D0,$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1085) vpaddq $T1,$D1,$D1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1086) vmovdqa -0x80(%r11),$T2 # r1^4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1087) vpmuludq $H2,$T4,$T0 # h2*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1088) vpmuludq $H3,$T4,$T1 # h3*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1089) vpaddq $T0,$D2,$D2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1090) vpaddq $T1,$D3,$D3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1091) vpmuludq $H4,$T4,$T4 # h4*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1092) vpmuludq -0x70(%r11),$H4,$T0 # h4*s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1093) vpaddq $T4,$D4,$D4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1094)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1095) vpaddq $T0,$D0,$D0 # d0 += h4*s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1096) vpmuludq $H2,$T2,$T1 # h2*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1097) vpmuludq $H3,$T2,$T0 # h3*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1098) vpaddq $T1,$D3,$D3 # d3 += h2*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1099) vmovdqa -0x60(%r11),$T3 # r2^4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1100) vpaddq $T0,$D4,$D4 # d4 += h3*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1101) vpmuludq $H1,$T2,$T1 # h1*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1102) vpmuludq $H0,$T2,$T2 # h0*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1103) vpaddq $T1,$D2,$D2 # d2 += h1*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1104) vpaddq $T2,$D1,$D1 # d1 += h0*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1105)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1106) vmovdqa -0x50(%r11),$T4 # s2^4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1107) vpmuludq $H2,$T3,$T0 # h2*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1108) vpmuludq $H1,$T3,$T1 # h1*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1109) vpaddq $T0,$D4,$D4 # d4 += h2*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1110) vpaddq $T1,$D3,$D3 # d3 += h1*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1111) vmovdqa -0x40(%r11),$T2 # r3^4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1112) vpmuludq $H0,$T3,$T3 # h0*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1113) vpmuludq $H4,$T4,$T0 # h4*s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1114) vpaddq $T3,$D2,$D2 # d2 += h0*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1115) vpaddq $T0,$D1,$D1 # d1 += h4*s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1116) vmovdqa -0x30(%r11),$T3 # s3^4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1117) vpmuludq $H3,$T4,$T4 # h3*s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1118) vpmuludq $H1,$T2,$T1 # h1*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1119) vpaddq $T4,$D0,$D0 # d0 += h3*s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1120)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1121) vmovdqa -0x10(%r11),$T4 # s4^4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1122) vpaddq $T1,$D4,$D4 # d4 += h1*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1123) vpmuludq $H0,$T2,$T2 # h0*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1124) vpmuludq $H4,$T3,$T0 # h4*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1125) vpaddq $T2,$D3,$D3 # d3 += h0*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1126) vpaddq $T0,$D2,$D2 # d2 += h4*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1127) vmovdqu 16*2($inp),$T0 # load input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1128) vpmuludq $H3,$T3,$T2 # h3*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1129) vpmuludq $H2,$T3,$T3 # h2*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1130) vpaddq $T2,$D1,$D1 # d1 += h3*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1131) vmovdqu 16*3($inp),$T1 #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1132) vpaddq $T3,$D0,$D0 # d0 += h2*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1133)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1134) vpmuludq $H2,$T4,$H2 # h2*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1135) vpmuludq $H3,$T4,$H3 # h3*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1136) vpsrldq \$6,$T0,$T2 # splat input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1137) vpaddq $H2,$D1,$D1 # d1 += h2*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1138) vpmuludq $H4,$T4,$H4 # h4*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1139) vpsrldq \$6,$T1,$T3 #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1140) vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1141) vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1142) vpmuludq -0x20(%r11),$H0,$H4 # h0*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1143) vpmuludq $H1,$T4,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1144) vpunpckhqdq $T1,$T0,$T4 # 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1145) vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1146) vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1147)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1148) vpunpcklqdq $T1,$T0,$T0 # 0:1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1149) vpunpcklqdq $T3,$T2,$T3 # 2:3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1150)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1151) #vpsrlq \$40,$T4,$T4 # 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1152) vpsrldq \$`40/8`,$T4,$T4 # 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1153) vpsrlq \$26,$T0,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1154) vmovdqa 0x00(%rsp),$D4 # preload r0^2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1155) vpand $MASK,$T0,$T0 # 0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1156) vpsrlq \$4,$T3,$T2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1157) vpand $MASK,$T1,$T1 # 1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1158) vpand 0(%rcx),$T4,$T4 # .Lmask24
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1159) vpsrlq \$30,$T3,$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1160) vpand $MASK,$T2,$T2 # 2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1161) vpand $MASK,$T3,$T3 # 3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1162) vpor 32(%rcx),$T4,$T4 # padbit, yes, always
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1163)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1164) ################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1165) # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1166) # and P. Schwabe
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1167)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1168) vpsrlq \$26,$H3,$D3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1169) vpand $MASK,$H3,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1170) vpaddq $D3,$H4,$H4 # h3 -> h4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1171)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1172) vpsrlq \$26,$H0,$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1173) vpand $MASK,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1174) vpaddq $D0,$D1,$H1 # h0 -> h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1175)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1176) vpsrlq \$26,$H4,$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1177) vpand $MASK,$H4,$H4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1178)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1179) vpsrlq \$26,$H1,$D1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1180) vpand $MASK,$H1,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1181) vpaddq $D1,$H2,$H2 # h1 -> h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1182)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1183) vpaddq $D0,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1184) vpsllq \$2,$D0,$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1185) vpaddq $D0,$H0,$H0 # h4 -> h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1186)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1187) vpsrlq \$26,$H2,$D2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1188) vpand $MASK,$H2,$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1189) vpaddq $D2,$H3,$H3 # h2 -> h3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1190)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1191) vpsrlq \$26,$H0,$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1192) vpand $MASK,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1193) vpaddq $D0,$H1,$H1 # h0 -> h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1194)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1195) vpsrlq \$26,$H3,$D3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1196) vpand $MASK,$H3,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1197) vpaddq $D3,$H4,$H4 # h3 -> h4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1198)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1199) ja .Loop_avx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1200)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1201) .Lskip_loop_avx:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1202) ################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1203) # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1204)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1205) vpshufd \$0x10,$D4,$D4 # r0^n, xx12 -> x1x2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1206) add \$32,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1207) jnz .Long_tail_avx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1208)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1209) vpaddq $H2,$T2,$T2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1210) vpaddq $H0,$T0,$T0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1211) vpaddq $H1,$T1,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1212) vpaddq $H3,$T3,$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1213) vpaddq $H4,$T4,$T4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1214)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1215) .Long_tail_avx:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1216) vmovdqa $H2,0x20(%r11)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1217) vmovdqa $H0,0x00(%r11)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1218) vmovdqa $H1,0x10(%r11)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1219) vmovdqa $H3,0x30(%r11)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1220) vmovdqa $H4,0x40(%r11)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1221)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1222) # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1223) # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1224) # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1225) # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1226) # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1227)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1228) vpmuludq $T2,$D4,$D2 # d2 = h2*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1229) vpmuludq $T0,$D4,$D0 # d0 = h0*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1230) vpshufd \$0x10,`16*1-64`($ctx),$H2 # r1^n
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1231) vpmuludq $T1,$D4,$D1 # d1 = h1*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1232) vpmuludq $T3,$D4,$D3 # d3 = h3*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1233) vpmuludq $T4,$D4,$D4 # d4 = h4*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1234)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1235) vpmuludq $T3,$H2,$H0 # h3*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1236) vpaddq $H0,$D4,$D4 # d4 += h3*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1237) vpshufd \$0x10,`16*2-64`($ctx),$H3 # s1^n
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1238) vpmuludq $T2,$H2,$H1 # h2*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1239) vpaddq $H1,$D3,$D3 # d3 += h2*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1240) vpshufd \$0x10,`16*3-64`($ctx),$H4 # r2^n
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1241) vpmuludq $T1,$H2,$H0 # h1*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1242) vpaddq $H0,$D2,$D2 # d2 += h1*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1243) vpmuludq $T0,$H2,$H2 # h0*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1244) vpaddq $H2,$D1,$D1 # d1 += h0*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1245) vpmuludq $T4,$H3,$H3 # h4*s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1246) vpaddq $H3,$D0,$D0 # d0 += h4*s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1247)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1248) vpshufd \$0x10,`16*4-64`($ctx),$H2 # s2^n
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1249) vpmuludq $T2,$H4,$H1 # h2*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1250) vpaddq $H1,$D4,$D4 # d4 += h2*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1251) vpmuludq $T1,$H4,$H0 # h1*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1252) vpaddq $H0,$D3,$D3 # d3 += h1*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1253) vpshufd \$0x10,`16*5-64`($ctx),$H3 # r3^n
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1254) vpmuludq $T0,$H4,$H4 # h0*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1255) vpaddq $H4,$D2,$D2 # d2 += h0*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1256) vpmuludq $T4,$H2,$H1 # h4*s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1257) vpaddq $H1,$D1,$D1 # d1 += h4*s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1258) vpshufd \$0x10,`16*6-64`($ctx),$H4 # s3^n
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1259) vpmuludq $T3,$H2,$H2 # h3*s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1260) vpaddq $H2,$D0,$D0 # d0 += h3*s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1261)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1262) vpmuludq $T1,$H3,$H0 # h1*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1263) vpaddq $H0,$D4,$D4 # d4 += h1*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1264) vpmuludq $T0,$H3,$H3 # h0*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1265) vpaddq $H3,$D3,$D3 # d3 += h0*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1266) vpshufd \$0x10,`16*7-64`($ctx),$H2 # r4^n
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1267) vpmuludq $T4,$H4,$H1 # h4*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1268) vpaddq $H1,$D2,$D2 # d2 += h4*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1269) vpshufd \$0x10,`16*8-64`($ctx),$H3 # s4^n
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1270) vpmuludq $T3,$H4,$H0 # h3*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1271) vpaddq $H0,$D1,$D1 # d1 += h3*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1272) vpmuludq $T2,$H4,$H4 # h2*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1273) vpaddq $H4,$D0,$D0 # d0 += h2*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1274)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1275) vpmuludq $T0,$H2,$H2 # h0*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1276) vpaddq $H2,$D4,$D4 # h4 = d4 + h0*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1277) vpmuludq $T4,$H3,$H1 # h4*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1278) vpaddq $H1,$D3,$D3 # h3 = d3 + h4*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1279) vpmuludq $T3,$H3,$H0 # h3*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1280) vpaddq $H0,$D2,$D2 # h2 = d2 + h3*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1281) vpmuludq $T2,$H3,$H1 # h2*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1282) vpaddq $H1,$D1,$D1 # h1 = d1 + h2*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1283) vpmuludq $T1,$H3,$H3 # h1*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1284) vpaddq $H3,$D0,$D0 # h0 = d0 + h1*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1285)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1286) jz .Lshort_tail_avx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1287)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1288) vmovdqu 16*0($inp),$H0 # load input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1289) vmovdqu 16*1($inp),$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1290)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1291) vpsrldq \$6,$H0,$H2 # splat input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1292) vpsrldq \$6,$H1,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1293) vpunpckhqdq $H1,$H0,$H4 # 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1294) vpunpcklqdq $H1,$H0,$H0 # 0:1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1295) vpunpcklqdq $H3,$H2,$H3 # 2:3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1296)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1297) vpsrlq \$40,$H4,$H4 # 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1298) vpsrlq \$26,$H0,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1299) vpand $MASK,$H0,$H0 # 0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1300) vpsrlq \$4,$H3,$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1301) vpand $MASK,$H1,$H1 # 1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1302) vpsrlq \$30,$H3,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1303) vpand $MASK,$H2,$H2 # 2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1304) vpand $MASK,$H3,$H3 # 3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1305) vpor 32(%rcx),$H4,$H4 # padbit, yes, always
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1306)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1307) vpshufd \$0x32,`16*0-64`($ctx),$T4 # r0^n, 34xx -> x3x4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1308) vpaddq 0x00(%r11),$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1309) vpaddq 0x10(%r11),$H1,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1310) vpaddq 0x20(%r11),$H2,$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1311) vpaddq 0x30(%r11),$H3,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1312) vpaddq 0x40(%r11),$H4,$H4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1313)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1314) ################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1315) # multiply (inp[0:1]+hash) by r^4:r^3 and accumulate
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1316)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1317) vpmuludq $H0,$T4,$T0 # h0*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1318) vpaddq $T0,$D0,$D0 # d0 += h0*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1319) vpmuludq $H1,$T4,$T1 # h1*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1320) vpaddq $T1,$D1,$D1 # d1 += h1*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1321) vpmuludq $H2,$T4,$T0 # h2*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1322) vpaddq $T0,$D2,$D2 # d2 += h2*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1323) vpshufd \$0x32,`16*1-64`($ctx),$T2 # r1^n
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1324) vpmuludq $H3,$T4,$T1 # h3*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1325) vpaddq $T1,$D3,$D3 # d3 += h3*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1326) vpmuludq $H4,$T4,$T4 # h4*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1327) vpaddq $T4,$D4,$D4 # d4 += h4*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1328)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1329) vpmuludq $H3,$T2,$T0 # h3*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1330) vpaddq $T0,$D4,$D4 # d4 += h3*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1331) vpshufd \$0x32,`16*2-64`($ctx),$T3 # s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1332) vpmuludq $H2,$T2,$T1 # h2*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1333) vpaddq $T1,$D3,$D3 # d3 += h2*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1334) vpshufd \$0x32,`16*3-64`($ctx),$T4 # r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1335) vpmuludq $H1,$T2,$T0 # h1*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1336) vpaddq $T0,$D2,$D2 # d2 += h1*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1337) vpmuludq $H0,$T2,$T2 # h0*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1338) vpaddq $T2,$D1,$D1 # d1 += h0*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1339) vpmuludq $H4,$T3,$T3 # h4*s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1340) vpaddq $T3,$D0,$D0 # d0 += h4*s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1341)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1342) vpshufd \$0x32,`16*4-64`($ctx),$T2 # s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1343) vpmuludq $H2,$T4,$T1 # h2*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1344) vpaddq $T1,$D4,$D4 # d4 += h2*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1345) vpmuludq $H1,$T4,$T0 # h1*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1346) vpaddq $T0,$D3,$D3 # d3 += h1*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1347) vpshufd \$0x32,`16*5-64`($ctx),$T3 # r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1348) vpmuludq $H0,$T4,$T4 # h0*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1349) vpaddq $T4,$D2,$D2 # d2 += h0*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1350) vpmuludq $H4,$T2,$T1 # h4*s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1351) vpaddq $T1,$D1,$D1 # d1 += h4*s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1352) vpshufd \$0x32,`16*6-64`($ctx),$T4 # s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1353) vpmuludq $H3,$T2,$T2 # h3*s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1354) vpaddq $T2,$D0,$D0 # d0 += h3*s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1355)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1356) vpmuludq $H1,$T3,$T0 # h1*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1357) vpaddq $T0,$D4,$D4 # d4 += h1*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1358) vpmuludq $H0,$T3,$T3 # h0*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1359) vpaddq $T3,$D3,$D3 # d3 += h0*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1360) vpshufd \$0x32,`16*7-64`($ctx),$T2 # r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1361) vpmuludq $H4,$T4,$T1 # h4*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1362) vpaddq $T1,$D2,$D2 # d2 += h4*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1363) vpshufd \$0x32,`16*8-64`($ctx),$T3 # s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1364) vpmuludq $H3,$T4,$T0 # h3*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1365) vpaddq $T0,$D1,$D1 # d1 += h3*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1366) vpmuludq $H2,$T4,$T4 # h2*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1367) vpaddq $T4,$D0,$D0 # d0 += h2*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1368)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1369) vpmuludq $H0,$T2,$T2 # h0*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1370) vpaddq $T2,$D4,$D4 # d4 += h0*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1371) vpmuludq $H4,$T3,$T1 # h4*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1372) vpaddq $T1,$D3,$D3 # d3 += h4*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1373) vpmuludq $H3,$T3,$T0 # h3*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1374) vpaddq $T0,$D2,$D2 # d2 += h3*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1375) vpmuludq $H2,$T3,$T1 # h2*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1376) vpaddq $T1,$D1,$D1 # d1 += h2*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1377) vpmuludq $H1,$T3,$T3 # h1*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1378) vpaddq $T3,$D0,$D0 # d0 += h1*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1379)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1380) .Lshort_tail_avx:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1381) ################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1382) # horizontal addition
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1383)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1384) vpsrldq \$8,$D4,$T4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1385) vpsrldq \$8,$D3,$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1386) vpsrldq \$8,$D1,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1387) vpsrldq \$8,$D0,$T0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1388) vpsrldq \$8,$D2,$T2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1389) vpaddq $T3,$D3,$D3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1390) vpaddq $T4,$D4,$D4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1391) vpaddq $T0,$D0,$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1392) vpaddq $T1,$D1,$D1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1393) vpaddq $T2,$D2,$D2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1394)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1395) ################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1396) # lazy reduction
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1397)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1398) vpsrlq \$26,$D3,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1399) vpand $MASK,$D3,$D3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1400) vpaddq $H3,$D4,$D4 # h3 -> h4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1401)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1402) vpsrlq \$26,$D0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1403) vpand $MASK,$D0,$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1404) vpaddq $H0,$D1,$D1 # h0 -> h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1405)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1406) vpsrlq \$26,$D4,$H4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1407) vpand $MASK,$D4,$D4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1408)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1409) vpsrlq \$26,$D1,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1410) vpand $MASK,$D1,$D1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1411) vpaddq $H1,$D2,$D2 # h1 -> h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1412)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1413) vpaddq $H4,$D0,$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1414) vpsllq \$2,$H4,$H4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1415) vpaddq $H4,$D0,$D0 # h4 -> h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1416)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1417) vpsrlq \$26,$D2,$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1418) vpand $MASK,$D2,$D2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1419) vpaddq $H2,$D3,$D3 # h2 -> h3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1420)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1421) vpsrlq \$26,$D0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1422) vpand $MASK,$D0,$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1423) vpaddq $H0,$D1,$D1 # h0 -> h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1424)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1425) vpsrlq \$26,$D3,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1426) vpand $MASK,$D3,$D3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1427) vpaddq $H3,$D4,$D4 # h3 -> h4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1428)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1429) vmovd $D0,`4*0-48-64`($ctx) # save partially reduced
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1430) vmovd $D1,`4*1-48-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1431) vmovd $D2,`4*2-48-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1432) vmovd $D3,`4*3-48-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1433) vmovd $D4,`4*4-48-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1434) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1435) $code.=<<___ if ($win64);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1436) vmovdqa 0x50(%r11),%xmm6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1437) vmovdqa 0x60(%r11),%xmm7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1438) vmovdqa 0x70(%r11),%xmm8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1439) vmovdqa 0x80(%r11),%xmm9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1440) vmovdqa 0x90(%r11),%xmm10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1441) vmovdqa 0xa0(%r11),%xmm11
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1442) vmovdqa 0xb0(%r11),%xmm12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1443) vmovdqa 0xc0(%r11),%xmm13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1444) vmovdqa 0xd0(%r11),%xmm14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1445) vmovdqa 0xe0(%r11),%xmm15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1446) lea 0xf8(%r11),%rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1447) .Ldo_avx_epilogue:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1448) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1449) $code.=<<___ if (!$win64);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1450) lea -8(%r10),%rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1451) .cfi_def_cfa_register %rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1452) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1453) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1454) vzeroupper
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1455) ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1456) .cfi_endproc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1457) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1458) &end_function("poly1305_blocks_avx");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1459)
# poly1305_emit_avx: appends to $code the assembly that finalizes the hash
# and writes the 16-byte tag.
#
# The function is opened/closed with the declare_function()/end_function()
# helpers, which are defined elsewhere in this file (the 32 and 3 arguments
# are presumably alignment and argument count -- confirm against
# declare_function).  $ctx, $nonce and $mac are Perl variables holding
# register names; they are assigned outside this block.
#
# Generated code, step by step:
#  1. If the 32-bit flag at 20($ctx) is zero, jump to .Lemit (a label
#     defined elsewhere in the file); per the inline comment this flag is
#     "is_base2_26".
#  2. Otherwise load five 32-bit limbs from 0..16($ctx) and recombine them
#     from base 2^26 into three words: %r8 (h0), %r9 (h1), %r10 (h2).
#  3. The value may be only partially reduced, so fold the bits of h2 above
#     bit 1 back into h0: (h2 >> 2) + (h2 & ~3) equals 5*(h2 >> 2), which is
#     added to h0 with carry propagation while h2 keeps its low two bits.
#  4. Compute h+5 and test whether it carried past bit 130 (shr \$2 of h2
#     leaves a non-zero value); if so, cmovnz selects the low 128 bits of
#     h+5 instead of h.
#  5. Add the 128-bit value at 0/8($nonce) and store the result to
#     0/8($mac).
#
# The instruction order matters: the adc/cmovnz instructions consume flags
# produced by the preceding add/shr, so do not reorder lines in the heredoc.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1460) &declare_function("poly1305_emit_avx", 32, 3);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1461) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1462) cmpl \$0,20($ctx) # is_base2_26?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1463) je .Lemit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1464)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1465) mov 0($ctx),%eax # load hash value base 2^26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1466) mov 4($ctx),%ecx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1467) mov 8($ctx),%r8d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1468) mov 12($ctx),%r11d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1469) mov 16($ctx),%r10d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1470)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1471) shl \$26,%rcx # base 2^26 -> base 2^64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1472) mov %r8,%r9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1473) shl \$52,%r8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1474) add %rcx,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1475) shr \$12,%r9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1476) add %rax,%r8 # h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1477) adc \$0,%r9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1478)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1479) shl \$14,%r11
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1480) mov %r10,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1481) shr \$24,%r10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1482) add %r11,%r9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1483) shl \$40,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1484) add %rax,%r9 # h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1485) adc \$0,%r10 # h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1486)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1487) mov %r10,%rax # could be partially reduced, so reduce
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1488) mov %r10,%rcx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1489) and \$3,%r10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1490) shr \$2,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1491) and \$-4,%rcx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1492) add %rcx,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1493) add %rax,%r8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1494) adc \$0,%r9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1495) adc \$0,%r10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1496)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1497) mov %r8,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1498) add \$5,%r8 # compare to modulus
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1499) mov %r9,%rcx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1500) adc \$0,%r9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1501) adc \$0,%r10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1502) shr \$2,%r10 # did 130-bit value overflow?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1503) cmovnz %r8,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1504) cmovnz %r9,%rcx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1505)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1506) add 0($nonce),%rax # accumulate nonce
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1507) adc 8($nonce),%rcx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1508) mov %rax,0($mac) # write result
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1509) mov %rcx,8($mac)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1510)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1511) ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1512) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1513) &end_function("poly1305_emit_avx");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1514)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1515) if ($avx>1) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1516)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1517) my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1518) map("%ymm$_",(0..15));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1519) my $S4=$MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1520)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1521) sub poly1305_blocks_avxN {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1522) my ($avx512) = @_;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1523) my $suffix = $avx512 ? "_avx512" : "";
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1524) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1525) .cfi_startproc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1526) mov 20($ctx),%r8d # is_base2_26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1527) cmp \$128,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1528) jae .Lblocks_avx2$suffix
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1529) test %r8d,%r8d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1530) jz .Lblocks
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1531)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1532) .Lblocks_avx2$suffix:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1533) and \$-16,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1534) jz .Lno_data_avx2$suffix
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1535)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1536) vzeroupper
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1537)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1538) test %r8d,%r8d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1539) jz .Lbase2_64_avx2$suffix
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1540)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1541) test \$63,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1542) jz .Leven_avx2$suffix
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1543)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1544) push %rbp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1545) .cfi_push %rbp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1546) mov %rsp,%rbp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1547) push %rbx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1548) .cfi_push %rbx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1549) push %r12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1550) .cfi_push %r12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1551) push %r13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1552) .cfi_push %r13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1553) push %r14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1554) .cfi_push %r14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1555) push %r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1556) .cfi_push %r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1557) .Lblocks_avx2_body$suffix:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1558)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1559) mov $len,%r15 # reassign $len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1560)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1561) mov 0($ctx),$d1 # load hash value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1562) mov 8($ctx),$d2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1563) mov 16($ctx),$h2#d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1564)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1565) mov 24($ctx),$r0 # load r
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1566) mov 32($ctx),$s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1567)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1568) ################################# base 2^26 -> base 2^64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1569) mov $d1#d,$h0#d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1570) and \$`-1*(1<<31)`,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1571) mov $d2,$r1 # borrow $r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1572) mov $d2#d,$h1#d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1573) and \$`-1*(1<<31)`,$d2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1574)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1575) shr \$6,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1576) shl \$52,$r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1577) add $d1,$h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1578) shr \$12,$h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1579) shr \$18,$d2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1580) add $r1,$h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1581) adc $d2,$h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1582)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1583) mov $h2,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1584) shl \$40,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1585) shr \$24,$h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1586) add $d1,$h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1587) adc \$0,$h2 # can be partially reduced...
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1588)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1589) mov \$-4,$d2 # ... so reduce
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1590) mov $h2,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1591) and $h2,$d2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1592) shr \$2,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1593) and \$3,$h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1594) add $d2,$d1 # =*5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1595) add $d1,$h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1596) adc \$0,$h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1597) adc \$0,$h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1598)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1599) mov $s1,$r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1600) mov $s1,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1601) shr \$2,$s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1602) add $r1,$s1 # s1 = r1 + (r1 >> 2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1603)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1604) .Lbase2_26_pre_avx2$suffix:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1605) add 0($inp),$h0 # accumulate input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1606) adc 8($inp),$h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1607) lea 16($inp),$inp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1608) adc $padbit,$h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1609) sub \$16,%r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1610)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1611) call __poly1305_block
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1612) mov $r1,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1613)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1614) test \$63,%r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1615) jnz .Lbase2_26_pre_avx2$suffix
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1616)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1617) test $padbit,$padbit # if $padbit is zero,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1618) jz .Lstore_base2_64_avx2$suffix # store hash in base 2^64 format
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1619)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1620) ################################# base 2^64 -> base 2^26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1621) mov $h0,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1622) mov $h0,%rdx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1623) shr \$52,$h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1624) mov $h1,$r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1625) mov $h1,$r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1626) shr \$26,%rdx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1627) and \$0x3ffffff,%rax # h[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1628) shl \$12,$r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1629) and \$0x3ffffff,%rdx # h[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1630) shr \$14,$h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1631) or $r0,$h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1632) shl \$24,$h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1633) and \$0x3ffffff,$h0 # h[2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1634) shr \$40,$r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1635) and \$0x3ffffff,$h1 # h[3]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1636) or $r1,$h2 # h[4]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1637)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1638) test %r15,%r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1639) jz .Lstore_base2_26_avx2$suffix
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1640)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1641) vmovd %rax#d,%x#$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1642) vmovd %rdx#d,%x#$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1643) vmovd $h0#d,%x#$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1644) vmovd $h1#d,%x#$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1645) vmovd $h2#d,%x#$H4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1646) jmp .Lproceed_avx2$suffix
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1647)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1648) .align 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1649) .Lstore_base2_64_avx2$suffix:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1650) mov $h0,0($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1651) mov $h1,8($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1652) mov $h2,16($ctx) # note that is_base2_26 is zeroed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1653) jmp .Ldone_avx2$suffix
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1654)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1655) .align 16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1656) .Lstore_base2_26_avx2$suffix:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1657) mov %rax#d,0($ctx) # store hash value base 2^26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1658) mov %rdx#d,4($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1659) mov $h0#d,8($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1660) mov $h1#d,12($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1661) mov $h2#d,16($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1662) .align 16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1663) .Ldone_avx2$suffix:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1664) pop %r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1665) .cfi_restore %r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1666) pop %r14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1667) .cfi_restore %r14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1668) pop %r13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1669) .cfi_restore %r13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1670) pop %r12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1671) .cfi_restore %r12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1672) pop %rbx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1673) .cfi_restore %rbx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1674) pop %rbp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1675) .cfi_restore %rbp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1676) .Lno_data_avx2$suffix:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1677) .Lblocks_avx2_epilogue$suffix:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1678) ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1679) .cfi_endproc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1680)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1681) .align 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1682) .Lbase2_64_avx2$suffix:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1683) .cfi_startproc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1684) push %rbp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1685) .cfi_push %rbp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1686) mov %rsp,%rbp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1687) push %rbx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1688) .cfi_push %rbx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1689) push %r12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1690) .cfi_push %r12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1691) push %r13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1692) .cfi_push %r13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1693) push %r14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1694) .cfi_push %r14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1695) push %r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1696) .cfi_push %r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1697) .Lbase2_64_avx2_body$suffix:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1698)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1699) mov $len,%r15 # reassign $len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1700)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1701) mov 24($ctx),$r0 # load r
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1702) mov 32($ctx),$s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1703)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1704) mov 0($ctx),$h0 # load hash value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1705) mov 8($ctx),$h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1706) mov 16($ctx),$h2#d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1707)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1708) mov $s1,$r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1709) mov $s1,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1710) shr \$2,$s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1711) add $r1,$s1 # s1 = r1 + (r1 >> 2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1712)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1713) test \$63,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1714) jz .Linit_avx2$suffix
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1715)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1716) .Lbase2_64_pre_avx2$suffix:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1717) add 0($inp),$h0 # accumulate input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1718) adc 8($inp),$h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1719) lea 16($inp),$inp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1720) adc $padbit,$h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1721) sub \$16,%r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1722)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1723) call __poly1305_block
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1724) mov $r1,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1725)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1726) test \$63,%r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1727) jnz .Lbase2_64_pre_avx2$suffix
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1728)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1729) .Linit_avx2$suffix:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1730) ################################# base 2^64 -> base 2^26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1731) mov $h0,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1732) mov $h0,%rdx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1733) shr \$52,$h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1734) mov $h1,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1735) mov $h1,$d2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1736) shr \$26,%rdx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1737) and \$0x3ffffff,%rax # h[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1738) shl \$12,$d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1739) and \$0x3ffffff,%rdx # h[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1740) shr \$14,$h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1741) or $d1,$h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1742) shl \$24,$h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1743) and \$0x3ffffff,$h0 # h[2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1744) shr \$40,$d2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1745) and \$0x3ffffff,$h1 # h[3]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1746) or $d2,$h2 # h[4]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1747)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1748) vmovd %rax#d,%x#$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1749) vmovd %rdx#d,%x#$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1750) vmovd $h0#d,%x#$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1751) vmovd $h1#d,%x#$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1752) vmovd $h2#d,%x#$H4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1753) movl \$1,20($ctx) # set is_base2_26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1754)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1755) call __poly1305_init_avx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1756)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1757) .Lproceed_avx2$suffix:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1758) mov %r15,$len # restore $len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1759) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1760) $code.=<<___ if (!$kernel);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1761) mov OPENSSL_ia32cap_P+8(%rip),%r9d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1762) mov \$`(1<<31|1<<30|1<<16)`,%r11d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1763) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1764) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1765) pop %r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1766) .cfi_restore %r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1767) pop %r14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1768) .cfi_restore %r14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1769) pop %r13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1770) .cfi_restore %r13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1771) pop %r12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1772) .cfi_restore %r12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1773) pop %rbx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1774) .cfi_restore %rbx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1775) pop %rbp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1776) .cfi_restore %rbp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1777) .Lbase2_64_avx2_epilogue$suffix:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1778) jmp .Ldo_avx2$suffix
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1779) .cfi_endproc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1780)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1781) .align 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1782) .Leven_avx2$suffix:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1783) .cfi_startproc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1784) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1785) $code.=<<___ if (!$kernel);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1786) mov OPENSSL_ia32cap_P+8(%rip),%r9d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1787) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1788) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1789) vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1790) vmovd 4*1($ctx),%x#$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1791) vmovd 4*2($ctx),%x#$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1792) vmovd 4*3($ctx),%x#$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1793) vmovd 4*4($ctx),%x#$H4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1794)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1795) .Ldo_avx2$suffix:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1796) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1797) $code.=<<___ if (!$kernel && $avx>2);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1798) cmp \$512,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1799) jb .Lskip_avx512
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1800) and %r11d,%r9d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1801) test \$`1<<16`,%r9d # check for AVX512F
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1802) jnz .Lblocks_avx512
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1803) .Lskip_avx512$suffix:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1804) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1805) $code.=<<___ if ($avx > 2 && $avx512 && $kernel);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1806) cmp \$512,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1807) jae .Lblocks_avx512
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1808) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1809) $code.=<<___ if (!$win64);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1810) lea 8(%rsp),%r10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1811) .cfi_def_cfa_register %r10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1812) sub \$0x128,%rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1813) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1814) $code.=<<___ if ($win64);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1815) lea 8(%rsp),%r10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1816) sub \$0x1c8,%rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1817) vmovdqa %xmm6,-0xb0(%r10)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1818) vmovdqa %xmm7,-0xa0(%r10)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1819) vmovdqa %xmm8,-0x90(%r10)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1820) vmovdqa %xmm9,-0x80(%r10)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1821) vmovdqa %xmm10,-0x70(%r10)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1822) vmovdqa %xmm11,-0x60(%r10)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1823) vmovdqa %xmm12,-0x50(%r10)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1824) vmovdqa %xmm13,-0x40(%r10)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1825) vmovdqa %xmm14,-0x30(%r10)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1826) vmovdqa %xmm15,-0x20(%r10)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1827) .Ldo_avx2_body$suffix:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1828) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1829) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1830) lea .Lconst(%rip),%rcx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1831) lea 48+64($ctx),$ctx # size optimization
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1832) vmovdqa 96(%rcx),$T0 # .Lpermd_avx2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1833)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1834) # expand and copy pre-calculated table to stack
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1835) vmovdqu `16*0-64`($ctx),%x#$T2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1836) and \$-512,%rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1837) vmovdqu `16*1-64`($ctx),%x#$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1838) vmovdqu `16*2-64`($ctx),%x#$T4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1839) vmovdqu `16*3-64`($ctx),%x#$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1840) vmovdqu `16*4-64`($ctx),%x#$D1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1841) vmovdqu `16*5-64`($ctx),%x#$D2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1842) lea 0x90(%rsp),%rax # size optimization
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1843) vmovdqu `16*6-64`($ctx),%x#$D3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1844) vpermd $T2,$T0,$T2 # 00003412 -> 14243444
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1845) vmovdqu `16*7-64`($ctx),%x#$D4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1846) vpermd $T3,$T0,$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1847) vmovdqu `16*8-64`($ctx),%x#$MASK
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1848) vpermd $T4,$T0,$T4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1849) vmovdqa $T2,0x00(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1850) vpermd $D0,$T0,$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1851) vmovdqa $T3,0x20-0x90(%rax)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1852) vpermd $D1,$T0,$D1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1853) vmovdqa $T4,0x40-0x90(%rax)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1854) vpermd $D2,$T0,$D2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1855) vmovdqa $D0,0x60-0x90(%rax)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1856) vpermd $D3,$T0,$D3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1857) vmovdqa $D1,0x80-0x90(%rax)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1858) vpermd $D4,$T0,$D4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1859) vmovdqa $D2,0xa0-0x90(%rax)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1860) vpermd $MASK,$T0,$MASK
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1861) vmovdqa $D3,0xc0-0x90(%rax)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1862) vmovdqa $D4,0xe0-0x90(%rax)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1863) vmovdqa $MASK,0x100-0x90(%rax)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1864) vmovdqa 64(%rcx),$MASK # .Lmask26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1865)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1866) ################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1867) # load input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1868) vmovdqu 16*0($inp),%x#$T0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1869) vmovdqu 16*1($inp),%x#$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1870) vinserti128 \$1,16*2($inp),$T0,$T0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1871) vinserti128 \$1,16*3($inp),$T1,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1872) lea 16*4($inp),$inp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1873)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1874) vpsrldq \$6,$T0,$T2 # splat input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1875) vpsrldq \$6,$T1,$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1876) vpunpckhqdq $T1,$T0,$T4 # 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1877) vpunpcklqdq $T3,$T2,$T2 # 2:3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1878) vpunpcklqdq $T1,$T0,$T0 # 0:1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1879)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1880) vpsrlq \$30,$T2,$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1881) vpsrlq \$4,$T2,$T2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1882) vpsrlq \$26,$T0,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1883) vpsrlq \$40,$T4,$T4 # 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1884) vpand $MASK,$T2,$T2 # 2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1885) vpand $MASK,$T0,$T0 # 0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1886) vpand $MASK,$T1,$T1 # 1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1887) vpand $MASK,$T3,$T3 # 3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1888) vpor 32(%rcx),$T4,$T4 # padbit, yes, always
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1889)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1890) vpaddq $H2,$T2,$H2 # accumulate input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1891) sub \$64,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1892) jz .Ltail_avx2$suffix
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1893) jmp .Loop_avx2$suffix
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1894)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1895) .align 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1896) .Loop_avx2$suffix:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1897) ################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1898) # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1899) # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1900) # ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1901) # ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1902) # \________/\__________/
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1903) ################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1904) #vpaddq $H2,$T2,$H2 # accumulate input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1905) vpaddq $H0,$T0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1906) vmovdqa `32*0`(%rsp),$T0 # r0^4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1907) vpaddq $H1,$T1,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1908) vmovdqa `32*1`(%rsp),$T1 # r1^4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1909) vpaddq $H3,$T3,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1910) vmovdqa `32*3`(%rsp),$T2 # r2^4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1911) vpaddq $H4,$T4,$H4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1912) vmovdqa `32*6-0x90`(%rax),$T3 # s3^4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1913) vmovdqa `32*8-0x90`(%rax),$S4 # s4^4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1914)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1915) # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1916) # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1917) # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1918) # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1919) # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1920) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1921) # however, as h2 is "chronologically" first one available pull
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1922) # corresponding operations up, so it's
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1923) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1924) # d4 = h2*r2 + h4*r0 + h3*r1 + h1*r3 + h0*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1925) # d3 = h2*r1 + h3*r0 + h1*r2 + h0*r3 + h4*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1926) # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1927) # d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1928) # d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2 + h1*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1929)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1930) vpmuludq $H2,$T0,$D2 # d2 = h2*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1931) vpmuludq $H2,$T1,$D3 # d3 = h2*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1932) vpmuludq $H2,$T2,$D4 # d4 = h2*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1933) vpmuludq $H2,$T3,$D0 # d0 = h2*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1934) vpmuludq $H2,$S4,$D1 # d1 = h2*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1935)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1936) vpmuludq $H0,$T1,$T4 # h0*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1937) vpmuludq $H1,$T1,$H2 # h1*r1, borrow $H2 as temp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1938) vpaddq $T4,$D1,$D1 # d1 += h0*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1939) vpaddq $H2,$D2,$D2 # d2 += h1*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1940) vpmuludq $H3,$T1,$T4 # h3*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1941) vpmuludq `32*2`(%rsp),$H4,$H2 # h4*s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1942) vpaddq $T4,$D4,$D4 # d4 += h3*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1943) vpaddq $H2,$D0,$D0 # d0 += h4*s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1944) vmovdqa `32*4-0x90`(%rax),$T1 # s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1945)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1946) vpmuludq $H0,$T0,$T4 # h0*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1947) vpmuludq $H1,$T0,$H2 # h1*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1948) vpaddq $T4,$D0,$D0 # d0 += h0*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1949) vpaddq $H2,$D1,$D1 # d1 += h1*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1950) vpmuludq $H3,$T0,$T4 # h3*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1951) vpmuludq $H4,$T0,$H2 # h4*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1952) vmovdqu 16*0($inp),%x#$T0 # load input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1953) vpaddq $T4,$D3,$D3 # d3 += h3*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1954) vpaddq $H2,$D4,$D4 # d4 += h4*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1955) vinserti128 \$1,16*2($inp),$T0,$T0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1956)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1957) vpmuludq $H3,$T1,$T4 # h3*s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1958) vpmuludq $H4,$T1,$H2 # h4*s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1959) vmovdqu 16*1($inp),%x#$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1960) vpaddq $T4,$D0,$D0 # d0 += h3*s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1961) vpaddq $H2,$D1,$D1 # d1 += h4*s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1962) vmovdqa `32*5-0x90`(%rax),$H2 # r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1963) vpmuludq $H1,$T2,$T4 # h1*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1964) vpmuludq $H0,$T2,$T2 # h0*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1965) vpaddq $T4,$D3,$D3 # d3 += h1*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1966) vpaddq $T2,$D2,$D2 # d2 += h0*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1967) vinserti128 \$1,16*3($inp),$T1,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1968) lea 16*4($inp),$inp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1969)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1970) vpmuludq $H1,$H2,$T4 # h1*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1971) vpmuludq $H0,$H2,$H2 # h0*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1972) vpsrldq \$6,$T0,$T2 # splat input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1973) vpaddq $T4,$D4,$D4 # d4 += h1*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1974) vpaddq $H2,$D3,$D3 # d3 += h0*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1975) vpmuludq $H3,$T3,$T4 # h3*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1976) vpmuludq $H4,$T3,$H2 # h4*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1977) vpsrldq \$6,$T1,$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1978) vpaddq $T4,$D1,$D1 # d1 += h3*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1979) vpaddq $H2,$D2,$D2 # d2 += h4*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1980) vpunpckhqdq $T1,$T0,$T4 # 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1981)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1982) vpmuludq $H3,$S4,$H3 # h3*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1983) vpmuludq $H4,$S4,$H4 # h4*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1984) vpunpcklqdq $T1,$T0,$T0 # 0:1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1985) vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1986) vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1987) vpunpcklqdq $T3,$T2,$T3 # 2:3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1988) vpmuludq `32*7-0x90`(%rax),$H0,$H4 # h0*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1989) vpmuludq $H1,$S4,$H0 # h1*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1990) vmovdqa 64(%rcx),$MASK # .Lmask26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1991) vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1992) vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1993)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1994) ################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1995) # lazy reduction (interleaved with tail of input splat)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1996)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1997) vpsrlq \$26,$H3,$D3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1998) vpand $MASK,$H3,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1999) vpaddq $D3,$H4,$H4 # h3 -> h4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2000)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2001) vpsrlq \$26,$H0,$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2002) vpand $MASK,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2003) vpaddq $D0,$D1,$H1 # h0 -> h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2004)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2005) vpsrlq \$26,$H4,$D4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2006) vpand $MASK,$H4,$H4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2007)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2008) vpsrlq \$4,$T3,$T2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2009)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2010) vpsrlq \$26,$H1,$D1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2011) vpand $MASK,$H1,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2012) vpaddq $D1,$H2,$H2 # h1 -> h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2013)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2014) vpaddq $D4,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2015) vpsllq \$2,$D4,$D4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2016) vpaddq $D4,$H0,$H0 # h4 -> h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2017)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2018) vpand $MASK,$T2,$T2 # 2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2019) vpsrlq \$26,$T0,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2020)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2021) vpsrlq \$26,$H2,$D2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2022) vpand $MASK,$H2,$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2023) vpaddq $D2,$H3,$H3 # h2 -> h3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2024)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2025) vpaddq $T2,$H2,$H2 # modulo-scheduled
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2026) vpsrlq \$30,$T3,$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2027)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2028) vpsrlq \$26,$H0,$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2029) vpand $MASK,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2030) vpaddq $D0,$H1,$H1 # h0 -> h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2031)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2032) vpsrlq \$40,$T4,$T4 # 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2033)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2034) vpsrlq \$26,$H3,$D3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2035) vpand $MASK,$H3,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2036) vpaddq $D3,$H4,$H4 # h3 -> h4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2037)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2038) vpand $MASK,$T0,$T0 # 0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2039) vpand $MASK,$T1,$T1 # 1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2040) vpand $MASK,$T3,$T3 # 3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2041) vpor 32(%rcx),$T4,$T4 # padbit, yes, always
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2042)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2043) sub \$64,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2044) jnz .Loop_avx2$suffix
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2045)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2046) .byte 0x66,0x90
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2047) .Ltail_avx2$suffix:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2048) ################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2049) # while above multiplications were by r^4 in all lanes, in last
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2050) # iteration we multiply least significant lane by r^4 and most
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2051) # significant one by r, so copy of above except that references
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2052) # to the precomputed table are displaced by 4...
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2053)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2054) #vpaddq $H2,$T2,$H2 # accumulate input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2055) vpaddq $H0,$T0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2056) vmovdqu `32*0+4`(%rsp),$T0 # r0^4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2057) vpaddq $H1,$T1,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2058) vmovdqu `32*1+4`(%rsp),$T1 # r1^4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2059) vpaddq $H3,$T3,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2060) vmovdqu `32*3+4`(%rsp),$T2 # r2^4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2061) vpaddq $H4,$T4,$H4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2062) vmovdqu `32*6+4-0x90`(%rax),$T3 # s3^4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2063) vmovdqu `32*8+4-0x90`(%rax),$S4 # s4^4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2064)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2065) vpmuludq $H2,$T0,$D2 # d2 = h2*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2066) vpmuludq $H2,$T1,$D3 # d3 = h2*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2067) vpmuludq $H2,$T2,$D4 # d4 = h2*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2068) vpmuludq $H2,$T3,$D0 # d0 = h2*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2069) vpmuludq $H2,$S4,$D1 # d1 = h2*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2070)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2071) vpmuludq $H0,$T1,$T4 # h0*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2072) vpmuludq $H1,$T1,$H2 # h1*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2073) vpaddq $T4,$D1,$D1 # d1 += h0*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2074) vpaddq $H2,$D2,$D2 # d2 += h1*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2075) vpmuludq $H3,$T1,$T4 # h3*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2076) vpmuludq `32*2+4`(%rsp),$H4,$H2 # h4*s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2077) vpaddq $T4,$D4,$D4 # d4 += h3*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2078) vpaddq $H2,$D0,$D0 # d0 += h4*s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2079)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2080) vpmuludq $H0,$T0,$T4 # h0*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2081) vpmuludq $H1,$T0,$H2 # h1*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2082) vpaddq $T4,$D0,$D0 # d0 += h0*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2083) vmovdqu `32*4+4-0x90`(%rax),$T1 # s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2084) vpaddq $H2,$D1,$D1 # d1 += h1*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2085) vpmuludq $H3,$T0,$T4 # h3*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2086) vpmuludq $H4,$T0,$H2 # h4*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2087) vpaddq $T4,$D3,$D3 # d3 += h3*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2088) vpaddq $H2,$D4,$D4 # d4 += h4*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2089)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2090) vpmuludq $H3,$T1,$T4 # h3*s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2091) vpmuludq $H4,$T1,$H2 # h4*s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2092) vpaddq $T4,$D0,$D0 # d0 += h3*s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2093) vpaddq $H2,$D1,$D1 # d1 += h4*s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2094) vmovdqu `32*5+4-0x90`(%rax),$H2 # r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2095) vpmuludq $H1,$T2,$T4 # h1*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2096) vpmuludq $H0,$T2,$T2 # h0*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2097) vpaddq $T4,$D3,$D3 # d3 += h1*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2098) vpaddq $T2,$D2,$D2 # d2 += h0*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2099)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2100) vpmuludq $H1,$H2,$T4 # h1*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2101) vpmuludq $H0,$H2,$H2 # h0*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2102) vpaddq $T4,$D4,$D4 # d4 += h1*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2103) vpaddq $H2,$D3,$D3 # d3 += h0*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2104) vpmuludq $H3,$T3,$T4 # h3*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2105) vpmuludq $H4,$T3,$H2 # h4*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2106) vpaddq $T4,$D1,$D1 # d1 += h3*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2107) vpaddq $H2,$D2,$D2 # d2 += h4*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2108)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2109) vpmuludq $H3,$S4,$H3 # h3*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2110) vpmuludq $H4,$S4,$H4 # h4*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2111) vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2112) vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2113) vpmuludq `32*7+4-0x90`(%rax),$H0,$H4 # h0*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2114) vpmuludq $H1,$S4,$H0 # h1*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2115) vmovdqa 64(%rcx),$MASK # .Lmask26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2116) vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2117) vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2118)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2119) ################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2120) # horizontal addition
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2121)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2122) vpsrldq \$8,$D1,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2123) vpsrldq \$8,$H2,$T2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2124) vpsrldq \$8,$H3,$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2125) vpsrldq \$8,$H4,$T4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2126) vpsrldq \$8,$H0,$T0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2127) vpaddq $T1,$D1,$D1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2128) vpaddq $T2,$H2,$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2129) vpaddq $T3,$H3,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2130) vpaddq $T4,$H4,$H4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2131) vpaddq $T0,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2132)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2133) vpermq \$0x2,$H3,$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2134) vpermq \$0x2,$H4,$T4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2135) vpermq \$0x2,$H0,$T0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2136) vpermq \$0x2,$D1,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2137) vpermq \$0x2,$H2,$T2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2138) vpaddq $T3,$H3,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2139) vpaddq $T4,$H4,$H4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2140) vpaddq $T0,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2141) vpaddq $T1,$D1,$D1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2142) vpaddq $T2,$H2,$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2143)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2144) ################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2145) # lazy reduction
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2146)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2147) vpsrlq \$26,$H3,$D3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2148) vpand $MASK,$H3,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2149) vpaddq $D3,$H4,$H4 # h3 -> h4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2150)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2151) vpsrlq \$26,$H0,$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2152) vpand $MASK,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2153) vpaddq $D0,$D1,$H1 # h0 -> h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2154)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2155) vpsrlq \$26,$H4,$D4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2156) vpand $MASK,$H4,$H4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2157)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2158) vpsrlq \$26,$H1,$D1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2159) vpand $MASK,$H1,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2160) vpaddq $D1,$H2,$H2 # h1 -> h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2161)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2162) vpaddq $D4,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2163) vpsllq \$2,$D4,$D4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2164) vpaddq $D4,$H0,$H0 # h4 -> h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2165)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2166) vpsrlq \$26,$H2,$D2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2167) vpand $MASK,$H2,$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2168) vpaddq $D2,$H3,$H3 # h2 -> h3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2169)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2170) vpsrlq \$26,$H0,$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2171) vpand $MASK,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2172) vpaddq $D0,$H1,$H1 # h0 -> h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2173)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2174) vpsrlq \$26,$H3,$D3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2175) vpand $MASK,$H3,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2176) vpaddq $D3,$H4,$H4 # h3 -> h4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2177)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2178) vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2179) vmovd %x#$H1,`4*1-48-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2180) vmovd %x#$H2,`4*2-48-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2181) vmovd %x#$H3,`4*3-48-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2182) vmovd %x#$H4,`4*4-48-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2183) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2184) $code.=<<___ if ($win64);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2185) vmovdqa -0xb0(%r10),%xmm6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2186) vmovdqa -0xa0(%r10),%xmm7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2187) vmovdqa -0x90(%r10),%xmm8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2188) vmovdqa -0x80(%r10),%xmm9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2189) vmovdqa -0x70(%r10),%xmm10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2190) vmovdqa -0x60(%r10),%xmm11
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2191) vmovdqa -0x50(%r10),%xmm12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2192) vmovdqa -0x40(%r10),%xmm13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2193) vmovdqa -0x30(%r10),%xmm14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2194) vmovdqa -0x20(%r10),%xmm15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2195) lea -8(%r10),%rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2196) .Ldo_avx2_epilogue$suffix:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2197) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2198) $code.=<<___ if (!$win64);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2199) lea -8(%r10),%rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2200) .cfi_def_cfa_register %rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2201) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2202) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2203) vzeroupper
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2204) ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2205) .cfi_endproc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2206) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2207) if($avx > 2 && $avx512) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2208) my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2209) my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2210) my $PADBIT="%zmm30";
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2211)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2212) map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3)); # switch to %zmm domain
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2213) map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2214) map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2215) map(s/%y/%z/,($MASK));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2216)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2217) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2218) .cfi_startproc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2219) .Lblocks_avx512:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2220) mov \$15,%eax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2221) kmovw %eax,%k2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2222) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2223) $code.=<<___ if (!$win64);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2224) lea 8(%rsp),%r10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2225) .cfi_def_cfa_register %r10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2226) sub \$0x128,%rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2227) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2228) $code.=<<___ if ($win64);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2229) lea 8(%rsp),%r10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2230) sub \$0x1c8,%rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2231) vmovdqa %xmm6,-0xb0(%r10)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2232) vmovdqa %xmm7,-0xa0(%r10)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2233) vmovdqa %xmm8,-0x90(%r10)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2234) vmovdqa %xmm9,-0x80(%r10)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2235) vmovdqa %xmm10,-0x70(%r10)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2236) vmovdqa %xmm11,-0x60(%r10)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2237) vmovdqa %xmm12,-0x50(%r10)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2238) vmovdqa %xmm13,-0x40(%r10)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2239) vmovdqa %xmm14,-0x30(%r10)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2240) vmovdqa %xmm15,-0x20(%r10)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2241) .Ldo_avx512_body:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2242) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2243) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2244) lea .Lconst(%rip),%rcx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2245) lea 48+64($ctx),$ctx # size optimization
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2246) vmovdqa 96(%rcx),%y#$T2 # .Lpermd_avx2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2247)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2248) # expand pre-calculated table
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2249) vmovdqu `16*0-64`($ctx),%x#$D0 # will become expanded ${R0}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2250) and \$-512,%rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2251) vmovdqu `16*1-64`($ctx),%x#$D1 # will become ... ${R1}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2252) mov \$0x20,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2253) vmovdqu `16*2-64`($ctx),%x#$T0 # ... ${S1}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2254) vmovdqu `16*3-64`($ctx),%x#$D2 # ... ${R2}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2255) vmovdqu `16*4-64`($ctx),%x#$T1 # ... ${S2}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2256) vmovdqu `16*5-64`($ctx),%x#$D3 # ... ${R3}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2257) vmovdqu `16*6-64`($ctx),%x#$T3 # ... ${S3}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2258) vmovdqu `16*7-64`($ctx),%x#$D4 # ... ${R4}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2259) vmovdqu `16*8-64`($ctx),%x#$T4 # ... ${S4}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2260) vpermd $D0,$T2,$R0 # 00003412 -> 14243444
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2261) vpbroadcastq 64(%rcx),$MASK # .Lmask26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2262) vpermd $D1,$T2,$R1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2263) vpermd $T0,$T2,$S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2264) vpermd $D2,$T2,$R2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2265) vmovdqa64 $R0,0x00(%rsp){%k2} # save in case $len%128 != 0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2266) vpsrlq \$32,$R0,$T0 # 14243444 -> 01020304
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2267) vpermd $T1,$T2,$S2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2268) vmovdqu64 $R1,0x00(%rsp,%rax){%k2}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2269) vpsrlq \$32,$R1,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2270) vpermd $D3,$T2,$R3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2271) vmovdqa64 $S1,0x40(%rsp){%k2}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2272) vpermd $T3,$T2,$S3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2273) vpermd $D4,$T2,$R4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2274) vmovdqu64 $R2,0x40(%rsp,%rax){%k2}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2275) vpermd $T4,$T2,$S4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2276) vmovdqa64 $S2,0x80(%rsp){%k2}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2277) vmovdqu64 $R3,0x80(%rsp,%rax){%k2}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2278) vmovdqa64 $S3,0xc0(%rsp){%k2}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2279) vmovdqu64 $R4,0xc0(%rsp,%rax){%k2}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2280) vmovdqa64 $S4,0x100(%rsp){%k2}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2281)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2282) ################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2283) # calculate 5th through 8th powers of the key
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2284) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2285) # d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2286) # d1 = r0'*r1 + r1'*r0 + r2'*5*r4 + r3'*5*r3 + r4'*5*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2287) # d2 = r0'*r2 + r1'*r1 + r2'*r0 + r3'*5*r4 + r4'*5*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2288) # d3 = r0'*r3 + r1'*r2 + r2'*r1 + r3'*r0 + r4'*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2289) # d4 = r0'*r4 + r1'*r3 + r2'*r2 + r3'*r1 + r4'*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2290)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2291) vpmuludq $T0,$R0,$D0 # d0 = r0'*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2292) vpmuludq $T0,$R1,$D1 # d1 = r0'*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2293) vpmuludq $T0,$R2,$D2 # d2 = r0'*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2294) vpmuludq $T0,$R3,$D3 # d3 = r0'*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2295) vpmuludq $T0,$R4,$D4 # d4 = r0'*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2296) vpsrlq \$32,$R2,$T2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2297)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2298) vpmuludq $T1,$S4,$M0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2299) vpmuludq $T1,$R0,$M1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2300) vpmuludq $T1,$R1,$M2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2301) vpmuludq $T1,$R2,$M3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2302) vpmuludq $T1,$R3,$M4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2303) vpsrlq \$32,$R3,$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2304) vpaddq $M0,$D0,$D0 # d0 += r1'*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2305) vpaddq $M1,$D1,$D1 # d1 += r1'*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2306) vpaddq $M2,$D2,$D2 # d2 += r1'*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2307) vpaddq $M3,$D3,$D3 # d3 += r1'*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2308) vpaddq $M4,$D4,$D4 # d4 += r1'*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2309)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2310) vpmuludq $T2,$S3,$M0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2311) vpmuludq $T2,$S4,$M1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2312) vpmuludq $T2,$R1,$M3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2313) vpmuludq $T2,$R2,$M4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2314) vpmuludq $T2,$R0,$M2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2315) vpsrlq \$32,$R4,$T4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2316) vpaddq $M0,$D0,$D0 # d0 += r2'*5*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2317) vpaddq $M1,$D1,$D1 # d1 += r2'*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2318) vpaddq $M3,$D3,$D3 # d3 += r2'*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2319) vpaddq $M4,$D4,$D4 # d4 += r2'*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2320) vpaddq $M2,$D2,$D2 # d2 += r2'*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2321)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2322) vpmuludq $T3,$S2,$M0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2323) vpmuludq $T3,$R0,$M3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2324) vpmuludq $T3,$R1,$M4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2325) vpmuludq $T3,$S3,$M1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2326) vpmuludq $T3,$S4,$M2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2327) vpaddq $M0,$D0,$D0 # d0 += r3'*5*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2328) vpaddq $M3,$D3,$D3 # d3 += r3'*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2329) vpaddq $M4,$D4,$D4 # d4 += r3'*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2330) vpaddq $M1,$D1,$D1 # d1 += r3'*5*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2331) vpaddq $M2,$D2,$D2 # d2 += r3'*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2332)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2333) vpmuludq $T4,$S4,$M3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2334) vpmuludq $T4,$R0,$M4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2335) vpmuludq $T4,$S1,$M0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2336) vpmuludq $T4,$S2,$M1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2337) vpmuludq $T4,$S3,$M2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2338) vpaddq $M3,$D3,$D3 # d3 += r4'*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2339) vpaddq $M4,$D4,$D4 # d4 += r4'*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2340) vpaddq $M0,$D0,$D0 # d0 += r4'*5*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2341) vpaddq $M1,$D1,$D1 # d1 += r4'*5*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2342) vpaddq $M2,$D2,$D2 # d2 += r4'*5*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2343)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2344) ################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2345) # load input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2346) vmovdqu64 16*0($inp),%z#$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2347) vmovdqu64 16*4($inp),%z#$T4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2348) lea 16*8($inp),$inp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2349)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2350) ################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2351) # lazy reduction
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2352)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2353) vpsrlq \$26,$D3,$M3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2354) vpandq $MASK,$D3,$D3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2355) vpaddq $M3,$D4,$D4 # d3 -> d4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2356)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2357) vpsrlq \$26,$D0,$M0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2358) vpandq $MASK,$D0,$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2359) vpaddq $M0,$D1,$D1 # d0 -> d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2360)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2361) vpsrlq \$26,$D4,$M4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2362) vpandq $MASK,$D4,$D4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2363)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2364) vpsrlq \$26,$D1,$M1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2365) vpandq $MASK,$D1,$D1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2366) vpaddq $M1,$D2,$D2 # d1 -> d2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2367)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2368) vpaddq $M4,$D0,$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2369) vpsllq \$2,$M4,$M4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2370) vpaddq $M4,$D0,$D0 # d4 -> d0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2371)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2372) vpsrlq \$26,$D2,$M2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2373) vpandq $MASK,$D2,$D2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2374) vpaddq $M2,$D3,$D3 # d2 -> d3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2375)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2376) vpsrlq \$26,$D0,$M0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2377) vpandq $MASK,$D0,$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2378) vpaddq $M0,$D1,$D1 # d0 -> d1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2379)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2380) vpsrlq \$26,$D3,$M3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2381) vpandq $MASK,$D3,$D3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2382) vpaddq $M3,$D4,$D4 # d3 -> d4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2383)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2384) ################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2385) # at this point we have 14243444 in $R0-$S4 and 05060708 in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2386) # $D0-$D4, ...
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2387)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2388) vpunpcklqdq $T4,$T3,$T0 # transpose input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2389) vpunpckhqdq $T4,$T3,$T4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2390)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2391) # ... since input 64-bit lanes are ordered as 73625140, we could
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2392) # "vperm" it to 76543210 (here and in each loop iteration), *or*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2393) # we could just flow along, hence the goal for $R0-$S4 is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2394) # 1858286838784888 ...
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2395)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2396) vmovdqa32 128(%rcx),$M0 # .Lpermd_avx512:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2397) mov \$0x7777,%eax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2398) kmovw %eax,%k1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2399)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2400) vpermd $R0,$M0,$R0 # 14243444 -> 1---2---3---4---
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2401) vpermd $R1,$M0,$R1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2402) vpermd $R2,$M0,$R2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2403) vpermd $R3,$M0,$R3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2404) vpermd $R4,$M0,$R4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2405)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2406) vpermd $D0,$M0,${R0}{%k1} # 05060708 -> 1858286838784888
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2407) vpermd $D1,$M0,${R1}{%k1}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2408) vpermd $D2,$M0,${R2}{%k1}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2409) vpermd $D3,$M0,${R3}{%k1}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2410) vpermd $D4,$M0,${R4}{%k1}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2411)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2412) vpslld \$2,$R1,$S1 # *5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2413) vpslld \$2,$R2,$S2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2414) vpslld \$2,$R3,$S3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2415) vpslld \$2,$R4,$S4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2416) vpaddd $R1,$S1,$S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2417) vpaddd $R2,$S2,$S2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2418) vpaddd $R3,$S3,$S3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2419) vpaddd $R4,$S4,$S4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2420)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2421) vpbroadcastq 32(%rcx),$PADBIT # .L129
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2422)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2423) vpsrlq \$52,$T0,$T2 # splat input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2424) vpsllq \$12,$T4,$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2425) vporq $T3,$T2,$T2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2426) vpsrlq \$26,$T0,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2427) vpsrlq \$14,$T4,$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2428) vpsrlq \$40,$T4,$T4 # 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2429) vpandq $MASK,$T2,$T2 # 2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2430) vpandq $MASK,$T0,$T0 # 0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2431) #vpandq $MASK,$T1,$T1 # 1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2432) #vpandq $MASK,$T3,$T3 # 3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2433) #vporq $PADBIT,$T4,$T4 # padbit, yes, always
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2434)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2435) vpaddq $H2,$T2,$H2 # accumulate input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2436) sub \$192,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2437) jbe .Ltail_avx512
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2438) jmp .Loop_avx512
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2439)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2440) .align 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2441) .Loop_avx512:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2442) ################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2443) # ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2444) # ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2445) # ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2446) # ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2447) # ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2448) # ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2449) # ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2450) # ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2451) # \________/\___________/
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2452) ################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2453) #vpaddq $H2,$T2,$H2 # accumulate input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2454)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2455) # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2456) # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2457) # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2458) # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2459) # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2460) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2461) # however, as h2 is "chronologically" first one available pull
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2462) # corresponding operations up, so it's
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2463) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2464) # d3 = h2*r1 + h0*r3 + h1*r2 + h3*r0 + h4*5*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2465) # d4 = h2*r2 + h0*r4 + h1*r3 + h3*r1 + h4*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2466) # d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2467) # d1 = h2*5*r4 + h0*r1 + h1*r0 + h3*5*r3 + h4*5*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2468) # d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2469)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2470) vpmuludq $H2,$R1,$D3 # d3 = h2*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2471) vpaddq $H0,$T0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2472) vpmuludq $H2,$R2,$D4 # d4 = h2*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2473) vpandq $MASK,$T1,$T1 # 1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2474) vpmuludq $H2,$S3,$D0 # d0 = h2*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2475) vpandq $MASK,$T3,$T3 # 3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2476) vpmuludq $H2,$S4,$D1 # d1 = h2*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2477) vporq $PADBIT,$T4,$T4 # padbit, yes, always
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2478) vpmuludq $H2,$R0,$D2 # d2 = h2*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2479) vpaddq $H1,$T1,$H1 # accumulate input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2480) vpaddq $H3,$T3,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2481) vpaddq $H4,$T4,$H4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2482)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2483) vmovdqu64 16*0($inp),$T3 # load input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2484) vmovdqu64 16*4($inp),$T4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2485) lea 16*8($inp),$inp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2486) vpmuludq $H0,$R3,$M3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2487) vpmuludq $H0,$R4,$M4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2488) vpmuludq $H0,$R0,$M0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2489) vpmuludq $H0,$R1,$M1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2490) vpaddq $M3,$D3,$D3 # d3 += h0*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2491) vpaddq $M4,$D4,$D4 # d4 += h0*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2492) vpaddq $M0,$D0,$D0 # d0 += h0*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2493) vpaddq $M1,$D1,$D1 # d1 += h0*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2494)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2495) vpmuludq $H1,$R2,$M3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2496) vpmuludq $H1,$R3,$M4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2497) vpmuludq $H1,$S4,$M0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2498) vpmuludq $H0,$R2,$M2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2499) vpaddq $M3,$D3,$D3 # d3 += h1*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2500) vpaddq $M4,$D4,$D4 # d4 += h1*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2501) vpaddq $M0,$D0,$D0 # d0 += h1*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2502) vpaddq $M2,$D2,$D2 # d2 += h0*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2503)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2504) vpunpcklqdq $T4,$T3,$T0 # transpose input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2505) vpunpckhqdq $T4,$T3,$T4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2506)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2507) vpmuludq $H3,$R0,$M3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2508) vpmuludq $H3,$R1,$M4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2509) vpmuludq $H1,$R0,$M1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2510) vpmuludq $H1,$R1,$M2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2511) vpaddq $M3,$D3,$D3 # d3 += h3*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2512) vpaddq $M4,$D4,$D4 # d4 += h3*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2513) vpaddq $M1,$D1,$D1 # d1 += h1*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2514) vpaddq $M2,$D2,$D2 # d2 += h1*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2515)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2516) vpmuludq $H4,$S4,$M3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2517) vpmuludq $H4,$R0,$M4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2518) vpmuludq $H3,$S2,$M0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2519) vpmuludq $H3,$S3,$M1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2520) vpaddq $M3,$D3,$D3 # d3 += h4*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2521) vpmuludq $H3,$S4,$M2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2522) vpaddq $M4,$D4,$D4 # d4 += h4*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2523) vpaddq $M0,$D0,$D0 # d0 += h3*s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2524) vpaddq $M1,$D1,$D1 # d1 += h3*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2525) vpaddq $M2,$D2,$D2 # d2 += h3*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2526)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2527) vpmuludq $H4,$S1,$M0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2528) vpmuludq $H4,$S2,$M1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2529) vpmuludq $H4,$S3,$M2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2530) vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2531) vpaddq $M1,$D1,$H1 # h1 = d1 + h4*s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2532) vpaddq $M2,$D2,$H2 # h2 = d2 + h4*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2533)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2534) ################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2535) # lazy reduction (interleaved with input splat)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2536)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2537) vpsrlq \$52,$T0,$T2 # splat input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2538) vpsllq \$12,$T4,$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2539)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2540) vpsrlq \$26,$D3,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2541) vpandq $MASK,$D3,$D3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2542) vpaddq $H3,$D4,$H4 # h3 -> h4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2543)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2544) vporq $T3,$T2,$T2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2545)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2546) vpsrlq \$26,$H0,$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2547) vpandq $MASK,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2548) vpaddq $D0,$H1,$H1 # h0 -> h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2549)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2550) vpandq $MASK,$T2,$T2 # 2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2551)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2552) vpsrlq \$26,$H4,$D4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2553) vpandq $MASK,$H4,$H4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2554)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2555) vpsrlq \$26,$H1,$D1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2556) vpandq $MASK,$H1,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2557) vpaddq $D1,$H2,$H2 # h1 -> h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2558)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2559) vpaddq $D4,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2560) vpsllq \$2,$D4,$D4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2561) vpaddq $D4,$H0,$H0 # h4 -> h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2562)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2563) vpaddq $T2,$H2,$H2 # modulo-scheduled
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2564) vpsrlq \$26,$T0,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2565)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2566) vpsrlq \$26,$H2,$D2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2567) vpandq $MASK,$H2,$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2568) vpaddq $D2,$D3,$H3 # h2 -> h3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2569)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2570) vpsrlq \$14,$T4,$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2571)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2572) vpsrlq \$26,$H0,$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2573) vpandq $MASK,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2574) vpaddq $D0,$H1,$H1 # h0 -> h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2575)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2576) vpsrlq \$40,$T4,$T4 # 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2577)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2578) vpsrlq \$26,$H3,$D3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2579) vpandq $MASK,$H3,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2580) vpaddq $D3,$H4,$H4 # h3 -> h4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2581)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2582) vpandq $MASK,$T0,$T0 # 0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2583) #vpandq $MASK,$T1,$T1 # 1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2584) #vpandq $MASK,$T3,$T3 # 3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2585) #vporq $PADBIT,$T4,$T4 # padbit, yes, always
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2586)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2587) sub \$128,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2588) ja .Loop_avx512
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2589)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2590) .Ltail_avx512:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2591) ################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2592) # while above multiplications were by r^8 in all lanes, in last
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2593) # iteration we multiply least significant lane by r^8 and most
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2594) # significant one by r, that's why table gets shifted...
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2595)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2596) vpsrlq \$32,$R0,$R0 # 0105020603070408
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2597) vpsrlq \$32,$R1,$R1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2598) vpsrlq \$32,$R2,$R2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2599) vpsrlq \$32,$S3,$S3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2600) vpsrlq \$32,$S4,$S4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2601) vpsrlq \$32,$R3,$R3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2602) vpsrlq \$32,$R4,$R4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2603) vpsrlq \$32,$S1,$S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2604) vpsrlq \$32,$S2,$S2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2605)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2606) ################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2607) # load either next or last 64 bytes of input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2608) lea ($inp,$len),$inp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2609)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2610) #vpaddq $H2,$T2,$H2 # accumulate input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2611) vpaddq $H0,$T0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2612)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2613) vpmuludq $H2,$R1,$D3 # d3 = h2*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2614) vpmuludq $H2,$R2,$D4 # d4 = h2*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2615) vpmuludq $H2,$S3,$D0 # d0 = h2*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2616) vpandq $MASK,$T1,$T1 # 1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2617) vpmuludq $H2,$S4,$D1 # d1 = h2*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2618) vpandq $MASK,$T3,$T3 # 3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2619) vpmuludq $H2,$R0,$D2 # d2 = h2*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2620) vporq $PADBIT,$T4,$T4 # padbit, yes, always
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2621) vpaddq $H1,$T1,$H1 # accumulate input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2622) vpaddq $H3,$T3,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2623) vpaddq $H4,$T4,$H4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2624)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2625) vmovdqu 16*0($inp),%x#$T0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2626) vpmuludq $H0,$R3,$M3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2627) vpmuludq $H0,$R4,$M4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2628) vpmuludq $H0,$R0,$M0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2629) vpmuludq $H0,$R1,$M1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2630) vpaddq $M3,$D3,$D3 # d3 += h0*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2631) vpaddq $M4,$D4,$D4 # d4 += h0*r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2632) vpaddq $M0,$D0,$D0 # d0 += h0*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2633) vpaddq $M1,$D1,$D1 # d1 += h0*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2634)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2635) vmovdqu 16*1($inp),%x#$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2636) vpmuludq $H1,$R2,$M3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2637) vpmuludq $H1,$R3,$M4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2638) vpmuludq $H1,$S4,$M0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2639) vpmuludq $H0,$R2,$M2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2640) vpaddq $M3,$D3,$D3 # d3 += h1*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2641) vpaddq $M4,$D4,$D4 # d4 += h1*r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2642) vpaddq $M0,$D0,$D0 # d0 += h1*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2643) vpaddq $M2,$D2,$D2 # d2 += h0*r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2644)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2645) vinserti128 \$1,16*2($inp),%y#$T0,%y#$T0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2646) vpmuludq $H3,$R0,$M3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2647) vpmuludq $H3,$R1,$M4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2648) vpmuludq $H1,$R0,$M1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2649) vpmuludq $H1,$R1,$M2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2650) vpaddq $M3,$D3,$D3 # d3 += h3*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2651) vpaddq $M4,$D4,$D4 # d4 += h3*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2652) vpaddq $M1,$D1,$D1 # d1 += h1*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2653) vpaddq $M2,$D2,$D2 # d2 += h1*r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2654)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2655) vinserti128 \$1,16*3($inp),%y#$T1,%y#$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2656) vpmuludq $H4,$S4,$M3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2657) vpmuludq $H4,$R0,$M4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2658) vpmuludq $H3,$S2,$M0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2659) vpmuludq $H3,$S3,$M1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2660) vpmuludq $H3,$S4,$M2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2661) vpaddq $M3,$D3,$H3 # h3 = d3 + h4*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2662) vpaddq $M4,$D4,$D4 # d4 += h4*r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2663) vpaddq $M0,$D0,$D0 # d0 += h3*s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2664) vpaddq $M1,$D1,$D1 # d1 += h3*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2665) vpaddq $M2,$D2,$D2 # d2 += h3*s4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2666)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2667) vpmuludq $H4,$S1,$M0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2668) vpmuludq $H4,$S2,$M1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2669) vpmuludq $H4,$S3,$M2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2670) vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2671) vpaddq $M1,$D1,$H1 # h1 = d1 + h4*s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2672) vpaddq $M2,$D2,$H2 # h2 = d2 + h4*s3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2673)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2674) ################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2675) # horizontal addition
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2676)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2677) mov \$1,%eax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2678) vpermq \$0xb1,$H3,$D3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2679) vpermq \$0xb1,$D4,$H4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2680) vpermq \$0xb1,$H0,$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2681) vpermq \$0xb1,$H1,$D1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2682) vpermq \$0xb1,$H2,$D2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2683) vpaddq $D3,$H3,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2684) vpaddq $D4,$H4,$H4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2685) vpaddq $D0,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2686) vpaddq $D1,$H1,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2687) vpaddq $D2,$H2,$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2688)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2689) kmovw %eax,%k3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2690) vpermq \$0x2,$H3,$D3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2691) vpermq \$0x2,$H4,$D4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2692) vpermq \$0x2,$H0,$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2693) vpermq \$0x2,$H1,$D1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2694) vpermq \$0x2,$H2,$D2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2695) vpaddq $D3,$H3,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2696) vpaddq $D4,$H4,$H4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2697) vpaddq $D0,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2698) vpaddq $D1,$H1,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2699) vpaddq $D2,$H2,$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2700)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2701) vextracti64x4 \$0x1,$H3,%y#$D3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2702) vextracti64x4 \$0x1,$H4,%y#$D4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2703) vextracti64x4 \$0x1,$H0,%y#$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2704) vextracti64x4 \$0x1,$H1,%y#$D1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2705) vextracti64x4 \$0x1,$H2,%y#$D2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2706) vpaddq $D3,$H3,${H3}{%k3}{z} # keep single qword in case
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2707) vpaddq $D4,$H4,${H4}{%k3}{z} # it's passed to .Ltail_avx2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2708) vpaddq $D0,$H0,${H0}{%k3}{z}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2709) vpaddq $D1,$H1,${H1}{%k3}{z}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2710) vpaddq $D2,$H2,${H2}{%k3}{z}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2711) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2712) map(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2713) map(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2714) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2715) ################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2716) # lazy reduction (interleaved with input splat)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2717)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2718) vpsrlq \$26,$H3,$D3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2719) vpand $MASK,$H3,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2720) vpsrldq \$6,$T0,$T2 # splat input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2721) vpsrldq \$6,$T1,$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2722) vpunpckhqdq $T1,$T0,$T4 # 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2723) vpaddq $D3,$H4,$H4 # h3 -> h4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2724)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2725) vpsrlq \$26,$H0,$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2726) vpand $MASK,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2727) vpunpcklqdq $T3,$T2,$T2 # 2:3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2728) vpunpcklqdq $T1,$T0,$T0 # 0:1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2729) vpaddq $D0,$H1,$H1 # h0 -> h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2730)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2731) vpsrlq \$26,$H4,$D4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2732) vpand $MASK,$H4,$H4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2733)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2734) vpsrlq \$26,$H1,$D1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2735) vpand $MASK,$H1,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2736) vpsrlq \$30,$T2,$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2737) vpsrlq \$4,$T2,$T2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2738) vpaddq $D1,$H2,$H2 # h1 -> h2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2739)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2740) vpaddq $D4,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2741) vpsllq \$2,$D4,$D4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2742) vpsrlq \$26,$T0,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2743) vpsrlq \$40,$T4,$T4 # 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2744) vpaddq $D4,$H0,$H0 # h4 -> h0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2745)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2746) vpsrlq \$26,$H2,$D2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2747) vpand $MASK,$H2,$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2748) vpand $MASK,$T2,$T2 # 2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2749) vpand $MASK,$T0,$T0 # 0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2750) vpaddq $D2,$H3,$H3 # h2 -> h3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2751)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2752) vpsrlq \$26,$H0,$D0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2753) vpand $MASK,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2754) vpaddq $H2,$T2,$H2 # accumulate input for .Ltail_avx2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2755) vpand $MASK,$T1,$T1 # 1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2756) vpaddq $D0,$H1,$H1 # h0 -> h1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2757)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2758) vpsrlq \$26,$H3,$D3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2759) vpand $MASK,$H3,$H3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2760) vpand $MASK,$T3,$T3 # 3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2761) vpor 32(%rcx),$T4,$T4 # padbit, yes, always
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2762) vpaddq $D3,$H4,$H4 # h3 -> h4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2763)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2764) lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2765) add \$64,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2766) jnz .Ltail_avx2$suffix
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2767)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2768) vpsubq $T2,$H2,$H2 # undo input accumulation
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2769) vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2770) vmovd %x#$H1,`4*1-48-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2771) vmovd %x#$H2,`4*2-48-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2772) vmovd %x#$H3,`4*3-48-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2773) vmovd %x#$H4,`4*4-48-64`($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2774) vzeroall
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2775) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2776) $code.=<<___ if ($win64);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2777) movdqa -0xb0(%r10),%xmm6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2778) movdqa -0xa0(%r10),%xmm7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2779) movdqa -0x90(%r10),%xmm8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2780) movdqa -0x80(%r10),%xmm9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2781) movdqa -0x70(%r10),%xmm10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2782) movdqa -0x60(%r10),%xmm11
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2783) movdqa -0x50(%r10),%xmm12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2784) movdqa -0x40(%r10),%xmm13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2785) movdqa -0x30(%r10),%xmm14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2786) movdqa -0x20(%r10),%xmm15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2787) lea -8(%r10),%rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2788) .Ldo_avx512_epilogue:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2789) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2790) $code.=<<___ if (!$win64);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2791) lea -8(%r10),%rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2792) .cfi_def_cfa_register %rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2793) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2794) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2795) ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2796) .cfi_endproc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2797) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2798)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2799) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2800)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2801) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2802)
# Emit the poly1305_blocks_avx2 entry point: poly1305_blocks_avxN(0) is
# called between the declare_function and end_function helpers for that name.
# NOTE(review): declare_function, end_function and the head of
# poly1305_blocks_avxN are defined outside this excerpt.  The arguments
# 32 and 4 are presumably the alignment and the argument count, and the 0
# presumably selects the non-AVX-512 flavour -- confirm against the
# definitions.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2803) &declare_function("poly1305_blocks_avx2", 32, 4);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2804) poly1305_blocks_avxN(0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2805) &end_function("poly1305_blocks_avx2");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2806)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2807) #######################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2808) if ($avx>2) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2809) # On entry the input length is divisible by 64. But since the inner loop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2810) # processes 128 bytes per iteration, cases where the length is not divisible
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2811) # by 128 are handled by passing the trailing 64 bytes to .Ltail_avx2. For
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2812) # this reason the stack layout is kept identical to poly1305_blocks_avx2. If
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2813) # not for this tail, we wouldn't even have to allocate a stack frame...
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2814)
# Emit the poly1305_blocks_avx512 entry point via poly1305_blocks_avxN(1).
# When $kernel is true, the lines "#ifdef CONFIG_AS_AVX512" and "#endif" are
# appended to $code around the function, so the AVX-512 code is only kept
# when that macro is defined (this assumes the generated file is run through
# the C preprocessor -- confirm against the build rules).
# NOTE(review): the meaning of the arguments 32, 4 and 1 is defined by
# helpers outside this excerpt.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2815) if($kernel) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2816) $code .= "#ifdef CONFIG_AS_AVX512\n";
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2817) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2818)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2819) &declare_function("poly1305_blocks_avx512", 32, 4);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2820) poly1305_blocks_avxN(1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2821) &end_function("poly1305_blocks_avx512");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2822)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2823) if ($kernel) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2824) $code .= "#endif\n";
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2825) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2826)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2827) if (!$kernel && $avx>3) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2828) ########################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2829) # VPMADD52 version using 2^44 radix.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2830) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2831) # One can argue that base 2^52 would be more natural. Well, even though
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2832) # some operations would be more natural, one has to recognize a couple of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2833) # things. First, base 2^52 provides no advantage over base 2^44 in terms
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2834) # of the number of multiply-and-accumulate operations. Second, it makes it
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2835) # impossible to pre-compute multiples of 5 [referred to as s[]/sN in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2836) # reference implementations], which means that more such operations
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2837) # would have to be performed in the inner loop, which in turn makes the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2838) # critical path longer. In other words, even though base 2^44 reduction
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2839) # might look less elegant, the overall critical path is actually shorter...
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2840)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2841) ########################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2842) # The layout of the opaque area is as follows.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2843) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2844) # unsigned __int64 h[3]; # current hash value base 2^44
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2845) # unsigned __int64 s[2]; # key value*20 base 2^44
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2846) # unsigned __int64 r[3]; # key value base 2^44
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2847) # struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2848) # # r^n positions reflect
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2849) # # placement in register, not
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2850) # # memory, R[3] is R[1]*20
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2851)
# Emit poly1305_init_base2_44, which prepares the opaque context for the
# base 2^44 (VPMADD52) code path.  As written in the heredocs below, the
# generated routine:
#   - zeroes the three hash limbs at 0/8/16($ctx);
#   - reads 16 key bytes from $inp and clamps them with the masks
#     0x0ffffffc0fffffff (low qword) and 0x0ffffffc0ffffffc (high qword);
#   - splits the clamped key into r0 (bits 0..43), r1 (bits 44..87) and
#     r2 (bits 88..127) and stores them at 40/48/56($ctx);
#   - stores s1 = 20*r1 and s2 = 20*r2 (times 5, then shifted left by 2)
#     at 24/32($ctx);
#   - writes -1 to 64($ctx); the block routines test the sign of that
#     qword to tell whether the key powers have been computed yet;
#   - stores the addresses of poly1305_blocks_vpmadd52 and
#     poly1305_emit_base2_44 through %rdx and returns 1 in %eax.
# The .Linit_base2_44 label sits after the hash zeroing, so a jump to it
# skips that step.
# NOTE(review): callers of that label, and what %rdx points to (presumably
# the caller's function-pointer table), are not visible in this excerpt.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2852) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2853) .type poly1305_init_base2_44,\@function,3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2854) .align 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2855) poly1305_init_base2_44:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2856) xor %eax,%eax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2857) mov %rax,0($ctx) # initialize hash value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2858) mov %rax,8($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2859) mov %rax,16($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2860)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2861) .Linit_base2_44:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2862) lea poly1305_blocks_vpmadd52(%rip),%r10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2863) lea poly1305_emit_base2_44(%rip),%r11
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2864)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2865) mov \$0x0ffffffc0fffffff,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2866) mov \$0x0ffffffc0ffffffc,%rcx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2867) and 0($inp),%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2868) mov \$0x00000fffffffffff,%r8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2869) and 8($inp),%rcx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2870) mov \$0x00000fffffffffff,%r9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2871) and %rax,%r8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2872) shrd \$44,%rcx,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2873) mov %r8,40($ctx) # r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2874) and %r9,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2875) shr \$24,%rcx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2876) mov %rax,48($ctx) # r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2877) lea (%rax,%rax,4),%rax # *5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2878) mov %rcx,56($ctx) # r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2879) shl \$2,%rax # magic <<2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2880) lea (%rcx,%rcx,4),%rcx # *5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2881) shl \$2,%rcx # magic <<2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2882) mov %rax,24($ctx) # s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2883) mov %rcx,32($ctx) # s2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2884) movq \$-1,64($ctx) # write impossible value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2885) ___
# Flavours other than elf32: store the two routine addresses as 64-bit
# values at 0(%rdx) and 8(%rdx).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2886) $code.=<<___ if ($flavour !~ /elf32/);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2887) mov %r10,0(%rdx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2888) mov %r11,8(%rdx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2889) ___
# elf32 flavour: store only the low 32 bits of each address, at 0(%rdx)
# and 4(%rdx).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2890) $code.=<<___ if ($flavour =~ /elf32/);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2891) mov %r10d,0(%rdx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2892) mov %r11d,4(%rdx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2893) ___
# Common tail: return 1 in %eax and close the symbol with .size.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2894) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2895) mov \$1,%eax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2896) ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2897) .size poly1305_init_base2_44,.-poly1305_init_base2_44
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2898) ___
# Emit poly1305_blocks_vpmadd52: the one-block-per-iteration base 2^44
# routine built on vpmadd52luq/vpmadd52huq.
#
# As written in the heredoc below, the generated routine:
#   - converts $len from bytes to 16-byte blocks (shr 4) and returns at
#     .Lno_data_vpmadd52 when the result is zero;
#   - shifts $padbit left by 40; vpermq 0xcf later places it in the third
#     qword of $PAD so that it is OR-ed into the top input limb;
#   - picks the number of leading blocks to handle here: (len & 3) when the
#     qword at 64($ctx) is negative (key powers not computed yet) and fewer
#     than 4 blocks are pending, otherwise (len & 1).  If that count is zero
#     it jumps straight to .Lblocks_vpmadd52_4x;
#   - for each such block in .Loop_vpmadd52: loads 16 input bytes, splits
#     them into three limbs (permute, per-lane shift, mask), adds them to
#     the hash, multiplies by the key with six vpmadd52 instructions and
#     performs a partial reduction;
#   - stores the three hash limbs back to 0($ctx) under mask %k7 and, if
#     blocks remain, continues at .Lblocks_vpmadd52_4x.
# Mask registers: %k7 = 0b111 (three low qword lanes), %k1 = 0b1 (lane 0).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2899) {
# Register assignment (all %ymm).  $H0..$H2 receive one hash limb each,
# broadcast across lanes; $r2r1r0/$r1r0s2/$r0s2s1 are the key triplets
# loaded from 40/32/24($ctx); $Dlo/$Dhi accumulate the low/high 52-bit
# halves of the products.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2900) my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17));
# Scratch register, input permutation/shift tables, and the pad-bit vector.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2901) my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21));
# Three reduction constants, hence exactly three registers (%ymm22..%ymm24);
# a wider range would only produce names that are silently discarded.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2902) my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..24));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2903)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2904) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2905) .type poly1305_blocks_vpmadd52,\@function,4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2906) .align 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2907) poly1305_blocks_vpmadd52:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2908) shr \$4,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2909) jz .Lno_data_vpmadd52 # too short
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2910)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2911) shl \$40,$padbit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2912) mov 64($ctx),%r8 # peek on power of the key
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2913)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2914) # if powers of the key are not calculated yet, process up to 3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2915) # blocks with this single-block subroutine, otherwise ensure that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2916) # length is divisible by 2 blocks and pass the rest down to next
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2917) # subroutine...
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2918)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2919) mov \$3,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2920) mov \$1,%r10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2921) cmp \$4,$len # is input long
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2922) cmovae %r10,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2923) test %r8,%r8 # is power value impossible?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2924) cmovns %r10,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2925)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2926) and $len,%rax # is input of favourable length?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2927) jz .Lblocks_vpmadd52_4x
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2928)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2929) sub %rax,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2930) mov \$7,%r10d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2931) mov \$1,%r11d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2932) kmovw %r10d,%k7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2933) lea .L2_44_inp_permd(%rip),%r10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2934) kmovw %r11d,%k1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2935)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2936) vmovq $padbit,%x#$PAD
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2937) vmovdqa64 0(%r10),$inp_permd # .L2_44_inp_permd
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2938) vmovdqa64 32(%r10),$inp_shift # .L2_44_inp_shift
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2939) vpermq \$0xcf,$PAD,$PAD
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2940) vmovdqa64 64(%r10),$reduc_mask # .L2_44_mask
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2941)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2942) vmovdqu64 0($ctx),${Dlo}{%k7}{z} # load hash value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2943) vmovdqu64 40($ctx),${r2r1r0}{%k7}{z} # load keys
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2944) vmovdqu64 32($ctx),${r1r0s2}{%k7}{z}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2945) vmovdqu64 24($ctx),${r0s2s1}{%k7}{z}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2946)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2947) vmovdqa64 96(%r10),$reduc_rght # .L2_44_shift_rgt
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2948) vmovdqa64 128(%r10),$reduc_left # .L2_44_shift_lft
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2949)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2950) jmp .Loop_vpmadd52
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2951)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2952) .align 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2953) .Loop_vpmadd52:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2954) vmovdqu32 0($inp),%x#$T0 # load input as ----3210
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2955) lea 16($inp),$inp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2956)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2957) vpermd $T0,$inp_permd,$T0 # ----3210 -> --322110
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2958) vpsrlvq $inp_shift,$T0,$T0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2959) vpandq $reduc_mask,$T0,$T0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2960) vporq $PAD,$T0,$T0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2961)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2962) vpaddq $T0,$Dlo,$Dlo # accumulate input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2963)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2964) vpermq \$0,$Dlo,${H0}{%k7}{z} # smash hash value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2965) vpermq \$0b01010101,$Dlo,${H1}{%k7}{z}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2966) vpermq \$0b10101010,$Dlo,${H2}{%k7}{z}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2967)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2968) vpxord $Dlo,$Dlo,$Dlo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2969) vpxord $Dhi,$Dhi,$Dhi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2970)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2971) vpmadd52luq $r2r1r0,$H0,$Dlo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2972) vpmadd52huq $r2r1r0,$H0,$Dhi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2973)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2974) vpmadd52luq $r1r0s2,$H1,$Dlo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2975) vpmadd52huq $r1r0s2,$H1,$Dhi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2976)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2977) vpmadd52luq $r0s2s1,$H2,$Dlo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2978) vpmadd52huq $r0s2s1,$H2,$Dhi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2979)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2980) vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost qword
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2981) vpsllvq $reduc_left,$Dhi,$Dhi # 0 in topmost qword
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2982) vpandq $reduc_mask,$Dlo,$Dlo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2983)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2984) vpaddq $T0,$Dhi,$Dhi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2985)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2986) vpermq \$0b10010011,$Dhi,$Dhi # 0 in lowest qword
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2987)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2988) vpaddq $Dhi,$Dlo,$Dlo # note topmost qword :-)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2989)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2990) vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost word
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2991) vpandq $reduc_mask,$Dlo,$Dlo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2992)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2993) vpermq \$0b10010011,$T0,$T0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2994)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2995) vpaddq $T0,$Dlo,$Dlo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2996)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2997) vpermq \$0b10010011,$Dlo,${T0}{%k1}{z}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2998)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2999) vpaddq $T0,$Dlo,$Dlo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3000) vpsllq \$2,$T0,$T0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3001)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3002) vpaddq $T0,$Dlo,$Dlo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3003)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3004) dec %rax # len-=16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3005) jnz .Loop_vpmadd52
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3006)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3007) vmovdqu64 $Dlo,0($ctx){%k7} # store hash value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3008)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3009) test $len,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3010) jnz .Lblocks_vpmadd52_4x
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3011)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3012) .Lno_data_vpmadd52:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3013) ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3014) .size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3015) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3016) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3017) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3018) ########################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3019) # As implied by its name 4x subroutine processes 4 blocks in parallel
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3020) # (but handles even 4*n+2 blocks lengths). It takes up to 4th key power
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3021) # and is handled in 256-bit %ymm registers.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3022)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3023) my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3024) my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3025) my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3026)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3027) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3028) .type poly1305_blocks_vpmadd52_4x,\@function,4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3029) .align 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3030) poly1305_blocks_vpmadd52_4x:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3031) shr \$4,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3032) jz .Lno_data_vpmadd52_4x # too short
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3033)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3034) shl \$40,$padbit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3035) mov 64($ctx),%r8 # peek on power of the key
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3036)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3037) .Lblocks_vpmadd52_4x:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3038) vpbroadcastq $padbit,$PAD
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3039)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3040) vmovdqa64 .Lx_mask44(%rip),$mask44
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3041) mov \$5,%eax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3042) vmovdqa64 .Lx_mask42(%rip),$mask42
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3043) kmovw %eax,%k1 # used in 2x path
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3044)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3045) test %r8,%r8 # is power value impossible?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3046) js .Linit_vpmadd52 # if it is, then init R[4]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3047)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3048) vmovq 0($ctx),%x#$H0 # load current hash value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3049) vmovq 8($ctx),%x#$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3050) vmovq 16($ctx),%x#$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3051)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3052) test \$3,$len # is length 4*n+2?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3053) jnz .Lblocks_vpmadd52_2x_do
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3054)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3055) .Lblocks_vpmadd52_4x_do:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3056) vpbroadcastq 64($ctx),$R0 # load 4th power of the key
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3057) vpbroadcastq 96($ctx),$R1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3058) vpbroadcastq 128($ctx),$R2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3059) vpbroadcastq 160($ctx),$S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3060)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3061) .Lblocks_vpmadd52_4x_key_loaded:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3062) vpsllq \$2,$R2,$S2 # S2 = R2*5*4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3063) vpaddq $R2,$S2,$S2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3064) vpsllq \$2,$S2,$S2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3065)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3066) test \$7,$len # is len 8*n?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3067) jz .Lblocks_vpmadd52_8x
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3068)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3069) vmovdqu64 16*0($inp),$T2 # load data
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3070) vmovdqu64 16*2($inp),$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3071) lea 16*4($inp),$inp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3072)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3073) vpunpcklqdq $T3,$T2,$T1 # transpose data
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3074) vpunpckhqdq $T3,$T2,$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3075)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3076) # at this point 64-bit lanes are ordered as 3-1-2-0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3077)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3078) vpsrlq \$24,$T3,$T2 # splat the data
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3079) vporq $PAD,$T2,$T2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3080) vpaddq $T2,$H2,$H2 # accumulate input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3081) vpandq $mask44,$T1,$T0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3082) vpsrlq \$44,$T1,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3083) vpsllq \$20,$T3,$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3084) vporq $T3,$T1,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3085) vpandq $mask44,$T1,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3086)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3087) sub \$4,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3088) jz .Ltail_vpmadd52_4x
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3089) jmp .Loop_vpmadd52_4x
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3090) ud2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3091)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3092) .align 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3093) .Linit_vpmadd52:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3094) vmovq 24($ctx),%x#$S1 # load key
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3095) vmovq 56($ctx),%x#$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3096) vmovq 32($ctx),%x#$S2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3097) vmovq 40($ctx),%x#$R0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3098) vmovq 48($ctx),%x#$R1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3099)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3100) vmovdqa $R0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3101) vmovdqa $R1,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3102) vmovdqa $H2,$R2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3103)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3104) mov \$2,%eax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3105)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3106) .Lmul_init_vpmadd52:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3107) vpxorq $D0lo,$D0lo,$D0lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3108) vpmadd52luq $H2,$S1,$D0lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3109) vpxorq $D0hi,$D0hi,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3110) vpmadd52huq $H2,$S1,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3111) vpxorq $D1lo,$D1lo,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3112) vpmadd52luq $H2,$S2,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3113) vpxorq $D1hi,$D1hi,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3114) vpmadd52huq $H2,$S2,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3115) vpxorq $D2lo,$D2lo,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3116) vpmadd52luq $H2,$R0,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3117) vpxorq $D2hi,$D2hi,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3118) vpmadd52huq $H2,$R0,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3119)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3120) vpmadd52luq $H0,$R0,$D0lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3121) vpmadd52huq $H0,$R0,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3122) vpmadd52luq $H0,$R1,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3123) vpmadd52huq $H0,$R1,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3124) vpmadd52luq $H0,$R2,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3125) vpmadd52huq $H0,$R2,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3126)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3127) vpmadd52luq $H1,$S2,$D0lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3128) vpmadd52huq $H1,$S2,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3129) vpmadd52luq $H1,$R0,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3130) vpmadd52huq $H1,$R0,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3131) vpmadd52luq $H1,$R1,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3132) vpmadd52huq $H1,$R1,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3133)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3134) ################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3135) # partial reduction
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3136) vpsrlq \$44,$D0lo,$tmp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3137) vpsllq \$8,$D0hi,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3138) vpandq $mask44,$D0lo,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3139) vpaddq $tmp,$D0hi,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3140)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3141) vpaddq $D0hi,$D1lo,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3142)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3143) vpsrlq \$44,$D1lo,$tmp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3144) vpsllq \$8,$D1hi,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3145) vpandq $mask44,$D1lo,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3146) vpaddq $tmp,$D1hi,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3147)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3148) vpaddq $D1hi,$D2lo,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3149)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3150) vpsrlq \$42,$D2lo,$tmp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3151) vpsllq \$10,$D2hi,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3152) vpandq $mask42,$D2lo,$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3153) vpaddq $tmp,$D2hi,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3154)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3155) vpaddq $D2hi,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3156) vpsllq \$2,$D2hi,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3157)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3158) vpaddq $D2hi,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3159)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3160) vpsrlq \$44,$H0,$tmp # additional step
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3161) vpandq $mask44,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3162)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3163) vpaddq $tmp,$H1,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3164)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3165) dec %eax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3166) jz .Ldone_init_vpmadd52
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3167)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3168) vpunpcklqdq $R1,$H1,$R1 # 1,2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3169) vpbroadcastq %x#$H1,%x#$H1 # 2,2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3170) vpunpcklqdq $R2,$H2,$R2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3171) vpbroadcastq %x#$H2,%x#$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3172) vpunpcklqdq $R0,$H0,$R0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3173) vpbroadcastq %x#$H0,%x#$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3174)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3175) vpsllq \$2,$R1,$S1 # S1 = R1*5*4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3176) vpsllq \$2,$R2,$S2 # S2 = R2*5*4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3177) vpaddq $R1,$S1,$S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3178) vpaddq $R2,$S2,$S2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3179) vpsllq \$2,$S1,$S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3180) vpsllq \$2,$S2,$S2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3181)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3182) jmp .Lmul_init_vpmadd52
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3183) ud2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3184)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3185) .align 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3186) .Ldone_init_vpmadd52:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3187) vinserti128 \$1,%x#$R1,$H1,$R1 # 1,2,3,4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3188) vinserti128 \$1,%x#$R2,$H2,$R2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3189) vinserti128 \$1,%x#$R0,$H0,$R0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3190)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3191) vpermq \$0b11011000,$R1,$R1 # 1,3,2,4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3192) vpermq \$0b11011000,$R2,$R2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3193) vpermq \$0b11011000,$R0,$R0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3194)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3195) vpsllq \$2,$R1,$S1 # S1 = R1*5*4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3196) vpaddq $R1,$S1,$S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3197) vpsllq \$2,$S1,$S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3198)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3199) vmovq 0($ctx),%x#$H0 # load current hash value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3200) vmovq 8($ctx),%x#$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3201) vmovq 16($ctx),%x#$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3202)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3203) test \$3,$len # is length 4*n+2?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3204) jnz .Ldone_init_vpmadd52_2x
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3205)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3206) vmovdqu64 $R0,64($ctx) # save key powers
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3207) vpbroadcastq %x#$R0,$R0 # broadcast 4th power
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3208) vmovdqu64 $R1,96($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3209) vpbroadcastq %x#$R1,$R1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3210) vmovdqu64 $R2,128($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3211) vpbroadcastq %x#$R2,$R2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3212) vmovdqu64 $S1,160($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3213) vpbroadcastq %x#$S1,$S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3214)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3215) jmp .Lblocks_vpmadd52_4x_key_loaded
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3216) ud2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3217)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3218) .align 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3219) .Ldone_init_vpmadd52_2x:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3220) vmovdqu64 $R0,64($ctx) # save key powers
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3221) vpsrldq \$8,$R0,$R0 # 0-1-0-2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3222) vmovdqu64 $R1,96($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3223) vpsrldq \$8,$R1,$R1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3224) vmovdqu64 $R2,128($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3225) vpsrldq \$8,$R2,$R2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3226) vmovdqu64 $S1,160($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3227) vpsrldq \$8,$S1,$S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3228) jmp .Lblocks_vpmadd52_2x_key_loaded
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3229) ud2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3230)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3231) .align 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3232) .Lblocks_vpmadd52_2x_do:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3233) vmovdqu64 128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3234) vmovdqu64 160+8($ctx),${S1}{%k1}{z}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3235) vmovdqu64 64+8($ctx),${R0}{%k1}{z}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3236) vmovdqu64 96+8($ctx),${R1}{%k1}{z}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3237)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3238) .Lblocks_vpmadd52_2x_key_loaded:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3239) vmovdqu64 16*0($inp),$T2 # load data
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3240) vpxorq $T3,$T3,$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3241) lea 16*2($inp),$inp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3242)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3243) vpunpcklqdq $T3,$T2,$T1 # transpose data
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3244) vpunpckhqdq $T3,$T2,$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3245)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3246) # at this point 64-bit lanes are ordered as x-1-x-0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3247)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3248) vpsrlq \$24,$T3,$T2 # splat the data
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3249) vporq $PAD,$T2,$T2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3250) vpaddq $T2,$H2,$H2 # accumulate input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3251) vpandq $mask44,$T1,$T0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3252) vpsrlq \$44,$T1,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3253) vpsllq \$20,$T3,$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3254) vporq $T3,$T1,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3255) vpandq $mask44,$T1,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3256)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3257) jmp .Ltail_vpmadd52_2x
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3258) ud2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3259)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3260) .align 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3261) .Loop_vpmadd52_4x:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3262) #vpaddq $T2,$H2,$H2 # accumulate input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3263) vpaddq $T0,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3264) vpaddq $T1,$H1,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3265)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3266) vpxorq $D0lo,$D0lo,$D0lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3267) vpmadd52luq $H2,$S1,$D0lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3268) vpxorq $D0hi,$D0hi,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3269) vpmadd52huq $H2,$S1,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3270) vpxorq $D1lo,$D1lo,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3271) vpmadd52luq $H2,$S2,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3272) vpxorq $D1hi,$D1hi,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3273) vpmadd52huq $H2,$S2,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3274) vpxorq $D2lo,$D2lo,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3275) vpmadd52luq $H2,$R0,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3276) vpxorq $D2hi,$D2hi,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3277) vpmadd52huq $H2,$R0,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3278)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3279) vmovdqu64 16*0($inp),$T2 # load data
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3280) vmovdqu64 16*2($inp),$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3281) lea 16*4($inp),$inp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3282) vpmadd52luq $H0,$R0,$D0lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3283) vpmadd52huq $H0,$R0,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3284) vpmadd52luq $H0,$R1,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3285) vpmadd52huq $H0,$R1,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3286) vpmadd52luq $H0,$R2,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3287) vpmadd52huq $H0,$R2,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3288)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3289) vpunpcklqdq $T3,$T2,$T1 # transpose data
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3290) vpunpckhqdq $T3,$T2,$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3291) vpmadd52luq $H1,$S2,$D0lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3292) vpmadd52huq $H1,$S2,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3293) vpmadd52luq $H1,$R0,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3294) vpmadd52huq $H1,$R0,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3295) vpmadd52luq $H1,$R1,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3296) vpmadd52huq $H1,$R1,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3297)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3298) ################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3299) # partial reduction (interleaved with data splat)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3300) vpsrlq \$44,$D0lo,$tmp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3301) vpsllq \$8,$D0hi,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3302) vpandq $mask44,$D0lo,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3303) vpaddq $tmp,$D0hi,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3304)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3305) vpsrlq \$24,$T3,$T2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3306) vporq $PAD,$T2,$T2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3307) vpaddq $D0hi,$D1lo,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3308)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3309) vpsrlq \$44,$D1lo,$tmp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3310) vpsllq \$8,$D1hi,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3311) vpandq $mask44,$D1lo,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3312) vpaddq $tmp,$D1hi,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3313)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3314) vpandq $mask44,$T1,$T0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3315) vpsrlq \$44,$T1,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3316) vpsllq \$20,$T3,$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3317) vpaddq $D1hi,$D2lo,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3318)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3319) vpsrlq \$42,$D2lo,$tmp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3320) vpsllq \$10,$D2hi,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3321) vpandq $mask42,$D2lo,$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3322) vpaddq $tmp,$D2hi,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3323)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3324) vpaddq $T2,$H2,$H2 # accumulate input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3325) vpaddq $D2hi,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3326) vpsllq \$2,$D2hi,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3327)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3328) vpaddq $D2hi,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3329) vporq $T3,$T1,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3330) vpandq $mask44,$T1,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3331)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3332) vpsrlq \$44,$H0,$tmp # additional step
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3333) vpandq $mask44,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3334)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3335) vpaddq $tmp,$H1,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3336)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3337) sub \$4,$len # len-=64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3338) jnz .Loop_vpmadd52_4x
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3339)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3340) .Ltail_vpmadd52_4x:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3341) vmovdqu64 128($ctx),$R2 # load all key powers
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3342) vmovdqu64 160($ctx),$S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3343) vmovdqu64 64($ctx),$R0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3344) vmovdqu64 96($ctx),$R1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3345)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3346) .Ltail_vpmadd52_2x:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3347) vpsllq \$2,$R2,$S2 # S2 = R2*5*4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3348) vpaddq $R2,$S2,$S2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3349) vpsllq \$2,$S2,$S2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3350)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3351) #vpaddq $T2,$H2,$H2 # accumulate input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3352) vpaddq $T0,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3353) vpaddq $T1,$H1,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3354)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3355) vpxorq $D0lo,$D0lo,$D0lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3356) vpmadd52luq $H2,$S1,$D0lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3357) vpxorq $D0hi,$D0hi,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3358) vpmadd52huq $H2,$S1,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3359) vpxorq $D1lo,$D1lo,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3360) vpmadd52luq $H2,$S2,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3361) vpxorq $D1hi,$D1hi,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3362) vpmadd52huq $H2,$S2,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3363) vpxorq $D2lo,$D2lo,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3364) vpmadd52luq $H2,$R0,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3365) vpxorq $D2hi,$D2hi,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3366) vpmadd52huq $H2,$R0,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3367)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3368) vpmadd52luq $H0,$R0,$D0lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3369) vpmadd52huq $H0,$R0,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3370) vpmadd52luq $H0,$R1,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3371) vpmadd52huq $H0,$R1,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3372) vpmadd52luq $H0,$R2,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3373) vpmadd52huq $H0,$R2,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3374)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3375) vpmadd52luq $H1,$S2,$D0lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3376) vpmadd52huq $H1,$S2,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3377) vpmadd52luq $H1,$R0,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3378) vpmadd52huq $H1,$R0,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3379) vpmadd52luq $H1,$R1,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3380) vpmadd52huq $H1,$R1,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3381)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3382) ################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3383) # horizontal addition
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3384)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3385) mov \$1,%eax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3386) kmovw %eax,%k1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3387) vpsrldq \$8,$D0lo,$T0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3388) vpsrldq \$8,$D0hi,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3389) vpsrldq \$8,$D1lo,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3390) vpsrldq \$8,$D1hi,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3391) vpaddq $T0,$D0lo,$D0lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3392) vpaddq $H0,$D0hi,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3393) vpsrldq \$8,$D2lo,$T2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3394) vpsrldq \$8,$D2hi,$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3395) vpaddq $T1,$D1lo,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3396) vpaddq $H1,$D1hi,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3397) vpermq \$0x2,$D0lo,$T0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3398) vpermq \$0x2,$D0hi,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3399) vpaddq $T2,$D2lo,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3400) vpaddq $H2,$D2hi,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3401)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3402) vpermq \$0x2,$D1lo,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3403) vpermq \$0x2,$D1hi,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3404) vpaddq $T0,$D0lo,${D0lo}{%k1}{z}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3405) vpaddq $H0,$D0hi,${D0hi}{%k1}{z}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3406) vpermq \$0x2,$D2lo,$T2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3407) vpermq \$0x2,$D2hi,$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3408) vpaddq $T1,$D1lo,${D1lo}{%k1}{z}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3409) vpaddq $H1,$D1hi,${D1hi}{%k1}{z}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3410) vpaddq $T2,$D2lo,${D2lo}{%k1}{z}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3411) vpaddq $H2,$D2hi,${D2hi}{%k1}{z}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3412)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3413) ################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3414) # partial reduction
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3415) vpsrlq \$44,$D0lo,$tmp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3416) vpsllq \$8,$D0hi,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3417) vpandq $mask44,$D0lo,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3418) vpaddq $tmp,$D0hi,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3419)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3420) vpaddq $D0hi,$D1lo,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3421)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3422) vpsrlq \$44,$D1lo,$tmp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3423) vpsllq \$8,$D1hi,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3424) vpandq $mask44,$D1lo,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3425) vpaddq $tmp,$D1hi,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3426)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3427) vpaddq $D1hi,$D2lo,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3428)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3429) vpsrlq \$42,$D2lo,$tmp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3430) vpsllq \$10,$D2hi,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3431) vpandq $mask42,$D2lo,$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3432) vpaddq $tmp,$D2hi,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3433)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3434) vpaddq $D2hi,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3435) vpsllq \$2,$D2hi,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3436)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3437) vpaddq $D2hi,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3438)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3439) vpsrlq \$44,$H0,$tmp # additional step
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3440) vpandq $mask44,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3441)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3442) vpaddq $tmp,$H1,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3443) # at this point $len is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3444) # either 4*n+2 or 0...
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3445) sub \$2,$len # len-=32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3446) ja .Lblocks_vpmadd52_4x_do
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3447)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3448) vmovq %x#$H0,0($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3449) vmovq %x#$H1,8($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3450) vmovq %x#$H2,16($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3451) vzeroall
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3452)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3453) .Lno_data_vpmadd52_4x:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3454) ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3455) .size poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3456) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3457) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3458) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3459) ########################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3460) # As its name implies, the 8x subroutine processes 8 blocks in parallel.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3461) # This is an intermediate version, as it is used only when the input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3462) # length is either 8*n, 8*n+1 or 8*n+2...
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3463)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3464) my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3465) my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3466) my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3467) my ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3468)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3469) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3470) .type poly1305_blocks_vpmadd52_8x,\@function,4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3471) .align 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3472) poly1305_blocks_vpmadd52_8x:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3473) shr \$4,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3474) jz .Lno_data_vpmadd52_8x # too short
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3475)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3476) shl \$40,$padbit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3477) mov 64($ctx),%r8 # peek on power of the key
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3478)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3479) vmovdqa64 .Lx_mask44(%rip),$mask44
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3480) vmovdqa64 .Lx_mask42(%rip),$mask42
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3481)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3482) test %r8,%r8 # is power value impossible?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3483) js .Linit_vpmadd52 # if it is, then init R[4]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3484)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3485) vmovq 0($ctx),%x#$H0 # load current hash value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3486) vmovq 8($ctx),%x#$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3487) vmovq 16($ctx),%x#$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3488)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3489) .Lblocks_vpmadd52_8x:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3490) ################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3491) # first we calculate more key powers
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3492)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3493) vmovdqu64 128($ctx),$R2 # load 1-3-2-4 powers
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3494) vmovdqu64 160($ctx),$S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3495) vmovdqu64 64($ctx),$R0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3496) vmovdqu64 96($ctx),$R1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3497)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3498) vpsllq \$2,$R2,$S2 # S2 = R2*5*4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3499) vpaddq $R2,$S2,$S2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3500) vpsllq \$2,$S2,$S2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3501)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3502) vpbroadcastq %x#$R2,$RR2 # broadcast 4th power
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3503) vpbroadcastq %x#$R0,$RR0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3504) vpbroadcastq %x#$R1,$RR1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3505)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3506) vpxorq $D0lo,$D0lo,$D0lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3507) vpmadd52luq $RR2,$S1,$D0lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3508) vpxorq $D0hi,$D0hi,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3509) vpmadd52huq $RR2,$S1,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3510) vpxorq $D1lo,$D1lo,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3511) vpmadd52luq $RR2,$S2,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3512) vpxorq $D1hi,$D1hi,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3513) vpmadd52huq $RR2,$S2,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3514) vpxorq $D2lo,$D2lo,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3515) vpmadd52luq $RR2,$R0,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3516) vpxorq $D2hi,$D2hi,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3517) vpmadd52huq $RR2,$R0,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3518)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3519) vpmadd52luq $RR0,$R0,$D0lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3520) vpmadd52huq $RR0,$R0,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3521) vpmadd52luq $RR0,$R1,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3522) vpmadd52huq $RR0,$R1,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3523) vpmadd52luq $RR0,$R2,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3524) vpmadd52huq $RR0,$R2,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3525)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3526) vpmadd52luq $RR1,$S2,$D0lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3527) vpmadd52huq $RR1,$S2,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3528) vpmadd52luq $RR1,$R0,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3529) vpmadd52huq $RR1,$R0,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3530) vpmadd52luq $RR1,$R1,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3531) vpmadd52huq $RR1,$R1,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3532)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3533) ################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3534) # partial reduction
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3535) vpsrlq \$44,$D0lo,$tmp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3536) vpsllq \$8,$D0hi,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3537) vpandq $mask44,$D0lo,$RR0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3538) vpaddq $tmp,$D0hi,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3539)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3540) vpaddq $D0hi,$D1lo,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3541)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3542) vpsrlq \$44,$D1lo,$tmp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3543) vpsllq \$8,$D1hi,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3544) vpandq $mask44,$D1lo,$RR1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3545) vpaddq $tmp,$D1hi,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3546)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3547) vpaddq $D1hi,$D2lo,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3548)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3549) vpsrlq \$42,$D2lo,$tmp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3550) vpsllq \$10,$D2hi,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3551) vpandq $mask42,$D2lo,$RR2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3552) vpaddq $tmp,$D2hi,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3553)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3554) vpaddq $D2hi,$RR0,$RR0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3555) vpsllq \$2,$D2hi,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3556)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3557) vpaddq $D2hi,$RR0,$RR0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3558)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3559) vpsrlq \$44,$RR0,$tmp # additional step
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3560) vpandq $mask44,$RR0,$RR0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3561)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3562) vpaddq $tmp,$RR1,$RR1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3563)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3564) ################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3565) # At this point Rx holds 1324 powers, RRx - 5768, and the goal
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3566) # is 15263748, which reflects how data is loaded...
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3567)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3568) vpunpcklqdq $R2,$RR2,$T2 # 3748
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3569) vpunpckhqdq $R2,$RR2,$R2 # 1526
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3570) vpunpcklqdq $R0,$RR0,$T0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3571) vpunpckhqdq $R0,$RR0,$R0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3572) vpunpcklqdq $R1,$RR1,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3573) vpunpckhqdq $R1,$RR1,$R1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3574) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3575) ######## switch to %zmm
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3576) map(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3577) map(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3578) map(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3579) map(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3580)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3581) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3582) vshufi64x2 \$0x44,$R2,$T2,$RR2 # 15263748
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3583) vshufi64x2 \$0x44,$R0,$T0,$RR0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3584) vshufi64x2 \$0x44,$R1,$T1,$RR1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3585)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3586) vmovdqu64 16*0($inp),$T2 # load data
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3587) vmovdqu64 16*4($inp),$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3588) lea 16*8($inp),$inp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3589)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3590) vpsllq \$2,$RR2,$SS2 # S2 = R2*5*4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3591) vpsllq \$2,$RR1,$SS1 # S1 = R1*5*4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3592) vpaddq $RR2,$SS2,$SS2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3593) vpaddq $RR1,$SS1,$SS1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3594) vpsllq \$2,$SS2,$SS2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3595) vpsllq \$2,$SS1,$SS1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3596)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3597) vpbroadcastq $padbit,$PAD
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3598) vpbroadcastq %x#$mask44,$mask44
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3599) vpbroadcastq %x#$mask42,$mask42
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3600)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3601) vpbroadcastq %x#$SS1,$S1 # broadcast 8th power
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3602) vpbroadcastq %x#$SS2,$S2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3603) vpbroadcastq %x#$RR0,$R0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3604) vpbroadcastq %x#$RR1,$R1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3605) vpbroadcastq %x#$RR2,$R2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3606)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3607) vpunpcklqdq $T3,$T2,$T1 # transpose data
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3608) vpunpckhqdq $T3,$T2,$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3609)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3610) # at this point 64-bit lanes are ordered as 73625140
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3611)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3612) vpsrlq \$24,$T3,$T2 # splat the data
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3613) vporq $PAD,$T2,$T2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3614) vpaddq $T2,$H2,$H2 # accumulate input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3615) vpandq $mask44,$T1,$T0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3616) vpsrlq \$44,$T1,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3617) vpsllq \$20,$T3,$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3618) vporq $T3,$T1,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3619) vpandq $mask44,$T1,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3620)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3621) sub \$8,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3622) jz .Ltail_vpmadd52_8x
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3623) jmp .Loop_vpmadd52_8x
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3624)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3625) .align 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3626) .Loop_vpmadd52_8x:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3627) #vpaddq $T2,$H2,$H2 # accumulate input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3628) vpaddq $T0,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3629) vpaddq $T1,$H1,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3630)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3631) vpxorq $D0lo,$D0lo,$D0lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3632) vpmadd52luq $H2,$S1,$D0lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3633) vpxorq $D0hi,$D0hi,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3634) vpmadd52huq $H2,$S1,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3635) vpxorq $D1lo,$D1lo,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3636) vpmadd52luq $H2,$S2,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3637) vpxorq $D1hi,$D1hi,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3638) vpmadd52huq $H2,$S2,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3639) vpxorq $D2lo,$D2lo,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3640) vpmadd52luq $H2,$R0,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3641) vpxorq $D2hi,$D2hi,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3642) vpmadd52huq $H2,$R0,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3643)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3644) vmovdqu64 16*0($inp),$T2 # load data
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3645) vmovdqu64 16*4($inp),$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3646) lea 16*8($inp),$inp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3647) vpmadd52luq $H0,$R0,$D0lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3648) vpmadd52huq $H0,$R0,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3649) vpmadd52luq $H0,$R1,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3650) vpmadd52huq $H0,$R1,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3651) vpmadd52luq $H0,$R2,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3652) vpmadd52huq $H0,$R2,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3653)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3654) vpunpcklqdq $T3,$T2,$T1 # transpose data
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3655) vpunpckhqdq $T3,$T2,$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3656) vpmadd52luq $H1,$S2,$D0lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3657) vpmadd52huq $H1,$S2,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3658) vpmadd52luq $H1,$R0,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3659) vpmadd52huq $H1,$R0,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3660) vpmadd52luq $H1,$R1,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3661) vpmadd52huq $H1,$R1,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3662)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3663) ################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3664) # partial reduction (interleaved with data splat)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3665) vpsrlq \$44,$D0lo,$tmp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3666) vpsllq \$8,$D0hi,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3667) vpandq $mask44,$D0lo,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3668) vpaddq $tmp,$D0hi,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3669)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3670) vpsrlq \$24,$T3,$T2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3671) vporq $PAD,$T2,$T2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3672) vpaddq $D0hi,$D1lo,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3673)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3674) vpsrlq \$44,$D1lo,$tmp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3675) vpsllq \$8,$D1hi,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3676) vpandq $mask44,$D1lo,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3677) vpaddq $tmp,$D1hi,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3678)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3679) vpandq $mask44,$T1,$T0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3680) vpsrlq \$44,$T1,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3681) vpsllq \$20,$T3,$T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3682) vpaddq $D1hi,$D2lo,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3683)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3684) vpsrlq \$42,$D2lo,$tmp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3685) vpsllq \$10,$D2hi,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3686) vpandq $mask42,$D2lo,$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3687) vpaddq $tmp,$D2hi,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3688)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3689) vpaddq $T2,$H2,$H2 # accumulate input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3690) vpaddq $D2hi,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3691) vpsllq \$2,$D2hi,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3692)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3693) vpaddq $D2hi,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3694) vporq $T3,$T1,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3695) vpandq $mask44,$T1,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3696)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3697) vpsrlq \$44,$H0,$tmp # additional step
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3698) vpandq $mask44,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3699)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3700) vpaddq $tmp,$H1,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3701)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3702) sub \$8,$len # len-=128
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3703) jnz .Loop_vpmadd52_8x
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3704)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3705) .Ltail_vpmadd52_8x:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3706) #vpaddq $T2,$H2,$H2 # accumulate input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3707) vpaddq $T0,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3708) vpaddq $T1,$H1,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3709)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3710) vpxorq $D0lo,$D0lo,$D0lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3711) vpmadd52luq $H2,$SS1,$D0lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3712) vpxorq $D0hi,$D0hi,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3713) vpmadd52huq $H2,$SS1,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3714) vpxorq $D1lo,$D1lo,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3715) vpmadd52luq $H2,$SS2,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3716) vpxorq $D1hi,$D1hi,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3717) vpmadd52huq $H2,$SS2,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3718) vpxorq $D2lo,$D2lo,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3719) vpmadd52luq $H2,$RR0,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3720) vpxorq $D2hi,$D2hi,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3721) vpmadd52huq $H2,$RR0,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3722)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3723) vpmadd52luq $H0,$RR0,$D0lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3724) vpmadd52huq $H0,$RR0,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3725) vpmadd52luq $H0,$RR1,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3726) vpmadd52huq $H0,$RR1,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3727) vpmadd52luq $H0,$RR2,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3728) vpmadd52huq $H0,$RR2,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3729)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3730) vpmadd52luq $H1,$SS2,$D0lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3731) vpmadd52huq $H1,$SS2,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3732) vpmadd52luq $H1,$RR0,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3733) vpmadd52huq $H1,$RR0,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3734) vpmadd52luq $H1,$RR1,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3735) vpmadd52huq $H1,$RR1,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3736)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3737) ################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3738) # horizontal addition
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3739)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3740) mov \$1,%eax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3741) kmovw %eax,%k1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3742) vpsrldq \$8,$D0lo,$T0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3743) vpsrldq \$8,$D0hi,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3744) vpsrldq \$8,$D1lo,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3745) vpsrldq \$8,$D1hi,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3746) vpaddq $T0,$D0lo,$D0lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3747) vpaddq $H0,$D0hi,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3748) vpsrldq \$8,$D2lo,$T2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3749) vpsrldq \$8,$D2hi,$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3750) vpaddq $T1,$D1lo,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3751) vpaddq $H1,$D1hi,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3752) vpermq \$0x2,$D0lo,$T0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3753) vpermq \$0x2,$D0hi,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3754) vpaddq $T2,$D2lo,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3755) vpaddq $H2,$D2hi,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3756)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3757) vpermq \$0x2,$D1lo,$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3758) vpermq \$0x2,$D1hi,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3759) vpaddq $T0,$D0lo,$D0lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3760) vpaddq $H0,$D0hi,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3761) vpermq \$0x2,$D2lo,$T2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3762) vpermq \$0x2,$D2hi,$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3763) vpaddq $T1,$D1lo,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3764) vpaddq $H1,$D1hi,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3765) vextracti64x4 \$1,$D0lo,%y#$T0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3766) vextracti64x4 \$1,$D0hi,%y#$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3767) vpaddq $T2,$D2lo,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3768) vpaddq $H2,$D2hi,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3769)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3770) vextracti64x4 \$1,$D1lo,%y#$T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3771) vextracti64x4 \$1,$D1hi,%y#$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3772) vextracti64x4 \$1,$D2lo,%y#$T2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3773) vextracti64x4 \$1,$D2hi,%y#$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3774) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3775) ######## switch back to %ymm
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3776) map(s/%z/%y/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3777) map(s/%z/%y/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3778) map(s/%z/%y/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3779)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3780) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3781) vpaddq $T0,$D0lo,${D0lo}{%k1}{z}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3782) vpaddq $H0,$D0hi,${D0hi}{%k1}{z}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3783) vpaddq $T1,$D1lo,${D1lo}{%k1}{z}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3784) vpaddq $H1,$D1hi,${D1hi}{%k1}{z}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3785) vpaddq $T2,$D2lo,${D2lo}{%k1}{z}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3786) vpaddq $H2,$D2hi,${D2hi}{%k1}{z}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3787)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3788) ################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3789) # partial reduction
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3790) vpsrlq \$44,$D0lo,$tmp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3791) vpsllq \$8,$D0hi,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3792) vpandq $mask44,$D0lo,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3793) vpaddq $tmp,$D0hi,$D0hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3794)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3795) vpaddq $D0hi,$D1lo,$D1lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3796)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3797) vpsrlq \$44,$D1lo,$tmp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3798) vpsllq \$8,$D1hi,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3799) vpandq $mask44,$D1lo,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3800) vpaddq $tmp,$D1hi,$D1hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3801)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3802) vpaddq $D1hi,$D2lo,$D2lo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3803)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3804) vpsrlq \$42,$D2lo,$tmp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3805) vpsllq \$10,$D2hi,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3806) vpandq $mask42,$D2lo,$H2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3807) vpaddq $tmp,$D2hi,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3808)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3809) vpaddq $D2hi,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3810) vpsllq \$2,$D2hi,$D2hi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3811)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3812) vpaddq $D2hi,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3813)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3814) vpsrlq \$44,$H0,$tmp # additional step
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3815) vpandq $mask44,$H0,$H0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3816)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3817) vpaddq $tmp,$H1,$H1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3818)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3819) ################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3820)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3821) vmovq %x#$H0,0($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3822) vmovq %x#$H1,8($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3823) vmovq %x#$H2,16($ctx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3824) vzeroall
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3825)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3826) .Lno_data_vpmadd52_8x:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3827) ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3828) .size poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3829) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3830) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3831) $code.=<<___; # poly1305_emit_base2_44: pack the three hash words stored at ctx into a 128-bit value, apply the final modular correction, add the nonce and store 16 bytes at mac
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3832) .type poly1305_emit_base2_44,\@function,3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3833) .align 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3834) poly1305_emit_base2_44:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3835) mov 0($ctx),%r8 # load hash value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3836) mov 8($ctx),%r9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3837) mov 16($ctx),%r10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3838)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3839) mov %r9,%rax # second word is split across the low and middle 64-bit words
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3840) shr \$20,%r9 # its bits 20 and up start the middle word
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3841) shl \$44,%rax # its low 20 bits land at bit 44 of the low word
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3842) mov %r10,%rcx # third word is split across the middle and top words
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3843) shr \$40,%r10 # its bits 40 and up form the top word
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3844) shl \$24,%rcx # its low 40 bits land at bit 24 of the middle word
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3845)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3846) add %rax,%r8 # low 64 bits
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3847) adc %rcx,%r9 # middle 64 bits plus carry
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3848) adc \$0,%r10 # top bits plus carry
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3849)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3850) mov %r8,%rax # keep the un-incremented low word
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3851) add \$5,%r8 # compare to modulus
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3852) mov %r9,%rcx # keep the un-incremented middle word (mov does not disturb CF)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3853) adc \$0,%r9 # propagate the carry from the +5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3854) adc \$0,%r10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3855) shr \$2,%r10 # did 130-bit value overflow?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3856) cmovnz %r8,%rax # overflowed: use the value+5 words instead
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3857) cmovnz %r9,%rcx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3858)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3859) add 0($nonce),%rax # accumulate nonce
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3860) adc 8($nonce),%rcx # carry out of bit 127 is discarded
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3861) mov %rax,0($mac) # write result
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3862) mov %rcx,8($mac)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3863)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3864) ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3865) .size poly1305_emit_base2_44,.-poly1305_emit_base2_44
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3866) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3867) } } }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3868) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3869)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3870) if (!$kernel)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3871) { # chacha20-poly1305 helpers: XOR inp with the otp buffer into out, overwrite otp (XOR result when encrypting, input bytes when decrypting), zero-pad otp to a 16-byte multiple and return the advanced otp pointer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3872) my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3873) ("%rdi","%rsi","%rdx","%rcx"); # Unix order
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3874) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3875) .globl xor128_encrypt_n_pad
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3876) .type xor128_encrypt_n_pad,\@abi-omnipotent
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3877) .align 16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3878) xor128_encrypt_n_pad:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3879) sub $otp,$inp # inp becomes an offset relative to otp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3880) sub $otp,$out # likewise out, so otp alone serves as the running pointer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3881) mov $len,%r10 # put len aside
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3882) shr \$4,$len # len / 16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3883) jz .Ltail_enc # under 16 bytes: go straight to the byte loop. NOTE(review): a zero length also lands here and the byte loop would start with a zero counter; presumably callers never pass 0 - confirm
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3884) nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3885) .Loop_enc_xmm:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3886) movdqu ($inp,$otp),%xmm0 # unaligned load of 16 input bytes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3887) pxor ($otp),%xmm0 # XOR with 16 otp bytes (memory operand: otp must be 16-byte aligned)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3888) movdqu %xmm0,($out,$otp) # write the result to out
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3889) movdqa %xmm0,($otp) # and store the same result back over the otp bytes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3890) lea 16($otp),$otp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3891) dec $len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3892) jnz .Loop_enc_xmm
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3893)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3894) and \$15,%r10 # len % 16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3895) jz .Ldone_enc # length was a multiple of 16: no tail, no padding
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3896)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3897) .Ltail_enc:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3898) mov \$16,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3899) sub %r10,$len # 16 - remainder = number of zero pad bytes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3900) xor %eax,%eax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3901) .Loop_enc_byte:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3902) mov ($inp,$otp),%al # input byte
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3903) xor ($otp),%al # XOR with otp byte
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3904) mov %al,($out,$otp) # result to out
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3905) mov %al,($otp) # result also replaces the otp byte
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3906) lea 1($otp),$otp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3907) dec %r10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3908) jnz .Loop_enc_byte
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3909)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3910) xor %eax,%eax # zero byte used for padding
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3911) .Loop_enc_pad:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3912) mov %al,($otp) # zero-fill otp up to the next 16-byte multiple
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3913) lea 1($otp),$otp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3914) dec $len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3915) jnz .Loop_enc_pad
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3916)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3917) .Ldone_enc:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3918) mov $otp,%rax # return the otp pointer just past the written (and padded) data
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3919) ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3920) .size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3921)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3922) .globl xor128_decrypt_n_pad
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3923) .type xor128_decrypt_n_pad,\@abi-omnipotent
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3924) .align 16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3925) xor128_decrypt_n_pad:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3926) sub $otp,$inp # inp becomes an offset relative to otp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3927) sub $otp,$out # likewise out, so otp alone serves as the running pointer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3928) mov $len,%r10 # put len aside
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3929) shr \$4,$len # len / 16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3930) jz .Ltail_dec # under 16 bytes: byte loop only. NOTE(review): a zero length also lands here with a zero byte counter; presumably callers never pass 0 - confirm
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3931) nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3932) .Loop_dec_xmm:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3933) movdqu ($inp,$otp),%xmm0 # unaligned load of 16 input bytes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3934) movdqa ($otp),%xmm1 # aligned load of 16 otp bytes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3935) pxor %xmm0,%xmm1 # xmm1 = input XOR otp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3936) movdqu %xmm1,($out,$otp) # write the result to out
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3937) movdqa %xmm0,($otp) # otp receives the input bytes (the encrypt routine stores the XOR result here instead)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3938) lea 16($otp),$otp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3939) dec $len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3940) jnz .Loop_dec_xmm
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3941)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3942) pxor %xmm1,%xmm1 # wipe the register that held the XOR output
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3943) and \$15,%r10 # len % 16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3944) jz .Ldone_dec # length was a multiple of 16: no tail, no padding
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3945)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3946) .Ltail_dec:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3947) mov \$16,$len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3948) sub %r10,$len # 16 - remainder = number of zero pad bytes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3949) xor %eax,%eax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3950) xor %r11d,%r11d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3951) .Loop_dec_byte:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3952) mov ($inp,$otp),%r11b # input byte
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3953) mov ($otp),%al # otp byte
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3954) xor %r11b,%al # al = input XOR otp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3955) mov %al,($out,$otp) # result to out
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3956) mov %r11b,($otp) # input byte replaces the otp byte
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3957) lea 1($otp),$otp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3958) dec %r10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3959) jnz .Loop_dec_byte
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3960)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3961) xor %eax,%eax # zero byte used for padding
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3962) .Loop_dec_pad:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3963) mov %al,($otp) # zero-fill otp up to the next 16-byte multiple
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3964) lea 1($otp),$otp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3965) dec $len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3966) jnz .Loop_dec_pad
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3967)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3968) .Ldone_dec:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3969) mov $otp,%rax # return the otp pointer just past the written (and padded) data
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3970) ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3971) .size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3972) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3973) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3974)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3975) # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3976) # CONTEXT *context,DISPATCHER_CONTEXT *disp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3977) if ($win64) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3978) $rec="%rcx";
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3979) $frame="%rdx";
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3980) $context="%r8";
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3981) $disp="%r9";
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3982)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3983) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3984) .extern __imp_RtlVirtualUnwind
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3985) .type se_handler,\@abi-omnipotent
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3986) .align 16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3987) se_handler:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3988) push %rsi # Win64 exception handler: recovers six saved general-purpose registers into the CONTEXT record, then continues in the shared tail; first preserve the registers this handler uses
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3989) push %rdi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3990) push %rbx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3991) push %rbp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3992) push %r12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3993) push %r13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3994) push %r14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3995) push %r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3996) pushfq
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3997) sub \$64,%rsp # scratch space; 32..56(%rsp) carry the stack arguments set up in the common tail
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3998)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3999) mov 120($context),%rax # pull context->Rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4000) mov 248($context),%rbx # pull context->Rip
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4001)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4002) mov 8($disp),%rsi # disp->ImageBase
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4003) mov 56($disp),%r11 # disp->HandlerData
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4004)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4005) mov 0(%r11),%r10d # HandlerData[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4006) lea (%rsi,%r10),%r10 # prologue label
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4007) cmp %r10,%rbx # context->Rip<.Lprologue
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4008) jb .Lcommon_seh_tail # before the prologue label: the tail uses context->Rax as the frame address
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4009)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4010) mov 152($context),%rax # pull context->Rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4011)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4012) mov 4(%r11),%r10d # HandlerData[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4013) lea (%rsi,%r10),%r10 # epilogue label
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4014) cmp %r10,%rbx # context->Rip>=.Lepilogue
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4015) jae .Lcommon_seh_tail # at or past the epilogue label: the tail uses context->Rsp as the frame address
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4016)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4017) lea 48(%rax),%rax # step over six 8-byte register save slots
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4018)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4019) mov -8(%rax),%rbx # reload the saved registers from just below the new frame address
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4020) mov -16(%rax),%rbp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4021) mov -24(%rax),%r12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4022) mov -32(%rax),%r13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4023) mov -40(%rax),%r14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4024) mov -48(%rax),%r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4025) mov %rbx,144($context) # restore context->Rbx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4026) mov %rbp,160($context) # restore context->Rbp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4027) mov %r12,216($context) # restore context->R12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4028) mov %r13,224($context) # restore context->R13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4029) mov %r14,232($context) # restore context->R14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4030) mov %r15,240($context) # restore context->R15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4031)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4032) jmp .Lcommon_seh_tail # the shared tail is located inside avx_handler
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4033) .size se_handler,.-se_handler
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4034)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4035) .type avx_handler,\@abi-omnipotent
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4036) .align 16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4037) avx_handler:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4038) push %rsi # Win64 exception handler: copies a saved XMM area back into the CONTEXT record, then runs the tail shared with se_handler; first preserve the registers this handler uses
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4039) push %rdi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4040) push %rbx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4041) push %rbp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4042) push %r12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4043) push %r13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4044) push %r14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4045) push %r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4046) pushfq
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4047) sub \$64,%rsp # scratch space; 32..56(%rsp) carry the stack arguments set up below
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4048)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4049) mov 120($context),%rax # pull context->Rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4050) mov 248($context),%rbx # pull context->Rip
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4051)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4052) mov 8($disp),%rsi # disp->ImageBase
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4053) mov 56($disp),%r11 # disp->HandlerData
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4054)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4055) mov 0(%r11),%r10d # HandlerData[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4056) lea (%rsi,%r10),%r10 # prologue label
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4057) cmp %r10,%rbx # context->Rip<prologue label
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4058) jb .Lcommon_seh_tail # before the prologue label: the tail uses context->Rax as the frame address
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4059)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4060) mov 152($context),%rax # pull context->Rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4061)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4062) mov 4(%r11),%r10d # HandlerData[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4063) lea (%rsi,%r10),%r10 # epilogue label
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4064) cmp %r10,%rbx # context->Rip>=epilogue label
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4065) jae .Lcommon_seh_tail # at or past the epilogue label: the tail uses context->Rsp as the frame address
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4066)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4067) mov 208($context),%rax # pull context->R11
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4068)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4069) lea 0x50(%rax),%rsi # copy source, R11-relative. NOTE(review): 0x50 and 0xf8 must match the frame layout of the AVX routines, which is not visible here
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4070) lea 0xf8(%rax),%rax # frame address handed to the common tail
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4071) lea 512($context),%rdi # &context.Xmm6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4072) mov \$20,%ecx # 20 quadwords = ten 16-byte XMM slots
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4073) .long 0xa548f3fc # cld; rep movsq
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4074)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4075) .Lcommon_seh_tail:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4076) mov 8(%rax),%rdi # fetch the rdi and rsi values stored just above the frame address in rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4077) mov 16(%rax),%rsi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4078) mov %rax,152($context) # restore context->Rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4079) mov %rsi,168($context) # restore context->Rsi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4080) mov %rdi,176($context) # restore context->Rdi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4081)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4082) mov 40($disp),%rdi # disp->ContextRecord
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4083) mov $context,%rsi # context
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4084) mov \$154,%ecx # sizeof(CONTEXT) in quadwords, the unit rep movsq copies
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4085) .long 0xa548f3fc # cld; rep movsq
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4086)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4087) mov $disp,%rsi # keep disp in rsi while the argument registers are loaded
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4088) xor %ecx,%ecx # arg1, UNW_FLAG_NHANDLER
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4089) mov 8(%rsi),%rdx # arg2, disp->ImageBase
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4090) mov 0(%rsi),%r8 # arg3, disp->ControlPc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4091) mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4092) mov 40(%rsi),%r10 # disp->ContextRecord
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4093) lea 56(%rsi),%r11 # &disp->HandlerData
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4094) lea 24(%rsi),%r12 # &disp->EstablisherFrame
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4095) mov %r10,32(%rsp) # arg5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4096) mov %r11,40(%rsp) # arg6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4097) mov %r12,48(%rsp) # arg7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4098) mov %rcx,56(%rsp) # arg8, (NULL)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4099) call *__imp_RtlVirtualUnwind(%rip)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4100)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4101) mov \$1,%eax # ExceptionContinueSearch
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4102) add \$64,%rsp # release the scratch space
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4103) popfq
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4104) pop %r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4105) pop %r14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4106) pop %r13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4107) pop %r12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4108) pop %rbp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4109) pop %rbx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4110) pop %rdi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4111) pop %rsi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4112) ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4113) .size avx_handler,.-avx_handler
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4114)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4115) .section .pdata
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4116) .align 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4117) .rva .LSEH_begin_poly1305_init_x86_64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4118) .rva .LSEH_end_poly1305_init_x86_64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4119) .rva .LSEH_info_poly1305_init_x86_64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4120)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4121) .rva .LSEH_begin_poly1305_blocks_x86_64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4122) .rva .LSEH_end_poly1305_blocks_x86_64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4123) .rva .LSEH_info_poly1305_blocks_x86_64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4124)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4125) .rva .LSEH_begin_poly1305_emit_x86_64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4126) .rva .LSEH_end_poly1305_emit_x86_64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4127) .rva .LSEH_info_poly1305_emit_x86_64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4128) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4129) $code.=<<___ if ($avx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4130) .rva .LSEH_begin_poly1305_blocks_avx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4131) .rva .Lbase2_64_avx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4132) .rva .LSEH_info_poly1305_blocks_avx_1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4133)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4134) .rva .Lbase2_64_avx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4135) .rva .Leven_avx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4136) .rva .LSEH_info_poly1305_blocks_avx_2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4137)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4138) .rva .Leven_avx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4139) .rva .LSEH_end_poly1305_blocks_avx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4140) .rva .LSEH_info_poly1305_blocks_avx_3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4141)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4142) .rva .LSEH_begin_poly1305_emit_avx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4143) .rva .LSEH_end_poly1305_emit_avx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4144) .rva .LSEH_info_poly1305_emit_avx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4145) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4146) $code.=<<___ if ($avx>1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4147) .rva .LSEH_begin_poly1305_blocks_avx2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4148) .rva .Lbase2_64_avx2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4149) .rva .LSEH_info_poly1305_blocks_avx2_1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4150)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4151) .rva .Lbase2_64_avx2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4152) .rva .Leven_avx2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4153) .rva .LSEH_info_poly1305_blocks_avx2_2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4154)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4155) .rva .Leven_avx2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4156) .rva .LSEH_end_poly1305_blocks_avx2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4157) .rva .LSEH_info_poly1305_blocks_avx2_3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4158) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4159) $code.=<<___ if ($avx>2);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4160) .rva .LSEH_begin_poly1305_blocks_avx512
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4161) .rva .LSEH_end_poly1305_blocks_avx512
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4162) .rva .LSEH_info_poly1305_blocks_avx512
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4163) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4164) $code.=<<___;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4165) .section .xdata
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4166) .align 8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4167) .LSEH_info_poly1305_init_x86_64:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4168) .byte 9,0,0,0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4169) .rva se_handler
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4170) .rva .LSEH_begin_poly1305_init_x86_64,.LSEH_begin_poly1305_init_x86_64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4171)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4172) .LSEH_info_poly1305_blocks_x86_64:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4173) .byte 9,0,0,0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4174) .rva se_handler
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4175) .rva .Lblocks_body,.Lblocks_epilogue
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4176)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4177) .LSEH_info_poly1305_emit_x86_64:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4178) .byte 9,0,0,0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4179) .rva se_handler
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4180) .rva .LSEH_begin_poly1305_emit_x86_64,.LSEH_begin_poly1305_emit_x86_64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4181) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4182) $code.=<<___ if ($avx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4183) .LSEH_info_poly1305_blocks_avx_1:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4184) .byte 9,0,0,0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4185) .rva se_handler
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4186) .rva .Lblocks_avx_body,.Lblocks_avx_epilogue # HandlerData[]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4187)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4188) .LSEH_info_poly1305_blocks_avx_2:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4189) .byte 9,0,0,0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4190) .rva se_handler
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4191) .rva .Lbase2_64_avx_body,.Lbase2_64_avx_epilogue # HandlerData[]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4192)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4193) .LSEH_info_poly1305_blocks_avx_3:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4194) .byte 9,0,0,0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4195) .rva avx_handler
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4196) .rva .Ldo_avx_body,.Ldo_avx_epilogue # HandlerData[]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4197)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4198) .LSEH_info_poly1305_emit_avx:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4199) .byte 9,0,0,0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4200) .rva se_handler
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4201) .rva .LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4202) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4203) $code.=<<___ if ($avx>1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4204) .LSEH_info_poly1305_blocks_avx2_1:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4205) .byte 9,0,0,0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4206) .rva se_handler
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4207) .rva .Lblocks_avx2_body,.Lblocks_avx2_epilogue # HandlerData[]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4208)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4209) .LSEH_info_poly1305_blocks_avx2_2:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4210) .byte 9,0,0,0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4211) .rva se_handler
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4212) .rva .Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue # HandlerData[]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4213)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4214) .LSEH_info_poly1305_blocks_avx2_3:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4215) .byte 9,0,0,0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4216) .rva avx_handler
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4217) .rva .Ldo_avx2_body,.Ldo_avx2_epilogue # HandlerData[]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4218) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4219) $code.=<<___ if ($avx>2);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4220) .LSEH_info_poly1305_blocks_avx512:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4221) .byte 9,0,0,0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4222) .rva avx_handler
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4223) .rva .Ldo_avx512_body,.Ldo_avx512_epilogue # HandlerData[]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4224) ___
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4225) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4226)
# Copy this script's leading comment block into the generated output,
# rewriting each leading '#' as '//'.  The '#!' interpreter line is skipped,
# empty lines are passed through unchanged, and copying stops at the first
# line that is neither a comment nor empty.
#
# Three-argument open with a lexical handle keeps characters in $0 from being
# interpreted as an open mode or a pipe.  An unreadable script is reported on
# STDERR but stays non-fatal: the output then simply lacks the header.
if (open(my $self, '<', $0)) {
	while (my $line = <$self>) {
		next if ($line =~ /^#!/);
		# The substitution both tests for a comment line and converts it.
		last if (!($line =~ s/^#/\/\//) and $line !~ /^$/);
		print $line;
	}
	close $self;
} else {
	warn "$0: cannot reopen script to copy header comment: $!\n";
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4234)
# Post-process the assembly text accumulated in $code, one line at a time,
# and write the result to STDOUT.
foreach (split('\n',$code)) {
	# Replace every `...` span with the value of its contents evaluated as
	# Perl.  NOTE(review): $@ is not inspected, so a span whose evaluation
	# fails is silently replaced by an empty string.
	s/\`([^\`]*)\`/eval($1)/ge;

	# Resolve the "#d" suffix on general-purpose registers:
	# %rax#d -> %eax, %r8#d -> %r8d.
	s/%r([a-z]+)#d/%e$1/g;
	s/%r([0-9]+)#d/%r$1d/g;

	# Collapse "%x#%y", "%x#%z", "%y#%z", "%z#%y" and "%z#%z" to the part
	# before the '#', so that e.g. "%x#%ymm5" becomes "%xmm5".  Only the
	# first of the three rewrites that matches is applied to a line.
	s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g;

	# $kernel is set elsewhere in the script (presumably for in-kernel
	# builds -- confirm against the option parsing at the top of the file).
	if ($kernel) {
		# Drop a trailing ",<digits>" field from .type directives.
		s/(^\.type.*),[0-9]+$/$1/;
		# Rewrite a trailing ",@abi-omnipotent" on .type to ",@function".
		s/(^\.type.*),\@abi-omnipotent+$/$1,\@function/;
		# Lines starting with .cfi are not emitted at all.
		next if /^\.cfi.*/;
	}

	print $_,"\n";
}

# Buffered write errors (full disk, closed pipe) only surface when the handle
# is closed; fail loudly so a truncated output file is never mistaken for a
# successful run.
close STDOUT or die "error closing STDOUT: $!";