Orange Pi5 kernel

Deprecated Linux kernel 5.10.110 for OrangePi 5/5B/5+ boards

3 Commits   0 Branches   0 Tags
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   1) // SPDX-License-Identifier: GPL-2.0 OR MIT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   2) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   3)  * Copyright (C) 2016-2017 INRIA and Microsoft Corporation.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   4)  * Copyright (C) 2018-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   5)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   6)  * This is a machine-generated formally verified implementation of Curve25519
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   7)  * ECDH from: <https://github.com/mitls/hacl-star>. Though originally machine
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   8)  * generated, it has been tweaked to be suitable for use in the kernel. It is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   9)  * optimized for 64-bit machines that can efficiently work with 128-bit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  10)  * integer types.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  11)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  12) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  13) #include <asm/unaligned.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  14) #include <crypto/curve25519.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  15) #include <linux/string.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  16) 
/* Unsigned 128-bit integer; holds the full-width products of two 64-bit limbs. */
typedef unsigned __int128 u128;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  18) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  19) static __always_inline u64 u64_eq_mask(u64 a, u64 b)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  20) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  21) 	u64 x = a ^ b;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  22) 	u64 minus_x = ~x + (u64)1U;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  23) 	u64 x_or_minus_x = x | minus_x;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  24) 	u64 xnx = x_or_minus_x >> (u32)63U;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  25) 	u64 c = xnx - (u64)1U;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  26) 	return c;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  27) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  28) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  29) static __always_inline u64 u64_gte_mask(u64 a, u64 b)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  30) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  31) 	u64 x = a;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  32) 	u64 y = b;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  33) 	u64 x_xor_y = x ^ y;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  34) 	u64 x_sub_y = x - y;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  35) 	u64 x_sub_y_xor_y = x_sub_y ^ y;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  36) 	u64 q = x_xor_y | x_sub_y_xor_y;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  37) 	u64 x_xor_q = x ^ q;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  38) 	u64 x_xor_q_ = x_xor_q >> (u32)63U;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  39) 	u64 c = x_xor_q_ - (u64)1U;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  40) 	return c;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  41) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  42) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  43) static __always_inline void modulo_carry_top(u64 *b)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  44) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  45) 	u64 b4 = b[4];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  46) 	u64 b0 = b[0];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  47) 	u64 b4_ = b4 & 0x7ffffffffffffLLU;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  48) 	u64 b0_ = b0 + 19 * (b4 >> 51);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  49) 	b[4] = b4_;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  50) 	b[0] = b0_;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  51) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  52) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  53) static __always_inline void fproduct_copy_from_wide_(u64 *output, u128 *input)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  54) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  55) 	{
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  56) 		u128 xi = input[0];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  57) 		output[0] = ((u64)(xi));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  58) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  59) 	{
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  60) 		u128 xi = input[1];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  61) 		output[1] = ((u64)(xi));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  62) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  63) 	{
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  64) 		u128 xi = input[2];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  65) 		output[2] = ((u64)(xi));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  66) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  67) 	{
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  68) 		u128 xi = input[3];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  69) 		output[3] = ((u64)(xi));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  70) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  71) 	{
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  72) 		u128 xi = input[4];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  73) 		output[4] = ((u64)(xi));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  74) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  75) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  76) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  77) static __always_inline void
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  78) fproduct_sum_scalar_multiplication_(u128 *output, u64 *input, u64 s)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  79) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  80) 	output[0] += (u128)input[0] * s;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  81) 	output[1] += (u128)input[1] * s;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  82) 	output[2] += (u128)input[2] * s;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  83) 	output[3] += (u128)input[3] * s;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  84) 	output[4] += (u128)input[4] * s;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  85) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  86) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  87) static __always_inline void fproduct_carry_wide_(u128 *tmp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  88) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  89) 	{
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  90) 		u32 ctr = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  91) 		u128 tctr = tmp[ctr];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  92) 		u128 tctrp1 = tmp[ctr + 1];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  93) 		u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  94) 		u128 c = ((tctr) >> (51));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  95) 		tmp[ctr] = ((u128)(r0));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  96) 		tmp[ctr + 1] = ((tctrp1) + (c));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  97) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  98) 	{
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  99) 		u32 ctr = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 100) 		u128 tctr = tmp[ctr];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 101) 		u128 tctrp1 = tmp[ctr + 1];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 102) 		u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 103) 		u128 c = ((tctr) >> (51));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 104) 		tmp[ctr] = ((u128)(r0));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 105) 		tmp[ctr + 1] = ((tctrp1) + (c));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 106) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 107) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 108) 	{
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 109) 		u32 ctr = 2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 110) 		u128 tctr = tmp[ctr];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 111) 		u128 tctrp1 = tmp[ctr + 1];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 112) 		u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 113) 		u128 c = ((tctr) >> (51));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 114) 		tmp[ctr] = ((u128)(r0));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 115) 		tmp[ctr + 1] = ((tctrp1) + (c));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 116) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 117) 	{
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 118) 		u32 ctr = 3;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 119) 		u128 tctr = tmp[ctr];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 120) 		u128 tctrp1 = tmp[ctr + 1];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 121) 		u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 122) 		u128 c = ((tctr) >> (51));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 123) 		tmp[ctr] = ((u128)(r0));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 124) 		tmp[ctr + 1] = ((tctrp1) + (c));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 125) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 126) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 127) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 128) static __always_inline void fmul_shift_reduce(u64 *output)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 129) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 130) 	u64 tmp = output[4];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 131) 	u64 b0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 132) 	{
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 133) 		u32 ctr = 5 - 0 - 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 134) 		u64 z = output[ctr - 1];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 135) 		output[ctr] = z;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 136) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 137) 	{
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 138) 		u32 ctr = 5 - 1 - 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 139) 		u64 z = output[ctr - 1];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 140) 		output[ctr] = z;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 141) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 142) 	{
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 143) 		u32 ctr = 5 - 2 - 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 144) 		u64 z = output[ctr - 1];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 145) 		output[ctr] = z;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 146) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 147) 	{
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 148) 		u32 ctr = 5 - 3 - 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 149) 		u64 z = output[ctr - 1];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 150) 		output[ctr] = z;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 151) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 152) 	output[0] = tmp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 153) 	b0 = output[0];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 154) 	output[0] = 19 * b0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 155) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 156) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 157) static __always_inline void fmul_mul_shift_reduce_(u128 *output, u64 *input,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 158) 						   u64 *input21)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 159) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 160) 	u32 i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 161) 	u64 input2i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 162) 	{
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 163) 		u64 input2i = input21[0];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 164) 		fproduct_sum_scalar_multiplication_(output, input, input2i);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 165) 		fmul_shift_reduce(input);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 166) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 167) 	{
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 168) 		u64 input2i = input21[1];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 169) 		fproduct_sum_scalar_multiplication_(output, input, input2i);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 170) 		fmul_shift_reduce(input);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 171) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 172) 	{
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 173) 		u64 input2i = input21[2];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 174) 		fproduct_sum_scalar_multiplication_(output, input, input2i);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 175) 		fmul_shift_reduce(input);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 176) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 177) 	{
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 178) 		u64 input2i = input21[3];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 179) 		fproduct_sum_scalar_multiplication_(output, input, input2i);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 180) 		fmul_shift_reduce(input);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 181) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 182) 	i = 4;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 183) 	input2i = input21[i];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 184) 	fproduct_sum_scalar_multiplication_(output, input, input2i);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 185) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 186) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 187) static __always_inline void fmul_fmul(u64 *output, u64 *input, u64 *input21)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 188) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 189) 	u64 tmp[5] = { input[0], input[1], input[2], input[3], input[4] };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 190) 	{
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 191) 		u128 b4;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 192) 		u128 b0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 193) 		u128 b4_;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 194) 		u128 b0_;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 195) 		u64 i0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 196) 		u64 i1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 197) 		u64 i0_;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 198) 		u64 i1_;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 199) 		u128 t[5] = { 0 };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 200) 		fmul_mul_shift_reduce_(t, tmp, input21);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 201) 		fproduct_carry_wide_(t);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 202) 		b4 = t[4];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 203) 		b0 = t[0];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 204) 		b4_ = ((b4) & (((u128)(0x7ffffffffffffLLU))));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 205) 		b0_ = ((b0) + (((u128)(19) * (((u64)(((b4) >> (51))))))));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 206) 		t[4] = b4_;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 207) 		t[0] = b0_;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 208) 		fproduct_copy_from_wide_(output, t);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 209) 		i0 = output[0];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 210) 		i1 = output[1];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 211) 		i0_ = i0 & 0x7ffffffffffffLLU;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 212) 		i1_ = i1 + (i0 >> 51);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 213) 		output[0] = i0_;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 214) 		output[1] = i1_;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 215) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 216) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 217) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 218) static __always_inline void fsquare_fsquare__(u128 *tmp, u64 *output)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 219) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 220) 	u64 r0 = output[0];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 221) 	u64 r1 = output[1];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 222) 	u64 r2 = output[2];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 223) 	u64 r3 = output[3];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 224) 	u64 r4 = output[4];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 225) 	u64 d0 = r0 * 2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 226) 	u64 d1 = r1 * 2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 227) 	u64 d2 = r2 * 2 * 19;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 228) 	u64 d419 = r4 * 19;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 229) 	u64 d4 = d419 * 2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 230) 	u128 s0 = ((((((u128)(r0) * (r0))) + (((u128)(d4) * (r1))))) +
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 231) 		   (((u128)(d2) * (r3))));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 232) 	u128 s1 = ((((((u128)(d0) * (r1))) + (((u128)(d4) * (r2))))) +
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 233) 		   (((u128)(r3 * 19) * (r3))));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 234) 	u128 s2 = ((((((u128)(d0) * (r2))) + (((u128)(r1) * (r1))))) +
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 235) 		   (((u128)(d4) * (r3))));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 236) 	u128 s3 = ((((((u128)(d0) * (r3))) + (((u128)(d1) * (r2))))) +
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 237) 		   (((u128)(r4) * (d419))));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 238) 	u128 s4 = ((((((u128)(d0) * (r4))) + (((u128)(d1) * (r3))))) +
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 239) 		   (((u128)(r2) * (r2))));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 240) 	tmp[0] = s0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 241) 	tmp[1] = s1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 242) 	tmp[2] = s2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 243) 	tmp[3] = s3;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 244) 	tmp[4] = s4;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 245) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 246) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 247) static __always_inline void fsquare_fsquare_(u128 *tmp, u64 *output)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 248) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 249) 	u128 b4;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 250) 	u128 b0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 251) 	u128 b4_;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 252) 	u128 b0_;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 253) 	u64 i0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 254) 	u64 i1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 255) 	u64 i0_;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 256) 	u64 i1_;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 257) 	fsquare_fsquare__(tmp, output);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 258) 	fproduct_carry_wide_(tmp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 259) 	b4 = tmp[4];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 260) 	b0 = tmp[0];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 261) 	b4_ = ((b4) & (((u128)(0x7ffffffffffffLLU))));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 262) 	b0_ = ((b0) + (((u128)(19) * (((u64)(((b4) >> (51))))))));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 263) 	tmp[4] = b4_;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 264) 	tmp[0] = b0_;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 265) 	fproduct_copy_from_wide_(output, tmp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 266) 	i0 = output[0];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 267) 	i1 = output[1];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 268) 	i0_ = i0 & 0x7ffffffffffffLLU;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 269) 	i1_ = i1 + (i0 >> 51);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 270) 	output[0] = i0_;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 271) 	output[1] = i1_;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 272) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 273) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 274) static __always_inline void fsquare_fsquare_times_(u64 *output, u128 *tmp,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 275) 						   u32 count1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 276) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 277) 	u32 i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 278) 	fsquare_fsquare_(tmp, output);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 279) 	for (i = 1; i < count1; ++i)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 280) 		fsquare_fsquare_(tmp, output);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 281) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 282) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 283) static __always_inline void fsquare_fsquare_times(u64 *output, u64 *input,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 284) 						  u32 count1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 285) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 286) 	u128 t[5];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 287) 	memcpy(output, input, 5 * sizeof(*input));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 288) 	fsquare_fsquare_times_(output, t, count1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 289) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 290) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 291) static __always_inline void fsquare_fsquare_times_inplace(u64 *output,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 292) 							  u32 count1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 293) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 294) 	u128 t[5];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 295) 	fsquare_fsquare_times_(output, t, count1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 296) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 297) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 298) static __always_inline void crecip_crecip(u64 *out, u64 *z)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 299) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 300) 	u64 buf[20] = { 0 };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 301) 	u64 *a0 = buf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 302) 	u64 *t00 = buf + 5;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 303) 	u64 *b0 = buf + 10;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 304) 	u64 *t01;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 305) 	u64 *b1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 306) 	u64 *c0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 307) 	u64 *a;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 308) 	u64 *t0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 309) 	u64 *b;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 310) 	u64 *c;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 311) 	fsquare_fsquare_times(a0, z, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 312) 	fsquare_fsquare_times(t00, a0, 2);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 313) 	fmul_fmul(b0, t00, z);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 314) 	fmul_fmul(a0, b0, a0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 315) 	fsquare_fsquare_times(t00, a0, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 316) 	fmul_fmul(b0, t00, b0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 317) 	fsquare_fsquare_times(t00, b0, 5);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 318) 	t01 = buf + 5;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 319) 	b1 = buf + 10;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 320) 	c0 = buf + 15;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 321) 	fmul_fmul(b1, t01, b1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 322) 	fsquare_fsquare_times(t01, b1, 10);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 323) 	fmul_fmul(c0, t01, b1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 324) 	fsquare_fsquare_times(t01, c0, 20);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 325) 	fmul_fmul(t01, t01, c0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 326) 	fsquare_fsquare_times_inplace(t01, 10);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 327) 	fmul_fmul(b1, t01, b1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 328) 	fsquare_fsquare_times(t01, b1, 50);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 329) 	a = buf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 330) 	t0 = buf + 5;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 331) 	b = buf + 10;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 332) 	c = buf + 15;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 333) 	fmul_fmul(c, t0, b);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 334) 	fsquare_fsquare_times(t0, c, 100);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 335) 	fmul_fmul(t0, t0, c);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 336) 	fsquare_fsquare_times_inplace(t0, 50);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 337) 	fmul_fmul(t0, t0, b);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 338) 	fsquare_fsquare_times_inplace(t0, 5);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 339) 	fmul_fmul(out, t0, a);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 340) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 341) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 342) static __always_inline void fsum(u64 *a, u64 *b)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 343) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 344) 	a[0] += b[0];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 345) 	a[1] += b[1];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 346) 	a[2] += b[2];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 347) 	a[3] += b[3];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 348) 	a[4] += b[4];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 349) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 350) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 351) static __always_inline void fdifference(u64 *a, u64 *b)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 352) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 353) 	u64 tmp[5] = { 0 };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 354) 	u64 b0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 355) 	u64 b1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 356) 	u64 b2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 357) 	u64 b3;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 358) 	u64 b4;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 359) 	memcpy(tmp, b, 5 * sizeof(*b));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 360) 	b0 = tmp[0];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 361) 	b1 = tmp[1];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 362) 	b2 = tmp[2];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 363) 	b3 = tmp[3];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 364) 	b4 = tmp[4];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 365) 	tmp[0] = b0 + 0x3fffffffffff68LLU;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 366) 	tmp[1] = b1 + 0x3ffffffffffff8LLU;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 367) 	tmp[2] = b2 + 0x3ffffffffffff8LLU;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 368) 	tmp[3] = b3 + 0x3ffffffffffff8LLU;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 369) 	tmp[4] = b4 + 0x3ffffffffffff8LLU;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 370) 	{
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 371) 		u64 xi = a[0];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 372) 		u64 yi = tmp[0];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 373) 		a[0] = yi - xi;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 374) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 375) 	{
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 376) 		u64 xi = a[1];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 377) 		u64 yi = tmp[1];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 378) 		a[1] = yi - xi;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 379) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 380) 	{
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 381) 		u64 xi = a[2];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 382) 		u64 yi = tmp[2];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 383) 		a[2] = yi - xi;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 384) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 385) 	{
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 386) 		u64 xi = a[3];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 387) 		u64 yi = tmp[3];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 388) 		a[3] = yi - xi;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 389) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 390) 	{
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 391) 		u64 xi = a[4];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 392) 		u64 yi = tmp[4];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 393) 		a[4] = yi - xi;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 394) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 395) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 396) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 397) static __always_inline void fscalar(u64 *output, u64 *b, u64 s)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 398) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 399) 	u128 tmp[5];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 400) 	u128 b4;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 401) 	u128 b0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 402) 	u128 b4_;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 403) 	u128 b0_;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 404) 	{
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 405) 		u64 xi = b[0];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 406) 		tmp[0] = ((u128)(xi) * (s));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 407) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 408) 	{
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 409) 		u64 xi = b[1];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 410) 		tmp[1] = ((u128)(xi) * (s));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 411) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 412) 	{
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 413) 		u64 xi = b[2];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 414) 		tmp[2] = ((u128)(xi) * (s));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 415) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 416) 	{
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 417) 		u64 xi = b[3];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 418) 		tmp[3] = ((u128)(xi) * (s));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 419) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 420) 	{
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 421) 		u64 xi = b[4];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 422) 		tmp[4] = ((u128)(xi) * (s));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 423) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 424) 	fproduct_carry_wide_(tmp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 425) 	b4 = tmp[4];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 426) 	b0 = tmp[0];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 427) 	b4_ = ((b4) & (((u128)(0x7ffffffffffffLLU))));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 428) 	b0_ = ((b0) + (((u128)(19) * (((u64)(((b4) >> (51))))))));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 429) 	tmp[4] = b4_;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 430) 	tmp[0] = b0_;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 431) 	fproduct_copy_from_wide_(output, tmp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 432) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 433) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 434) static __always_inline void fmul(u64 *output, u64 *a, u64 *b)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 435) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 436) 	fmul_fmul(output, a, b);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 437) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 438) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 439) static __always_inline void crecip(u64 *output, u64 *input)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 440) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 441) 	crecip_crecip(output, input);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 442) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 443) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 444) static __always_inline void point_swap_conditional_step(u64 *a, u64 *b,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 445) 							u64 swap1, u32 ctr)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 446) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 447) 	u32 i = ctr - 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 448) 	u64 ai = a[i];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 449) 	u64 bi = b[i];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 450) 	u64 x = swap1 & (ai ^ bi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 451) 	u64 ai1 = ai ^ x;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 452) 	u64 bi1 = bi ^ x;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 453) 	a[i] = ai1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 454) 	b[i] = bi1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 455) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 456) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 457) static __always_inline void point_swap_conditional5(u64 *a, u64 *b, u64 swap1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 458) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 459) 	point_swap_conditional_step(a, b, swap1, 5);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 460) 	point_swap_conditional_step(a, b, swap1, 4);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 461) 	point_swap_conditional_step(a, b, swap1, 3);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 462) 	point_swap_conditional_step(a, b, swap1, 2);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 463) 	point_swap_conditional_step(a, b, swap1, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 464) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 465) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 466) static __always_inline void point_swap_conditional(u64 *a, u64 *b, u64 iswap)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 467) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 468) 	u64 swap1 = 0 - iswap;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 469) 	point_swap_conditional5(a, b, swap1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 470) 	point_swap_conditional5(a + 5, b + 5, swap1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 471) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 472) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 473) static __always_inline void point_copy(u64 *output, u64 *input)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 474) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 475) 	memcpy(output, input, 5 * sizeof(*input));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 476) 	memcpy(output + 5, input + 5, 5 * sizeof(*input));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 477) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 478) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 479) static __always_inline void addanddouble_fmonty(u64 *pp, u64 *ppq, u64 *p,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 480) 						u64 *pq, u64 *qmqp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 481) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 482) 	u64 *qx = qmqp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 483) 	u64 *x2 = pp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 484) 	u64 *z2 = pp + 5;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 485) 	u64 *x3 = ppq;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 486) 	u64 *z3 = ppq + 5;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 487) 	u64 *x = p;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 488) 	u64 *z = p + 5;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 489) 	u64 *xprime = pq;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 490) 	u64 *zprime = pq + 5;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 491) 	u64 buf[40] = { 0 };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 492) 	u64 *origx = buf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 493) 	u64 *origxprime0 = buf + 5;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 494) 	u64 *xxprime0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 495) 	u64 *zzprime0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 496) 	u64 *origxprime;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 497) 	xxprime0 = buf + 25;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 498) 	zzprime0 = buf + 30;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 499) 	memcpy(origx, x, 5 * sizeof(*x));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 500) 	fsum(x, z);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 501) 	fdifference(z, origx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 502) 	memcpy(origxprime0, xprime, 5 * sizeof(*xprime));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 503) 	fsum(xprime, zprime);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 504) 	fdifference(zprime, origxprime0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 505) 	fmul(xxprime0, xprime, z);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 506) 	fmul(zzprime0, x, zprime);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 507) 	origxprime = buf + 5;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 508) 	{
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 509) 		u64 *xx0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 510) 		u64 *zz0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 511) 		u64 *xxprime;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 512) 		u64 *zzprime;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 513) 		u64 *zzzprime;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 514) 		xx0 = buf + 15;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 515) 		zz0 = buf + 20;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 516) 		xxprime = buf + 25;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 517) 		zzprime = buf + 30;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 518) 		zzzprime = buf + 35;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 519) 		memcpy(origxprime, xxprime, 5 * sizeof(*xxprime));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 520) 		fsum(xxprime, zzprime);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 521) 		fdifference(zzprime, origxprime);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 522) 		fsquare_fsquare_times(x3, xxprime, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 523) 		fsquare_fsquare_times(zzzprime, zzprime, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 524) 		fmul(z3, zzzprime, qx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 525) 		fsquare_fsquare_times(xx0, x, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 526) 		fsquare_fsquare_times(zz0, z, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 527) 		{
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 528) 			u64 *zzz;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 529) 			u64 *xx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 530) 			u64 *zz;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 531) 			u64 scalar;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 532) 			zzz = buf + 10;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 533) 			xx = buf + 15;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 534) 			zz = buf + 20;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 535) 			fmul(x2, xx, zz);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 536) 			fdifference(zz, xx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 537) 			scalar = 121665;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 538) 			fscalar(zzz, zz, scalar);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 539) 			fsum(zzz, xx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 540) 			fmul(z2, zzz, zz);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 541) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 542) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 543) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 544) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 545) static __always_inline void
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 546) ladder_smallloop_cmult_small_loop_step(u64 *nq, u64 *nqpq, u64 *nq2, u64 *nqpq2,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 547) 				       u64 *q, u8 byt)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 548) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 549) 	u64 bit0 = (u64)(byt >> 7);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 550) 	u64 bit;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 551) 	point_swap_conditional(nq, nqpq, bit0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 552) 	addanddouble_fmonty(nq2, nqpq2, nq, nqpq, q);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 553) 	bit = (u64)(byt >> 7);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 554) 	point_swap_conditional(nq2, nqpq2, bit);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 555) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 556) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 557) static __always_inline void
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 558) ladder_smallloop_cmult_small_loop_double_step(u64 *nq, u64 *nqpq, u64 *nq2,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 559) 					      u64 *nqpq2, u64 *q, u8 byt)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 560) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 561) 	u8 byt1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 562) 	ladder_smallloop_cmult_small_loop_step(nq, nqpq, nq2, nqpq2, q, byt);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 563) 	byt1 = byt << 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 564) 	ladder_smallloop_cmult_small_loop_step(nq2, nqpq2, nq, nqpq, q, byt1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 565) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 566) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 567) static __always_inline void
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 568) ladder_smallloop_cmult_small_loop(u64 *nq, u64 *nqpq, u64 *nq2, u64 *nqpq2,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 569) 				  u64 *q, u8 byt, u32 i)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 570) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 571) 	while (i--) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 572) 		ladder_smallloop_cmult_small_loop_double_step(nq, nqpq, nq2,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 573) 							      nqpq2, q, byt);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 574) 		byt <<= 2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 575) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 576) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 577) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 578) static __always_inline void ladder_bigloop_cmult_big_loop(u8 *n1, u64 *nq,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 579) 							  u64 *nqpq, u64 *nq2,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 580) 							  u64 *nqpq2, u64 *q,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 581) 							  u32 i)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 582) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 583) 	while (i--) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 584) 		u8 byte = n1[i];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 585) 		ladder_smallloop_cmult_small_loop(nq, nqpq, nq2, nqpq2, q,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 586) 						  byte, 4);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 587) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 588) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 589) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 590) static void ladder_cmult(u64 *result, u8 *n1, u64 *q)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 591) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 592) 	u64 point_buf[40] = { 0 };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 593) 	u64 *nq = point_buf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 594) 	u64 *nqpq = point_buf + 10;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 595) 	u64 *nq2 = point_buf + 20;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 596) 	u64 *nqpq2 = point_buf + 30;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 597) 	point_copy(nqpq, q);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 598) 	nq[0] = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 599) 	ladder_bigloop_cmult_big_loop(n1, nq, nqpq, nq2, nqpq2, q, 32);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 600) 	point_copy(result, nq);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 601) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 602) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 603) static __always_inline void format_fexpand(u64 *output, const u8 *input)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 604) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 605) 	const u8 *x00 = input + 6;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 606) 	const u8 *x01 = input + 12;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 607) 	const u8 *x02 = input + 19;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 608) 	const u8 *x0 = input + 24;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 609) 	u64 i0, i1, i2, i3, i4, output0, output1, output2, output3, output4;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 610) 	i0 = get_unaligned_le64(input);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 611) 	i1 = get_unaligned_le64(x00);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 612) 	i2 = get_unaligned_le64(x01);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 613) 	i3 = get_unaligned_le64(x02);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 614) 	i4 = get_unaligned_le64(x0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 615) 	output0 = i0 & 0x7ffffffffffffLLU;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 616) 	output1 = i1 >> 3 & 0x7ffffffffffffLLU;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 617) 	output2 = i2 >> 6 & 0x7ffffffffffffLLU;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 618) 	output3 = i3 >> 1 & 0x7ffffffffffffLLU;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 619) 	output4 = i4 >> 12 & 0x7ffffffffffffLLU;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 620) 	output[0] = output0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 621) 	output[1] = output1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 622) 	output[2] = output2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 623) 	output[3] = output3;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 624) 	output[4] = output4;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 625) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 626) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 627) static __always_inline void format_fcontract_first_carry_pass(u64 *input)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 628) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 629) 	u64 t0 = input[0];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 630) 	u64 t1 = input[1];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 631) 	u64 t2 = input[2];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 632) 	u64 t3 = input[3];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 633) 	u64 t4 = input[4];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 634) 	u64 t1_ = t1 + (t0 >> 51);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 635) 	u64 t0_ = t0 & 0x7ffffffffffffLLU;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 636) 	u64 t2_ = t2 + (t1_ >> 51);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 637) 	u64 t1__ = t1_ & 0x7ffffffffffffLLU;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 638) 	u64 t3_ = t3 + (t2_ >> 51);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 639) 	u64 t2__ = t2_ & 0x7ffffffffffffLLU;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 640) 	u64 t4_ = t4 + (t3_ >> 51);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 641) 	u64 t3__ = t3_ & 0x7ffffffffffffLLU;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 642) 	input[0] = t0_;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 643) 	input[1] = t1__;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 644) 	input[2] = t2__;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 645) 	input[3] = t3__;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 646) 	input[4] = t4_;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 647) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 648) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 649) static __always_inline void format_fcontract_first_carry_full(u64 *input)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 650) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 651) 	format_fcontract_first_carry_pass(input);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 652) 	modulo_carry_top(input);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 653) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 654) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 655) static __always_inline void format_fcontract_second_carry_pass(u64 *input)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 656) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 657) 	u64 t0 = input[0];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 658) 	u64 t1 = input[1];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 659) 	u64 t2 = input[2];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 660) 	u64 t3 = input[3];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 661) 	u64 t4 = input[4];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 662) 	u64 t1_ = t1 + (t0 >> 51);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 663) 	u64 t0_ = t0 & 0x7ffffffffffffLLU;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 664) 	u64 t2_ = t2 + (t1_ >> 51);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 665) 	u64 t1__ = t1_ & 0x7ffffffffffffLLU;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 666) 	u64 t3_ = t3 + (t2_ >> 51);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 667) 	u64 t2__ = t2_ & 0x7ffffffffffffLLU;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 668) 	u64 t4_ = t4 + (t3_ >> 51);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 669) 	u64 t3__ = t3_ & 0x7ffffffffffffLLU;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 670) 	input[0] = t0_;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 671) 	input[1] = t1__;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 672) 	input[2] = t2__;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 673) 	input[3] = t3__;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 674) 	input[4] = t4_;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 675) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 676) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 677) static __always_inline void format_fcontract_second_carry_full(u64 *input)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 678) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 679) 	u64 i0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 680) 	u64 i1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 681) 	u64 i0_;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 682) 	u64 i1_;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 683) 	format_fcontract_second_carry_pass(input);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 684) 	modulo_carry_top(input);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 685) 	i0 = input[0];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 686) 	i1 = input[1];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 687) 	i0_ = i0 & 0x7ffffffffffffLLU;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 688) 	i1_ = i1 + (i0 >> 51);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 689) 	input[0] = i0_;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 690) 	input[1] = i1_;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 691) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 692) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 693) static __always_inline void format_fcontract_trim(u64 *input)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 694) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 695) 	u64 a0 = input[0];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 696) 	u64 a1 = input[1];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 697) 	u64 a2 = input[2];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 698) 	u64 a3 = input[3];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 699) 	u64 a4 = input[4];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 700) 	u64 mask0 = u64_gte_mask(a0, 0x7ffffffffffedLLU);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 701) 	u64 mask1 = u64_eq_mask(a1, 0x7ffffffffffffLLU);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 702) 	u64 mask2 = u64_eq_mask(a2, 0x7ffffffffffffLLU);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 703) 	u64 mask3 = u64_eq_mask(a3, 0x7ffffffffffffLLU);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 704) 	u64 mask4 = u64_eq_mask(a4, 0x7ffffffffffffLLU);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 705) 	u64 mask = (((mask0 & mask1) & mask2) & mask3) & mask4;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 706) 	u64 a0_ = a0 - (0x7ffffffffffedLLU & mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 707) 	u64 a1_ = a1 - (0x7ffffffffffffLLU & mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 708) 	u64 a2_ = a2 - (0x7ffffffffffffLLU & mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 709) 	u64 a3_ = a3 - (0x7ffffffffffffLLU & mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 710) 	u64 a4_ = a4 - (0x7ffffffffffffLLU & mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 711) 	input[0] = a0_;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 712) 	input[1] = a1_;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 713) 	input[2] = a2_;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 714) 	input[3] = a3_;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 715) 	input[4] = a4_;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 716) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 717) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 718) static __always_inline void format_fcontract_store(u8 *output, u64 *input)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 719) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 720) 	u64 t0 = input[0];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 721) 	u64 t1 = input[1];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 722) 	u64 t2 = input[2];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 723) 	u64 t3 = input[3];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 724) 	u64 t4 = input[4];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 725) 	u64 o0 = t1 << 51 | t0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 726) 	u64 o1 = t2 << 38 | t1 >> 13;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 727) 	u64 o2 = t3 << 25 | t2 >> 26;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 728) 	u64 o3 = t4 << 12 | t3 >> 39;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 729) 	u8 *b0 = output;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 730) 	u8 *b1 = output + 8;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 731) 	u8 *b2 = output + 16;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 732) 	u8 *b3 = output + 24;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 733) 	put_unaligned_le64(o0, b0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 734) 	put_unaligned_le64(o1, b1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 735) 	put_unaligned_le64(o2, b2);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 736) 	put_unaligned_le64(o3, b3);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 737) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 738) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 739) static __always_inline void format_fcontract(u8 *output, u64 *input)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 740) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 741) 	format_fcontract_first_carry_full(input);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 742) 	format_fcontract_second_carry_full(input);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 743) 	format_fcontract_trim(input);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 744) 	format_fcontract_store(output, input);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 745) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 746) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 747) static __always_inline void format_scalar_of_point(u8 *scalar, u64 *point)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 748) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 749) 	u64 *x = point;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 750) 	u64 *z = point + 5;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 751) 	u64 buf[10] __aligned(32) = { 0 };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 752) 	u64 *zmone = buf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 753) 	u64 *sc = buf + 5;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 754) 	crecip(zmone, z);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 755) 	fmul(sc, x, zmone);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 756) 	format_fcontract(scalar, sc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 757) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 758) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 759) void curve25519_generic(u8 mypublic[CURVE25519_KEY_SIZE],
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 760) 			const u8 secret[CURVE25519_KEY_SIZE],
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 761) 			const u8 basepoint[CURVE25519_KEY_SIZE])
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 762) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 763) 	u64 buf0[10] __aligned(32) = { 0 };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 764) 	u64 *x0 = buf0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 765) 	u64 *z = buf0 + 5;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 766) 	u64 *q;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 767) 	format_fexpand(x0, basepoint);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 768) 	z[0] = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 769) 	q = buf0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 770) 	{
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 771) 		u8 e[32] __aligned(32) = { 0 };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 772) 		u8 *scalar;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 773) 		memcpy(e, secret, 32);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 774) 		curve25519_clamp_secret(e);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 775) 		scalar = e;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 776) 		{
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 777) 			u64 buf[15] = { 0 };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 778) 			u64 *nq = buf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 779) 			u64 *x = nq;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 780) 			x[0] = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 781) 			ladder_cmult(nq, scalar, q);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 782) 			format_scalar_of_point(mypublic, nq);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 783) 			memzero_explicit(buf, sizeof(buf));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 784) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 785) 		memzero_explicit(e, sizeof(e));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 786) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 787) 	memzero_explicit(buf0, sizeof(buf0));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 788) }