^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1) // SPDX-License-Identifier: GPL-2.0 OR MIT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3) * Copyright (C) 2016-2017 INRIA and Microsoft Corporation.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4) * Copyright (C) 2018-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6) * This is a machine-generated formally verified implementation of Curve25519
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7) * ECDH from: <https://github.com/mitls/hacl-star>. Though originally machine
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8) * generated, it has been tweaked to be suitable for use in the kernel. It is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9) * optimized for 64-bit machines that can efficiently work with 128-bit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 10) * integer types.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 11) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 12)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 13) #include <asm/unaligned.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 14) #include <crypto/curve25519.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 15) #include <linux/string.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 16)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 17) typedef __uint128_t u128;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 18)
/*
 * Branch-free equality test on two 64-bit words.
 *
 * Returns a mask with every bit set when a == b, and 0 otherwise.
 */
static __always_inline u64 u64_eq_mask(u64 a, u64 b)
{
	u64 diff = a ^ b;
	/* diff | -diff has its top bit set exactly when diff is non-zero. */
	u64 is_nonzero = (diff | (~diff + (u64)1U)) >> (u32)63U;

	/* 0 - 1 wraps to all ones (equal); 1 - 1 gives zero (different). */
	return is_nonzero - (u64)1U;
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 28)
/*
 * Branch-free unsigned comparison of two 64-bit words.
 *
 * Returns a mask with every bit set when a >= b, and 0 when a < b.
 */
static __always_inline u64 u64_gte_mask(u64 a, u64 b)
{
	u64 differing = a ^ b;
	u64 borrowed = (a - b) ^ b;
	/* The top bit of a ^ (differing | borrowed) is 1 exactly when a < b. */
	u64 is_less = (a ^ (differing | borrowed)) >> (u32)63U;

	return is_less - (u64)1U;
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 42)
/*
 * Fold the overflow of the top limb back into the bottom limb.
 *
 * Bits 51 and above of b[4] are removed from b[4] and added to b[0]
 * multiplied by 19. Limbs b[1]..b[3] are not touched.
 */
static __always_inline void modulo_carry_top(u64 *b)
{
	u64 top = b[4];

	b[0] += 19 * (top >> 51);
	b[4] = top & 0x7ffffffffffffLLU;
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 52)
/*
 * Narrow five 128-bit limbs to five 64-bit limbs.
 *
 * Each output[i] receives the low 64 bits of input[i]; the upper halves
 * are discarded.
 */
static __always_inline void fproduct_copy_from_wide_(u64 *output, u128 *input)
{
	u32 limb;

	for (limb = 0; limb < 5; ++limb)
		output[limb] = (u64)input[limb];
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 76)
/*
 * Multiply-accumulate a 5-limb value by a single 64-bit scalar.
 *
 * For every limb i, the full 128-bit product input[i] * s is added to
 * output[i].
 */
static __always_inline void
fproduct_sum_scalar_multiplication_(u128 *output, u64 *input, u64 s)
{
	u32 limb;

	for (limb = 0; limb < 5; ++limb)
		output[limb] += (u128)input[limb] * s;
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 86)
/*
 * Propagate carries upwards through limbs 0..3 of a wide accumulator.
 *
 * Each of tmp[0]..tmp[3] is reduced to its low 51 bits and everything
 * above bit 50 is added to the next limb. tmp[4] only receives a carry;
 * it is not reduced here.
 */
static __always_inline void fproduct_carry_wide_(u128 *tmp)
{
	u32 limb;

	for (limb = 0; limb < 4; ++limb) {
		u128 cur = tmp[limb];
		u128 carry = cur >> 51;

		tmp[limb] = (u128)((u64)cur & 0x7ffffffffffffLLU);
		tmp[limb + 1] += carry;
	}
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 127)
/*
 * Rotate a 5-limb value up by one limb position.
 *
 * Limbs 0..3 move to positions 1..4, and the old top limb re-enters at
 * position 0 multiplied by 19.
 */
static __always_inline void fmul_shift_reduce(u64 *output)
{
	u64 wrapped = output[4];
	u32 pos;

	/* Walk downwards so each source limb is read before being overwritten. */
	for (pos = 4; pos > 0; --pos)
		output[pos] = output[pos - 1];

	output[0] = 19 * wrapped;
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 156)
/*
 * Schoolbook multiplication core: accumulate input * input21 into output.
 *
 * For each limb of input21, the current value of input scaled by that limb
 * is added to output, and input is then rotated by one limb with
 * fmul_shift_reduce(). No rotation follows the last limb. input is
 * modified in place, so callers pass a scratch copy.
 */
static __always_inline void fmul_mul_shift_reduce_(u128 *output, u64 *input,
						   u64 *input21)
{
	u32 limb;

	for (limb = 0; limb < 4; ++limb) {
		fproduct_sum_scalar_multiplication_(output, input,
						    input21[limb]);
		fmul_shift_reduce(input);
	}
	fproduct_sum_scalar_multiplication_(output, input, input21[4]);
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 186)
/*
 * Multiply two 5-limb values and store the carried result in output.
 *
 * input is copied to a local scratch array before it is consumed, and
 * output is written only after both operands have been fully read, so
 * output may be the same array as either operand.
 */
static __always_inline void fmul_fmul(u64 *output, u64 *input, u64 *input21)
{
	u64 scratch[5] = { input[0], input[1], input[2], input[3], input[4] };
	u128 acc[5] = { 0 };
	u128 top;
	u64 low;

	fmul_mul_shift_reduce_(acc, scratch, input21);
	fproduct_carry_wide_(acc);

	/* Fold what overflowed the top limb back into limb 0, times 19. */
	top = acc[4];
	acc[0] += (u128)19 * (u64)(top >> 51);
	acc[4] = top & (u128)0x7ffffffffffffLLU;

	fproduct_copy_from_wide_(output, acc);

	/* One more carry step from limb 0 into limb 1. */
	low = output[0];
	output[1] += low >> 51;
	output[0] = low & 0x7ffffffffffffLLU;
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 217)
/*
 * Compute the un-carried square of a 5-limb value.
 *
 * The five limbs of output are read, and the five 128-bit column sums of
 * the square are written to tmp. Cross terms that would land above limb 4
 * are pre-multiplied by 19 so that they fall back into limbs 0..3.
 * output itself is not modified.
 */
static __always_inline void fsquare_fsquare__(u128 *tmp, u64 *output)
{
	const u64 a0 = output[0];
	const u64 a1 = output[1];
	const u64 a2 = output[2];
	const u64 a3 = output[3];
	const u64 a4 = output[4];

	/* Doubled and 19-scaled limbs, all computed in 64-bit arithmetic. */
	const u64 a0_x2 = a0 * 2;
	const u64 a1_x2 = a1 * 2;
	const u64 a2_x38 = a2 * 2 * 19;
	const u64 a3_x19 = a3 * 19;
	const u64 a4_x19 = a4 * 19;
	const u64 a4_x38 = a4_x19 * 2;

	tmp[0] = (u128)a0 * a0 + (u128)a4_x38 * a1 + (u128)a2_x38 * a3;
	tmp[1] = (u128)a0_x2 * a1 + (u128)a4_x38 * a2 + (u128)a3_x19 * a3;
	tmp[2] = (u128)a0_x2 * a2 + (u128)a1 * a1 + (u128)a4_x38 * a3;
	tmp[3] = (u128)a0_x2 * a3 + (u128)a1_x2 * a2 + (u128)a4 * a4_x19;
	tmp[4] = (u128)a0_x2 * a4 + (u128)a1_x2 * a3 + (u128)a2 * a2;
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 246)
/*
 * Square a 5-limb value in place.
 *
 * output is both the operand and the destination; tmp is caller-provided
 * scratch space of five 128-bit words that is overwritten.
 */
static __always_inline void fsquare_fsquare_(u128 *tmp, u64 *output)
{
	u128 top;
	u64 low;

	fsquare_fsquare__(tmp, output);
	fproduct_carry_wide_(tmp);

	/* Fold what overflowed the top limb back into limb 0, times 19. */
	top = tmp[4];
	tmp[0] += (u128)19 * (u64)(top >> 51);
	tmp[4] = top & (u128)0x7ffffffffffffLLU;

	fproduct_copy_from_wide_(output, tmp);

	/* One more carry step from limb 0 into limb 1. */
	low = output[0];
	output[1] += low >> 51;
	output[0] = low & 0x7ffffffffffffLLU;
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 273)
/*
 * Square output in place repeatedly.
 *
 * The squaring is applied count1 times, but always at least once: a
 * count1 of 0 behaves like a count1 of 1. tmp is scratch space for
 * fsquare_fsquare_().
 */
static __always_inline void fsquare_fsquare_times_(u64 *output, u128 *tmp,
						   u32 count1)
{
	u32 done = 0;

	do {
		fsquare_fsquare_(tmp, output);
	} while (++done < count1);
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 282)
/*
 * Copy the five limbs of input to output, then square output in place
 * count1 times via fsquare_fsquare_times_(). input is left untouched.
 */
static __always_inline void fsquare_fsquare_times(u64 *output, u64 *input,
						  u32 count1)
{
	u128 wide_scratch[5];

	memcpy(output, input, sizeof(*input) * 5);
	fsquare_fsquare_times_(output, wide_scratch, count1);
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 290)
/*
 * Square output in place count1 times via fsquare_fsquare_times_(),
 * providing the wide scratch buffer that helper needs.
 */
static __always_inline void fsquare_fsquare_times_inplace(u64 *output,
							  u32 count1)
{
	u128 wide_scratch[5];

	fsquare_fsquare_times_(output, wide_scratch, count1);
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 297)
/*
 * Raise z to a fixed power with a square-and-multiply addition chain and
 * store the result in out.
 *
 * Following the chain below, the final exponent is 2^255 - 21. The
 * per-step comments give the exponent of z held by the destination after
 * each call, assuming fmul_fmul() multiplies and fsquare_fsquare_times*()
 * squares the stated number of times.
 *
 * Four 5-limb temporaries live in one local buffer: acc, sq, run and
 * wide_run.
 */
static __always_inline void crecip_crecip(u64 *out, u64 *z)
{
	u64 buf[20] = { 0 };
	u64 *acc = buf;
	u64 *sq = buf + 5;
	u64 *run = buf + 10;
	u64 *wide_run = buf + 15;

	fsquare_fsquare_times(acc, z, 1);		/* 2 */
	fsquare_fsquare_times(sq, acc, 2);		/* 8 */
	fmul_fmul(run, sq, z);				/* 9 */
	fmul_fmul(acc, run, acc);			/* 11 */
	fsquare_fsquare_times(sq, acc, 1);		/* 22 */
	fmul_fmul(run, sq, run);			/* 2^5 - 1 */
	fsquare_fsquare_times(sq, run, 5);		/* 2^10 - 2^5 */

	fmul_fmul(run, sq, run);			/* 2^10 - 1 */
	fsquare_fsquare_times(sq, run, 10);		/* 2^20 - 2^10 */
	fmul_fmul(wide_run, sq, run);			/* 2^20 - 1 */
	fsquare_fsquare_times(sq, wide_run, 20);	/* 2^40 - 2^20 */
	fmul_fmul(sq, sq, wide_run);			/* 2^40 - 1 */
	fsquare_fsquare_times_inplace(sq, 10);		/* 2^50 - 2^10 */
	fmul_fmul(run, sq, run);			/* 2^50 - 1 */
	fsquare_fsquare_times(sq, run, 50);		/* 2^100 - 2^50 */

	fmul_fmul(wide_run, sq, run);			/* 2^100 - 1 */
	fsquare_fsquare_times(sq, wide_run, 100);	/* 2^200 - 2^100 */
	fmul_fmul(sq, sq, wide_run);			/* 2^200 - 1 */
	fsquare_fsquare_times_inplace(sq, 50);		/* 2^250 - 2^50 */
	fmul_fmul(sq, sq, run);				/* 2^250 - 1 */
	fsquare_fsquare_times_inplace(sq, 5);		/* 2^255 - 2^5 */
	fmul_fmul(out, sq, acc);			/* 2^255 - 21 */
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 341)
/*
 * Limb-wise addition: a[i] += b[i] for the five limbs. No carry is
 * propagated between limbs.
 */
static __always_inline void fsum(u64 *a, u64 *b)
{
	u32 limb;

	for (limb = 0; limb < 5; ++limb)
		a[limb] += b[limb];
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 350)
/*
 * Limb-wise reversed subtraction: a[i] = (b[i] + bias[i]) - a[i].
 *
 * A fixed per-limb constant is added to b before a is subtracted. b is
 * snapshotted into a local array first and is never written through.
 */
static __always_inline void fdifference(u64 *a, u64 *b)
{
	static const u64 bias[5] = {
		0x3fffffffffff68LLU,
		0x3ffffffffffff8LLU,
		0x3ffffffffffff8LLU,
		0x3ffffffffffff8LLU,
		0x3ffffffffffff8LLU,
	};
	u64 minuend[5];
	u32 limb;

	memcpy(minuend, b, 5 * sizeof(*b));

	for (limb = 0; limb < 5; ++limb)
		a[limb] = (minuend[limb] + bias[limb]) - a[limb];
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 396)
/*
 * Multiply a 5-limb value by a 64-bit scalar: output = b * s.
 *
 * The products are formed in 128-bit precision, carried through limbs
 * 0..3, and the overflow of the top limb is folded into limb 0 times 19
 * before the result is narrowed back to 64-bit limbs.
 */
static __always_inline void fscalar(u64 *output, u64 *b, u64 s)
{
	u128 wide[5];
	u128 top;
	u32 limb;

	for (limb = 0; limb < 5; ++limb)
		wide[limb] = (u128)b[limb] * s;

	fproduct_carry_wide_(wide);

	top = wide[4];
	wide[0] += (u128)19 * (u64)(top >> 51);
	wide[4] = top & (u128)0x7ffffffffffffLLU;

	fproduct_copy_from_wide_(output, wide);
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 433)
/*
 * Multiplication entry point: output = a * b on 5-limb values.
 * Forwards straight to fmul_fmul().
 */
static __always_inline void fmul(u64 *output, u64 *a, u64 *b)
{
	fmul_fmul(output, a, b);
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 438)
/*
 * Exponentiation entry point: forwards straight to crecip_crecip(), which
 * writes its fixed power of input to output.
 */
static __always_inline void crecip(u64 *output, u64 *input)
{
	crecip_crecip(output, input);
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 443)
/*
 * Conditionally exchange one limb between a and b without branching.
 *
 * The limb at index ctr - 1 is swapped when swap1 is all ones and left
 * unchanged when swap1 is zero. Both limbs are rewritten either way.
 */
static __always_inline void point_swap_conditional_step(u64 *a, u64 *b,
							 u64 swap1, u32 ctr)
{
	u64 *limb_a = &a[ctr - 1];
	u64 *limb_b = &b[ctr - 1];
	u64 delta = swap1 & (*limb_a ^ *limb_b);

	*limb_a ^= delta;
	*limb_b ^= delta;
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 456)
/*
 * Conditionally exchange five limbs between a and b, from index 4 down
 * to index 0, using point_swap_conditional_step() with mask swap1.
 */
static __always_inline void point_swap_conditional5(u64 *a, u64 *b, u64 swap1)
{
	u32 ctr;

	for (ctr = 5; ctr >= 1; --ctr)
		point_swap_conditional_step(a, b, swap1, ctr);
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 465)
/*
 * Conditionally exchange two 10-limb points (two 5-limb halves each).
 *
 * iswap is turned into a mask by negation: 0 stays 0 and 1 becomes all
 * ones. The same mask is applied to both halves.
 */
static __always_inline void point_swap_conditional(u64 *a, u64 *b, u64 iswap)
{
	const u64 mask = 0 - iswap;

	point_swap_conditional5(a, b, mask);
	point_swap_conditional5(a + 5, b + 5, mask);
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 472)
/*
 * Copy a 10-limb point from input to output as two 5-limb halves.
 */
static __always_inline void point_copy(u64 *output, u64 *input)
{
	const size_t half_bytes = 5 * sizeof(*input);

	memcpy(output, input, half_bytes);
	memcpy(output + 5, input + 5, half_bytes);
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 478)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 479) static __always_inline void addanddouble_fmonty(u64 *pp, u64 *ppq, u64 *p,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 480) u64 *pq, u64 *qmqp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 481) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 482) u64 *qx = qmqp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 483) u64 *x2 = pp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 484) u64 *z2 = pp + 5;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 485) u64 *x3 = ppq;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 486) u64 *z3 = ppq + 5;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 487) u64 *x = p;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 488) u64 *z = p + 5;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 489) u64 *xprime = pq;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 490) u64 *zprime = pq + 5;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 491) u64 buf[40] = { 0 };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 492) u64 *origx = buf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 493) u64 *origxprime0 = buf + 5;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 494) u64 *xxprime0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 495) u64 *zzprime0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 496) u64 *origxprime;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 497) xxprime0 = buf + 25;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 498) zzprime0 = buf + 30;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 499) memcpy(origx, x, 5 * sizeof(*x));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 500) fsum(x, z);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 501) fdifference(z, origx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 502) memcpy(origxprime0, xprime, 5 * sizeof(*xprime));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 503) fsum(xprime, zprime);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 504) fdifference(zprime, origxprime0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 505) fmul(xxprime0, xprime, z);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 506) fmul(zzprime0, x, zprime);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 507) origxprime = buf + 5;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 508) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 509) u64 *xx0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 510) u64 *zz0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 511) u64 *xxprime;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 512) u64 *zzprime;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 513) u64 *zzzprime;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 514) xx0 = buf + 15;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 515) zz0 = buf + 20;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 516) xxprime = buf + 25;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 517) zzprime = buf + 30;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 518) zzzprime = buf + 35;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 519) memcpy(origxprime, xxprime, 5 * sizeof(*xxprime));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 520) fsum(xxprime, zzprime);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 521) fdifference(zzprime, origxprime);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 522) fsquare_fsquare_times(x3, xxprime, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 523) fsquare_fsquare_times(zzzprime, zzprime, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 524) fmul(z3, zzzprime, qx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 525) fsquare_fsquare_times(xx0, x, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 526) fsquare_fsquare_times(zz0, z, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 527) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 528) u64 *zzz;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 529) u64 *xx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 530) u64 *zz;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 531) u64 scalar;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 532) zzz = buf + 10;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 533) xx = buf + 15;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 534) zz = buf + 20;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 535) fmul(x2, xx, zz);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 536) fdifference(zz, xx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 537) scalar = 121665;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 538) fscalar(zzz, zz, scalar);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 539) fsum(zzz, xx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 540) fmul(z2, zzz, zz);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 541) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 542) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 543) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 544)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 545) static __always_inline void
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 546) ladder_smallloop_cmult_small_loop_step(u64 *nq, u64 *nqpq, u64 *nq2, u64 *nqpq2,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 547) u64 *q, u8 byt)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 548) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 549) u64 bit0 = (u64)(byt >> 7);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 550) u64 bit;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 551) point_swap_conditional(nq, nqpq, bit0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 552) addanddouble_fmonty(nq2, nqpq2, nq, nqpq, q);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 553) bit = (u64)(byt >> 7);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 554) point_swap_conditional(nq2, nqpq2, bit);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 555) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 556)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 557) static __always_inline void
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 558) ladder_smallloop_cmult_small_loop_double_step(u64 *nq, u64 *nqpq, u64 *nq2,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 559) u64 *nqpq2, u64 *q, u8 byt)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 560) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 561) u8 byt1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 562) ladder_smallloop_cmult_small_loop_step(nq, nqpq, nq2, nqpq2, q, byt);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 563) byt1 = byt << 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 564) ladder_smallloop_cmult_small_loop_step(nq2, nqpq2, nq, nqpq, q, byt1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 565) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 566)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 567) static __always_inline void
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 568) ladder_smallloop_cmult_small_loop(u64 *nq, u64 *nqpq, u64 *nq2, u64 *nqpq2,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 569) u64 *q, u8 byt, u32 i)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 570) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 571) while (i--) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 572) ladder_smallloop_cmult_small_loop_double_step(nq, nqpq, nq2,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 573) nqpq2, q, byt);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 574) byt <<= 2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 575) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 576) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 577)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 578) static __always_inline void ladder_bigloop_cmult_big_loop(u8 *n1, u64 *nq,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 579) u64 *nqpq, u64 *nq2,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 580) u64 *nqpq2, u64 *q,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 581) u32 i)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 582) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 583) while (i--) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 584) u8 byte = n1[i];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 585) ladder_smallloop_cmult_small_loop(nq, nqpq, nq2, nqpq2, q,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 586) byte, 4);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 587) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 588) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 589)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 590) static void ladder_cmult(u64 *result, u8 *n1, u64 *q)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 591) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 592) u64 point_buf[40] = { 0 };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 593) u64 *nq = point_buf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 594) u64 *nqpq = point_buf + 10;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 595) u64 *nq2 = point_buf + 20;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 596) u64 *nqpq2 = point_buf + 30;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 597) point_copy(nqpq, q);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 598) nq[0] = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 599) ladder_bigloop_cmult_big_loop(n1, nq, nqpq, nq2, nqpq2, q, 32);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 600) point_copy(result, nq);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 601) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 602)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 603) static __always_inline void format_fexpand(u64 *output, const u8 *input)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 604) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 605) const u8 *x00 = input + 6;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 606) const u8 *x01 = input + 12;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 607) const u8 *x02 = input + 19;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 608) const u8 *x0 = input + 24;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 609) u64 i0, i1, i2, i3, i4, output0, output1, output2, output3, output4;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 610) i0 = get_unaligned_le64(input);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 611) i1 = get_unaligned_le64(x00);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 612) i2 = get_unaligned_le64(x01);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 613) i3 = get_unaligned_le64(x02);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 614) i4 = get_unaligned_le64(x0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 615) output0 = i0 & 0x7ffffffffffffLLU;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 616) output1 = i1 >> 3 & 0x7ffffffffffffLLU;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 617) output2 = i2 >> 6 & 0x7ffffffffffffLLU;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 618) output3 = i3 >> 1 & 0x7ffffffffffffLLU;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 619) output4 = i4 >> 12 & 0x7ffffffffffffLLU;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 620) output[0] = output0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 621) output[1] = output1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 622) output[2] = output2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 623) output[3] = output3;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 624) output[4] = output4;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 625) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 626)
/*
 * Propagate carries upward through a 5-limb value, in place.
 *
 * For limbs 0..3, everything above bit 50 is added to the next limb and the
 * limb is reduced to its low 51 bits. Limb 4 only receives the carry from
 * limb 3; it is not masked here.
 */
static __always_inline void format_fcontract_first_carry_pass(u64 *input)
{
	const u64 low51 = 0x7ffffffffffffLLU;
	int k;

	for (k = 0; k < 4; k++) {
		input[k + 1] += input[k] >> 51;
		input[k] &= low51;
	}
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 648)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 649) static __always_inline void format_fcontract_first_carry_full(u64 *input)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 650) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 651) format_fcontract_first_carry_pass(input);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 652) modulo_carry_top(input);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 653) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 654)
/*
 * Second upward carry sweep over a 5-limb value, in place.
 *
 * Limbs 0..3 each absorb the carry from the limb below, pass on their bits
 * above bit 50, and keep their low 51 bits. The final carry is added to
 * limb 4, which is left unmasked.
 */
static __always_inline void format_fcontract_second_carry_pass(u64 *input)
{
	const u64 low51 = 0x7ffffffffffffLLU;
	u64 carry = 0;
	int k;

	for (k = 0; k < 4; k++) {
		u64 limb = input[k] + carry;

		carry = limb >> 51;
		input[k] = limb & low51;
	}
	input[4] += carry;
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 676)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 677) static __always_inline void format_fcontract_second_carry_full(u64 *input)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 678) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 679) u64 i0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 680) u64 i1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 681) u64 i0_;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 682) u64 i1_;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 683) format_fcontract_second_carry_pass(input);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 684) modulo_carry_top(input);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 685) i0 = input[0];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 686) i1 = input[1];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 687) i0_ = i0 & 0x7ffffffffffffLLU;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 688) i1_ = i1 + (i0 >> 51);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 689) input[0] = i0_;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 690) input[1] = i1_;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 691) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 692)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 693) static __always_inline void format_fcontract_trim(u64 *input)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 694) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 695) u64 a0 = input[0];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 696) u64 a1 = input[1];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 697) u64 a2 = input[2];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 698) u64 a3 = input[3];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 699) u64 a4 = input[4];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 700) u64 mask0 = u64_gte_mask(a0, 0x7ffffffffffedLLU);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 701) u64 mask1 = u64_eq_mask(a1, 0x7ffffffffffffLLU);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 702) u64 mask2 = u64_eq_mask(a2, 0x7ffffffffffffLLU);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 703) u64 mask3 = u64_eq_mask(a3, 0x7ffffffffffffLLU);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 704) u64 mask4 = u64_eq_mask(a4, 0x7ffffffffffffLLU);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 705) u64 mask = (((mask0 & mask1) & mask2) & mask3) & mask4;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 706) u64 a0_ = a0 - (0x7ffffffffffedLLU & mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 707) u64 a1_ = a1 - (0x7ffffffffffffLLU & mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 708) u64 a2_ = a2 - (0x7ffffffffffffLLU & mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 709) u64 a3_ = a3 - (0x7ffffffffffffLLU & mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 710) u64 a4_ = a4 - (0x7ffffffffffffLLU & mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 711) input[0] = a0_;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 712) input[1] = a1_;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 713) input[2] = a2_;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 714) input[3] = a3_;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 715) input[4] = a4_;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 716) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 717)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 718) static __always_inline void format_fcontract_store(u8 *output, u64 *input)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 719) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 720) u64 t0 = input[0];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 721) u64 t1 = input[1];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 722) u64 t2 = input[2];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 723) u64 t3 = input[3];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 724) u64 t4 = input[4];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 725) u64 o0 = t1 << 51 | t0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 726) u64 o1 = t2 << 38 | t1 >> 13;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 727) u64 o2 = t3 << 25 | t2 >> 26;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 728) u64 o3 = t4 << 12 | t3 >> 39;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 729) u8 *b0 = output;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 730) u8 *b1 = output + 8;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 731) u8 *b2 = output + 16;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 732) u8 *b3 = output + 24;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 733) put_unaligned_le64(o0, b0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 734) put_unaligned_le64(o1, b1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 735) put_unaligned_le64(o2, b2);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 736) put_unaligned_le64(o3, b3);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 737) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 738)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 739) static __always_inline void format_fcontract(u8 *output, u64 *input)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 740) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 741) format_fcontract_first_carry_full(input);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 742) format_fcontract_second_carry_full(input);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 743) format_fcontract_trim(input);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 744) format_fcontract_store(output, input);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 745) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 746)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 747) static __always_inline void format_scalar_of_point(u8 *scalar, u64 *point)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 748) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 749) u64 *x = point;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 750) u64 *z = point + 5;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 751) u64 buf[10] __aligned(32) = { 0 };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 752) u64 *zmone = buf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 753) u64 *sc = buf + 5;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 754) crecip(zmone, z);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 755) fmul(sc, x, zmone);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 756) format_fcontract(scalar, sc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 757) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 758)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 759) void curve25519_generic(u8 mypublic[CURVE25519_KEY_SIZE],
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 760) const u8 secret[CURVE25519_KEY_SIZE],
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 761) const u8 basepoint[CURVE25519_KEY_SIZE])
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 762) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 763) u64 buf0[10] __aligned(32) = { 0 };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 764) u64 *x0 = buf0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 765) u64 *z = buf0 + 5;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 766) u64 *q;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 767) format_fexpand(x0, basepoint);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 768) z[0] = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 769) q = buf0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 770) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 771) u8 e[32] __aligned(32) = { 0 };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 772) u8 *scalar;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 773) memcpy(e, secret, 32);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 774) curve25519_clamp_secret(e);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 775) scalar = e;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 776) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 777) u64 buf[15] = { 0 };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 778) u64 *nq = buf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 779) u64 *x = nq;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 780) x[0] = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 781) ladder_cmult(nq, scalar, q);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 782) format_scalar_of_point(mypublic, nq);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 783) memzero_explicit(buf, sizeof(buf));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 784) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 785) memzero_explicit(e, sizeof(e));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 786) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 787) memzero_explicit(buf0, sizeof(buf0));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 788) }