^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1) /* SPDX-License-Identifier: GPL-2.0 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3) * NH - ε-almost-universal hash function, x86_64 AVX2 accelerated
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5) * Copyright 2018 Google LLC
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7) * Author: Eric Biggers <ebiggers@google.com>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 10) #include <linux/linkage.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 11)
/*
 * Register assignments (AT&T syntax, System V AMD64 calling convention).
 * An *_XMM name is the low 128 bits of the %ymm register of the same name.
 */

/* nh_avx2() arguments, in signature order */
#define		KEY		%rdi
#define		MESSAGE		%rsi
#define		MESSAGE_LEN	%rdx
#define		HASH		%rcx

/* Per-pass accumulators: four 64-bit partial sums each */
#define		PASS0_SUMS	%ymm0
#define		PASS1_SUMS	%ymm1
#define		PASS2_SUMS	%ymm2
#define		PASS3_SUMS	%ymm3

/*
 * Key material.  Which pass each register feeds is decided by the argument
 * order at each _nh_2xstride invocation.
 */
#define		K0		%ymm4
#define		K0_XMM		%xmm4
#define		K1		%ymm5
#define		K1_XMM		%xmm5
#define		K2		%ymm6
#define		K2_XMM		%xmm6
#define		K3		%ymm7
#define		K3_XMM		%xmm7

/*
 * Temporaries.  T3 also carries the message data into _nh_2xstride; only
 * T2 and T3 need 128-bit aliases.
 */
#define		T0		%ymm8
#define		T1		%ymm9
#define		T2		%ymm10
#define		T2_XMM		%xmm10
#define		T3		%ymm11
#define		T3_XMM		%xmm11
#define		T4		%ymm12
#define		T5		%ymm13
#define		T6		%ymm14
#define		T7		%ymm15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 38)
/*
 * _nh_2xstride k0, k1, k2, k3
 *
 * Fold the message data held in T3 into all four pass accumulators, using
 * \k0..\k3 as the key words for passes 0..3 respectively.  Each 128-bit lane
 * is processed independently, so one invocation handles two 16-byte strides
 * (or one stride, if the caller zeroed the high lanes of T3 and the keys).
 *
 * In:		T3		message data
 *		\k0..\k3	key words for passes 0..3 (left unmodified)
 * Out:		PASS0_SUMS..PASS3_SUMS += this invocation's 64-bit products
 * Clobbers:	T0-T7 (including the message in T3)
 */
.macro	_nh_2xstride	k0, k1, k2, k3

	// Message + key, as wrapping 32-bit adds, one result per pass.
	// The \k3 add has to come last since it overwrites the message in T3.
	vpaddd		\k0, T3, T0
	vpaddd		\k1, T3, T1
	vpaddd		\k2, T3, T2
	vpaddd		\k3, T3, T3

	// With each 128-bit lane viewed as words (w0 w1 w2 w3), copy w0 and w1
	// into the low 32 bits of the lane's two 64-bit elements ...
	vpshufd		$0x10, T0, T4
	vpshufd		$0x10, T1, T5
	vpshufd		$0x10, T2, T6
	vpshufd		$0x10, T3, T7

	// ... and do the same, in place, for w2 and w3.
	vpshufd		$0x32, T0, T0
	vpshufd		$0x32, T1, T1
	vpshufd		$0x32, T2, T2
	vpshufd		$0x32, T3, T3

	// Unsigned 32x32 => 64 multiply: each lane becomes (w0*w2, w1*w3).
	vpmuludq	T4, T0, T0
	vpmuludq	T5, T1, T1
	vpmuludq	T6, T2, T2
	vpmuludq	T7, T3, T3

	// Add the products to the running 64-bit sums of each pass.
	vpaddq		T0, PASS0_SUMS, PASS0_SUMS
	vpaddq		T1, PASS1_SUMS, PASS1_SUMS
	vpaddq		T2, PASS2_SUMS, PASS2_SUMS
	vpaddq		T3, PASS3_SUMS, PASS3_SUMS
.endm
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 65)
/*
 * void nh_avx2(const u32 *key, const u8 *message, size_t message_len,
 *		u8 hash[NH_HASH_BYTES])
 *
 * Run the four NH passes over 'message' and store the four resulting 64-bit
 * sums (32 bytes in total) to 'hash'.
 *
 * It's guaranteed that message_len % 16 == 0.
 *
 * ABI:		System V AMD64
 * In:		KEY (%rdi), MESSAGE (%rsi), MESSAGE_LEN (%rdx), HASH (%rcx)
 * Out:		32 bytes written to (HASH); no return value
 * Clobbers:	KEY, MESSAGE, MESSAGE_LEN, flags, %ymm0-%ymm15.  The upper
 *		halves of the %ymm registers are cleared by vzeroupper before
 *		returning.
 * Alignment:	none assumed; all memory accesses use vmovdqu.
 */
SYM_FUNC_START(nh_avx2)

	// Preload the keys for passes 0 and 1 of the first two strides.  The
	// two 32-byte loads overlap by 16 bytes: the key of each pass starts
	// 16 bytes after that of the previous pass.
	vmovdqu		0x00(KEY), K0
	vmovdqu		0x10(KEY), K1
	add		$0x20, KEY
	vpxor		PASS0_SUMS, PASS0_SUMS, PASS0_SUMS
	vpxor		PASS1_SUMS, PASS1_SUMS, PASS1_SUMS
	vpxor		PASS2_SUMS, PASS2_SUMS, PASS2_SUMS
	vpxor		PASS3_SUMS, PASS3_SUMS, PASS3_SUMS

	// Main loop: 4 strides (64 bytes) per iteration.  MESSAGE_LEN is kept
	// biased by -0x40 so that the sign of the subtraction tells whether
	// another full iteration is available.
	sub		$0x40, MESSAGE_LEN
	jl		.Lloop4_done
.Lloop4:
	vmovdqu		(MESSAGE), T3
	vmovdqu		0x00(KEY), K2
	vmovdqu		0x10(KEY), K3
	_nh_2xstride	K0, K1, K2, K3

	// The keys just used for passes 2 and 3 serve passes 0 and 1 of the
	// next two strides, so the key registers swap roles instead of being
	// reloaded.
	vmovdqu		0x20(MESSAGE), T3
	vmovdqu		0x20(KEY), K0
	vmovdqu		0x30(KEY), K1
	_nh_2xstride	K2, K3, K0, K1

	add		$0x40, MESSAGE
	add		$0x40, KEY
	sub		$0x40, MESSAGE_LEN
	jge		.Lloop4

.Lloop4_done:
	// Undo the bias: the low 6 bits are the number of bytes left, which is
	// 0x00, 0x10, 0x20 or 0x30 given that message_len % 16 == 0.
	and		$0x3f, MESSAGE_LEN
	jz		.Ldone

	cmp		$0x20, MESSAGE_LEN
	jl		.Llast

	// 2 or 3 strides remain; do 2 more.
	vmovdqu		(MESSAGE), T3
	vmovdqu		0x00(KEY), K2
	vmovdqu		0x10(KEY), K3
	_nh_2xstride	K0, K1, K2, K3
	add		$0x20, MESSAGE
	add		$0x20, KEY
	sub		$0x20, MESSAGE_LEN
	jz		.Ldone
	// One stride is left; .Llast expects its pass 0/1 keys in K0/K1.
	vmovdqa		K2, K0
	vmovdqa		K3, K1
.Llast:
	// Last stride.  Zero the high 128 bits of the message and keys so they
	// don't affect the result when processing them like 2 strides.
	// (A VEX-encoded 128-bit move clears bits 255:128 of its destination,
	// which is why the register-to-itself moves below are not no-ops.)
	vmovdqu		(MESSAGE), T3_XMM
	vmovdqa		K0_XMM, K0_XMM
	vmovdqa		K1_XMM, K1_XMM
	vmovdqu		0x00(KEY), K2_XMM
	vmovdqu		0x10(KEY), K3_XMM
	_nh_2xstride	K0, K1, K2, K3

.Ldone:
	// Sum the accumulators for each pass, then store the sums to 'hash'

	// PASS0_SUMS is (0A 0B 0C 0D)
	// PASS1_SUMS is (1A 1B 1C 1D)
	// PASS2_SUMS is (2A 2B 2C 2D)
	// PASS3_SUMS is (3A 3B 3C 3D)
	// We need the horizontal sums:
	//     (0A + 0B + 0C + 0D,
	//      1A + 1B + 1C + 1D,
	//      2A + 2B + 2C + 2D,
	//      3A + 3B + 3C + 3D)
	//

	// Transpose the 4x4 matrix of 64-bit sums so that each register holds
	// one addend of every pass; three vertical adds then finish the job.
	vpunpcklqdq	PASS1_SUMS, PASS0_SUMS, T0	// T0 = (0A 1A 0C 1C)
	vpunpckhqdq	PASS1_SUMS, PASS0_SUMS, T1	// T1 = (0B 1B 0D 1D)
	vpunpcklqdq	PASS3_SUMS, PASS2_SUMS, T2	// T2 = (2A 3A 2C 3C)
	vpunpckhqdq	PASS3_SUMS, PASS2_SUMS, T3	// T3 = (2B 3B 2D 3D)

	vinserti128	$0x1, T2_XMM, T0, T4		// T4 = (0A 1A 2A 3A)
	vinserti128	$0x1, T3_XMM, T1, T5		// T5 = (0B 1B 2B 3B)
	vperm2i128	$0x31, T2, T0, T0		// T0 = (0C 1C 2C 3C)
	vperm2i128	$0x31, T3, T1, T1		// T1 = (0D 1D 2D 3D)

	vpaddq		T5, T4, T4
	vpaddq		T1, T0, T0
	vpaddq		T4, T0, T0
	vmovdqu		T0, (HASH)

	// The ymm registers were written above, leaving their upper halves
	// dirty.  Clear them so that legacy SSE code executed after we return
	// doesn't pay AVX/SSE transition penalties.  The hash has already been
	// stored, so this cannot affect the result.
	vzeroupper
	ret
SYM_FUNC_END(nh_avx2)