^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1) /* SPDX-License-Identifier: GPL-2.0 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3) * NH - ε-almost-universal hash function, x86_64 SSE2 accelerated
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5) * Copyright 2018 Google LLC
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7) * Author: Eric Biggers <ebiggers@google.com>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 10) #include <linux/linkage.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 11)
/*
 * Register aliases used by _nh_stride and nh_sse2 (AT&T operand order).
 *
 * PASS0_SUMS..PASS3_SUMS: one accumulator register per pass.  Each holds two
 * 64-bit partial sums that nh_sse2 adds together before storing the hash.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 12) #define PASS0_SUMS %xmm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 13) #define PASS1_SUMS %xmm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 14) #define PASS2_SUMS %xmm2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 15) #define PASS3_SUMS %xmm3
/*
 * K0..K3: registers holding 16-byte key strides.  Which register serves
 * which pass changes from stride to stride; see the _nh_stride invocations.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 16) #define K0 %xmm4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 17) #define K1 %xmm5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 18) #define K2 %xmm6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 19) #define K3 %xmm7
/*
 * T0..T7: scratch registers.  T1-T7 are overwritten by every _nh_stride;
 * T0 is only used by the final reduction in nh_sse2.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 20) #define T0 %xmm8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 21) #define T1 %xmm9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 22) #define T2 %xmm10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 23) #define T3 %xmm11
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 24) #define T4 %xmm12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 25) #define T5 %xmm13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 26) #define T6 %xmm14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 27) #define T7 %xmm15
/*
 * The four arguments of nh_sse2, in prototype order.  %rdi, %rsi, %rdx, %rcx
 * are the first four integer argument registers of the System V AMD64 ABI.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 28) #define KEY %rdi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 29) #define MESSAGE %rsi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 30) #define MESSAGE_LEN %rdx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 31) #define HASH %rcx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 32)
/*
 * _nh_stride k0, k1, k2, k3, offset
 *
 * Process one 16-byte message stride for all four passes.
 *
 *   k0, k1, k2: registers already holding the key strides for passes 0-2
 *   k3:         register that receives the 16 key bytes at offset(KEY);
 *               this freshly loaded stride is used for pass 3
 *   offset:     byte displacement applied to both MESSAGE and KEY
 *
 * For each pass the four 32-bit message words are added (mod 2^32) to the
 * four key words of that pass, giving s0..s3.  The 64-bit products s0*s2 and
 * s1*s3 are then added to the low and high 64-bit lanes of the accumulator
 * for that pass.
 *
 * Clobbers T1-T7 and k0.  k1, k2 and the newly loaded k3 are only read, so
 * the caller can hand them to the next invocation as its k0, k1, k2.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 33) .macro _nh_stride k0, k1, k2, k3, offset
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 34)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 35) // Load next message stride
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 36) movdqu \offset(MESSAGE), T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 37)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 38) // Load next key stride
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 39) movdqu \offset(KEY), \k3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 40)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 41) // Add message words to key words
// T2 and T3 get private copies of the message because paddd overwrites its
// destination.  Pass 0 accumulates into k0 itself; pass 1 reuses T1.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 42) movdqa T1, T2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 43) movdqa T1, T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 44) paddd T1, \k0 // reuse k0 to avoid a move
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 45) paddd \k1, T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 46) paddd \k2, T2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 47) paddd \k3, T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 48)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 49) // Multiply 32x32 => 64 and accumulate
// pmuludq multiplies the low dword of each 64-bit lane.  Shuffle 0x10 puts
// s0 and s1 in those positions, shuffle 0x32 puts s2 and s3 there, so each
// product register ends up as (s0*s2, s1*s3).  The 0x10 shuffle must come
// first in each pair because the 0x32 shuffle overwrites its own source.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 50) pshufd $0x10, \k0, T4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 51) pshufd $0x32, \k0, \k0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 52) pshufd $0x10, T1, T5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 53) pshufd $0x32, T1, T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 54) pshufd $0x10, T2, T6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 55) pshufd $0x32, T2, T2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 56) pshufd $0x10, T3, T7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 57) pshufd $0x32, T3, T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 58) pmuludq T4, \k0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 59) pmuludq T5, T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 60) pmuludq T6, T2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 61) pmuludq T7, T3
// 64-bit lane-wise accumulation, one accumulator per pass
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 62) paddq \k0, PASS0_SUMS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 63) paddq T1, PASS1_SUMS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 64) paddq T2, PASS2_SUMS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 65) paddq T3, PASS3_SUMS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 66) .endm
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 67)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 68) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 69) * void nh_sse2(const u32 *key, const u8 *message, size_t message_len,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 70) * u8 hash[NH_HASH_BYTES])
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 71) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 72) * It's guaranteed that message_len % 16 == 0.
 *
 * In:  KEY (%rdi) = key, MESSAGE (%rsi) = message,
 *      MESSAGE_LEN (%rdx) = message_len, HASH (%rcx) = hash
 * Out: 32 bytes stored at hash: the 64-bit sums of passes 0, 1, 2, 3 in
 *      that order.
 *
 * The message is consumed in 16-byte strides: 64 bytes per .Lloop4
 * iteration, then up to three trailing strides.  The code reads 0x30 key
 * bytes up front plus 0x10 more per message stride, i.e. message_len + 0x30
 * bytes of key in total.
 *
 * Modifies KEY, MESSAGE, MESSAGE_LEN, the flags and the xmm registers named
 * by the aliases above; HASH is only read.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 73) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 74) SYM_FUNC_START(nh_sse2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 75)
// Preload the first three key strides.  KEY is then advanced past them so
// that offset(KEY) inside _nh_stride addresses the stride loaded as k3.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 76) movdqu 0x00(KEY), K0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 77) movdqu 0x10(KEY), K1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 78) movdqu 0x20(KEY), K2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 79) add $0x30, KEY
// Zero the four pass accumulators
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 80) pxor PASS0_SUMS, PASS0_SUMS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 81) pxor PASS1_SUMS, PASS1_SUMS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 82) pxor PASS2_SUMS, PASS2_SUMS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 83) pxor PASS3_SUMS, PASS3_SUMS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 84)
// From here until .Lloop4_done MESSAGE_LEN holds (bytes remaining - 0x40).
// A negative value (signed compare) means fewer than 64 bytes are left.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 85) sub $0x40, MESSAGE_LEN
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 86) jl .Lloop4_done
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 87) .Lloop4:
// Four strides per iteration.  Each invocation destroys its k0 and loads a
// new stride into its k3, so the register list rotates by one per stride
// and is back to K0, K1, K2, K3 after four.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 88) _nh_stride K0, K1, K2, K3, 0x00
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 89) _nh_stride K1, K2, K3, K0, 0x10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 90) _nh_stride K2, K3, K0, K1, 0x20
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 91) _nh_stride K3, K0, K1, K2, 0x30
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 92) add $0x40, KEY
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 93) add $0x40, MESSAGE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 94) sub $0x40, MESSAGE_LEN
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 95) jge .Lloop4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 96)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 97) .Lloop4_done:
// MESSAGE_LEN is negative here (bytes remaining - 0x40).  Masking with 0x3f
// recovers the remaining byte count, which is 0, 0x10, 0x20 or 0x30 because
// message_len is a multiple of 16.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 98) and $0x3f, MESSAGE_LEN
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 99) jz .Ldone
// Up to three trailing strides.  KEY and MESSAGE are not advanced in the
// tail; the increasing offsets address the successive strides instead.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 100) _nh_stride K0, K1, K2, K3, 0x00
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 101)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 102) sub $0x10, MESSAGE_LEN
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 103) jz .Ldone
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 104) _nh_stride K1, K2, K3, K0, 0x10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 105)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 106) sub $0x10, MESSAGE_LEN
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 107) jz .Ldone
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 108) _nh_stride K2, K3, K0, K1, 0x20
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 109)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 110) .Ldone:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 111) // Sum the accumulators for each pass, then store the sums to 'hash'
// Each accumulator holds two 64-bit partial sums (A = low lane, B = high
// lane).  The unpacks regroup them so that one paddq adds A + B for two
// passes at once.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 112) movdqa PASS0_SUMS, T0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 113) movdqa PASS2_SUMS, T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 114) punpcklqdq PASS1_SUMS, T0 // => (PASS0_SUM_A PASS1_SUM_A)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 115) punpcklqdq PASS3_SUMS, T1 // => (PASS2_SUM_A PASS3_SUM_A)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 116) punpckhqdq PASS1_SUMS, PASS0_SUMS // => (PASS0_SUM_B PASS1_SUM_B)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 117) punpckhqdq PASS3_SUMS, PASS2_SUMS // => (PASS2_SUM_B PASS3_SUM_B)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 118) paddq PASS0_SUMS, T0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 119) paddq PASS2_SUMS, T1
// Unaligned stores: no alignment is assumed for hash
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 120) movdqu T0, 0x00(HASH)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 121) movdqu T1, 0x10(HASH)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 122) ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 123) SYM_FUNC_END(nh_sse2)