Orange Pi5 kernel

Deprecated Linux kernel 5.10.110 for OrangePi 5/5B/5+ boards

3 Commits   0 Branches   0 Tags
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   1) /* SPDX-License-Identifier: GPL-2.0 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   2) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   3)  * NH - ε-almost-universal hash function, x86_64 AVX2 accelerated
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   4)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   5)  * Copyright 2018 Google LLC
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   6)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   7)  * Author: Eric Biggers <ebiggers@google.com>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   8)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   9) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  10) #include <linux/linkage.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  11) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  12) #define		PASS0_SUMS	%ymm0	/* PASSn_SUMS: 64-bit lane accumulators, one register per pass */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  13) #define		PASS1_SUMS	%ymm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  14) #define		PASS2_SUMS	%ymm2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  15) #define		PASS3_SUMS	%ymm3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  16) #define		K0		%ymm4	/* K0-K3: key words loaded from KEY */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  17) #define		K0_XMM		%xmm4	/* *_XMM: low 128 bits of the matching ymm register */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  18) #define		K1		%ymm5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  19) #define		K1_XMM		%xmm5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  20) #define		K2		%ymm6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  21) #define		K2_XMM		%xmm6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  22) #define		K3		%ymm7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  23) #define		K3_XMM		%xmm7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  24) #define		T0		%ymm8	/* T0-T7: scratch registers used by _nh_2xstride and the final reduction */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  25) #define		T1		%ymm9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  26) #define		T2		%ymm10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  27) #define		T2_XMM		%xmm10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  28) #define		T3		%ymm11	/* also carries the message block into _nh_2xstride */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  29) #define		T3_XMM		%xmm11
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  30) #define		T4		%ymm12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  31) #define		T5		%ymm13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  32) #define		T6		%ymm14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  33) #define		T7		%ymm15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  34) #define		KEY		%rdi	/* arg 1 of nh_avx2: const u32 *key */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  35) #define		MESSAGE		%rsi	/* arg 2 of nh_avx2: const u8 *message */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  36) #define		MESSAGE_LEN	%rdx	/* arg 3 of nh_avx2: size_t message_len, counted in bytes */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  37) #define		HASH		%rcx	/* arg 4 of nh_avx2: u8 hash[NH_HASH_BYTES], written at the end */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  38) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  39) .macro _nh_2xstride	k0, k1, k2, k3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  40) 	// In: T3 = message block, k0..k3 = key vectors for passes 0..3.  Clobbers T0-T7 and adds into PASS0..3_SUMS.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  41) 	// Add message words to key words (32-bit lanes, wrapping)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  42) 	vpaddd		\k0, T3, T0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  43) 	vpaddd		\k1, T3, T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  44) 	vpaddd		\k2, T3, T2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  45) 	vpaddd		\k3, T3, T3	// must be last: overwrites the message held in T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  46) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  47) 	// Multiply 32x32 => 64 and accumulate
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  48) 	vpshufd		$0x10, T0, T4	// even dwords of T4 = words 0 and 1 of each 128-bit lane
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  49) 	vpshufd		$0x32, T0, T0	// even dwords of T0 = words 2 and 3 of each 128-bit lane
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  50) 	vpshufd		$0x10, T1, T5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  51) 	vpshufd		$0x32, T1, T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  52) 	vpshufd		$0x10, T2, T6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  53) 	vpshufd		$0x32, T2, T2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  54) 	vpshufd		$0x10, T3, T7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  55) 	vpshufd		$0x32, T3, T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  56) 	vpmuludq	T4, T0, T0	// unsigned products of the even dwords: w0*w2 and w1*w3 per lane
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  57) 	vpmuludq	T5, T1, T1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  58) 	vpmuludq	T6, T2, T2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  59) 	vpmuludq	T7, T3, T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  60) 	vpaddq		T0, PASS0_SUMS, PASS0_SUMS	// 64-bit adds into the per-pass accumulators
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  61) 	vpaddq		T1, PASS1_SUMS, PASS1_SUMS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  62) 	vpaddq		T2, PASS2_SUMS, PASS2_SUMS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  63) 	vpaddq		T3, PASS3_SUMS, PASS3_SUMS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  64) .endm
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  65) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  66) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  67)  * void nh_avx2(const u32 *key, const u8 *message, size_t message_len,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  68)  *		u8 hash[NH_HASH_BYTES])
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  69)  * Accumulates four passes in parallel and stores the four 64-bit sums to hash.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  70)  * It's guaranteed that message_len % 16 == 0.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  71)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  72) SYM_FUNC_START(nh_avx2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  73) 	// Preload the key for passes 0 and 1, then zero the four accumulators.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  74) 	vmovdqu		0x00(KEY), K0	// 32 key bytes at offset 0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  75) 	vmovdqu		0x10(KEY), K1	// 32 key bytes at offset 16 (overlaps K0 by 16 bytes)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  76) 	add		$0x20, KEY	// KEY now points at the bytes that will be loaded into K2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  77) 	vpxor		PASS0_SUMS, PASS0_SUMS, PASS0_SUMS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  78) 	vpxor		PASS1_SUMS, PASS1_SUMS, PASS1_SUMS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  79) 	vpxor		PASS2_SUMS, PASS2_SUMS, PASS2_SUMS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  80) 	vpxor		PASS3_SUMS, PASS3_SUMS, PASS3_SUMS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  81) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  82) 	sub		$0x40, MESSAGE_LEN	// bias the length by -64: negative means no full 64-byte chunk is left
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  83) 	jl		.Lloop4_done
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  84) .Lloop4:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  85) 	vmovdqu		(MESSAGE), T3	// message bytes 0-31 of this 64-byte chunk
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  86) 	vmovdqu		0x00(KEY), K2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  87) 	vmovdqu		0x10(KEY), K3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  88) 	_nh_2xstride	K0, K1, K2, K3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  89) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  90) 	vmovdqu		0x20(MESSAGE), T3	// message bytes 32-63 of this 64-byte chunk
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  91) 	vmovdqu		0x20(KEY), K0	// reload K0/K1 so the key registers rotate to (K2, K3, K0, K1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  92) 	vmovdqu		0x30(KEY), K1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  93) 	_nh_2xstride	K2, K3, K0, K1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  94) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  95) 	add		$0x40, MESSAGE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  96) 	add		$0x40, KEY
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  97) 	sub		$0x40, MESSAGE_LEN
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  98) 	jge		.Lloop4	// loop while another full 64 bytes remain
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  99) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 100) .Lloop4_done:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 101) 	and		$0x3f, MESSAGE_LEN	// undo the bias: leftover bytes, a multiple of 16 below 64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 102) 	jz		.Ldone
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 103) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 104) 	cmp		$0x20, MESSAGE_LEN
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 105) 	jl		.Llast	// fewer than 32 bytes left: only a single stride remains
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 106) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 107) 	// 2 or 3 strides remain; do 2 more.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 108) 	vmovdqu		(MESSAGE), T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 109) 	vmovdqu		0x00(KEY), K2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 110) 	vmovdqu		0x10(KEY), K3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 111) 	_nh_2xstride	K0, K1, K2, K3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 112) 	add		$0x20, MESSAGE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 113) 	add		$0x20, KEY
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 114) 	sub		$0x20, MESSAGE_LEN
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 115) 	jz		.Ldone
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 116) 	vmovdqa		K2, K0	// slide the key window so K0/K1 feed passes 0/1 for the final stride
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 117) 	vmovdqa		K3, K1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 118) .Llast:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 119) 	// Last stride.  Zero the high 128 bits of the message and keys so they
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 120) 	// don't affect the result when processing them like 2 strides.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 121) 	vmovdqu		(MESSAGE), T3_XMM	// 16-byte load into the low half of T3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 122) 	vmovdqa		K0_XMM, K0_XMM	// 128-bit self-move keeps the low half of K0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 123) 	vmovdqa		K1_XMM, K1_XMM
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 124) 	vmovdqu		0x00(KEY), K2_XMM
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 125) 	vmovdqu		0x10(KEY), K3_XMM
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 126) 	_nh_2xstride	K0, K1, K2, K3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 127) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 128) .Ldone:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 129) 	// Sum the accumulators for each pass, then store the sums to 'hash'
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 130) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 131) 	// PASS0_SUMS is (0A 0B 0C 0D)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 132) 	// PASS1_SUMS is (1A 1B 1C 1D)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 133) 	// PASS2_SUMS is (2A 2B 2C 2D)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 134) 	// PASS3_SUMS is (3A 3B 3C 3D)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 135) 	// We need the horizontal sums:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 136) 	//     (0A + 0B + 0C + 0D,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 137) 	//	1A + 1B + 1C + 1D,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 138) 	//	2A + 2B + 2C + 2D,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 139) 	//	3A + 3B + 3C + 3D)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 140) 	// Done by regrouping the 64-bit lanes by letter (A, B, C, D) and adding the four vectors.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 141) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 142) 	vpunpcklqdq	PASS1_SUMS, PASS0_SUMS, T0	// T0 = (0A 1A 0C 1C)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 143) 	vpunpckhqdq	PASS1_SUMS, PASS0_SUMS, T1	// T1 = (0B 1B 0D 1D)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 144) 	vpunpcklqdq	PASS3_SUMS, PASS2_SUMS, T2	// T2 = (2A 3A 2C 3C)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 145) 	vpunpckhqdq	PASS3_SUMS, PASS2_SUMS, T3	// T3 = (2B 3B 2D 3D)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 146) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 147) 	vinserti128	$0x1, T2_XMM, T0, T4		// T4 = (0A 1A 2A 3A)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 148) 	vinserti128	$0x1, T3_XMM, T1, T5		// T5 = (0B 1B 2B 3B)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 149) 	vperm2i128	$0x31, T2, T0, T0		// T0 = (0C 1C 2C 3C)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 150) 	vperm2i128	$0x31, T3, T1, T1		// T1 = (0D 1D 2D 3D)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 151) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 152) 	vpaddq		T5, T4, T4	// A + B
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 153) 	vpaddq		T1, T0, T0	// C + D
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 154) 	vpaddq		T4, T0, T0	// (A + B) + (C + D): one 64-bit total per pass
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 155) 	vmovdqu		T0, (HASH)	// unaligned 32-byte store of the four totals
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 156) 	ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 157) SYM_FUNC_END(nh_avx2)