Orange Pi5 kernel

Deprecated Linux kernel 5.10.110 source tree for the Orange Pi 5 / 5B / 5 Plus boards

3 Commits   0 Branches   0 Tags
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   1) ########################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   2) # Implement fast SHA-256 with AVX2 instructions. (x86_64)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   3) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   4) # Copyright (C) 2013 Intel Corporation.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   5) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   6) # Authors:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   7) #     James Guilford <james.guilford@intel.com>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   8) #     Kirk Yap <kirk.s.yap@intel.com>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   9) #     Tim Chen <tim.c.chen@linux.intel.com>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  10) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  11) # This software is available to you under a choice of one of two
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  12) # licenses.  You may choose to be licensed under the terms of the GNU
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  13) # General Public License (GPL) Version 2, available from the file
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  14) # COPYING in the main directory of this source tree, or the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  15) # OpenIB.org BSD license below:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  16) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  17) #     Redistribution and use in source and binary forms, with or
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  18) #     without modification, are permitted provided that the following
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  19) #     conditions are met:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  20) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  21) #      - Redistributions of source code must retain the above
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  22) #        copyright notice, this list of conditions and the following
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  23) #        disclaimer.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  24) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  25) #      - Redistributions in binary form must reproduce the above
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  26) #        copyright notice, this list of conditions and the following
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  27) #        disclaimer in the documentation and/or other materials
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  28) #        provided with the distribution.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  29) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  30) # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  31) # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  32) # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  33) # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  34) # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  35) # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  36) # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  37) # SOFTWARE.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  38) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  39) ########################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  40) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  41) # This code is described in an Intel White-Paper:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  42) # "Fast SHA-256 Implementations on Intel Architecture Processors"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  43) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  44) # To find it, surf to http://www.intel.com/p/en_US/embedded
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  45) # and search for that title.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  46) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  47) ########################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  48) # This code schedules 2 blocks at a time, with 4 lanes per block
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  49) ########################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  50) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  51) #include <linux/linkage.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  52) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  53) ## Input/output buffers are NOT assumed to be aligned, so VMOVDQ maps to the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  53) ## unaligned form (vmovdqu); on AVX2 parts this costs nothing on aligned data.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  54) #define	VMOVDQ vmovdqu
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  55) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  56) ################################ Define Macros
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  57) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  58) # addm [mem], reg
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  59) # Add reg to mem using reg-mem add and store
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  60) .macro addm p1 p2	# p1 = mem, p2 = reg (AT&T src,dst order)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  61) 	add	\p1, \p2	# \p2 += \p1   (reg += mem)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  62) 	mov	\p2, \p1	# \p1 = \p2    (store the sum back to mem)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  63) .endm
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  64) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  65) ################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  66) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  67) X0 = %ymm4	# X0..X3: message-schedule window W[] (see FOUR_ROUNDS_AND_SCHED)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  68) X1 = %ymm5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  69) X2 = %ymm6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  70) X3 = %ymm7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  71) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  72) # XMM versions of above
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  73) XWORD0 = %xmm4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  74) XWORD1 = %xmm5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  75) XWORD2 = %xmm6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  76) XWORD3 = %xmm7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  77) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  78) XTMP0 = %ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  79) XTMP1 = %ymm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  80) XTMP2 = %ymm2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  81) XTMP3 = %ymm3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  82) XTMP4 = %ymm8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  83) XFER  = %ymm9	# K+W transfer values — presumably spilled to the stack _XFER area (store code not in this view)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  84) XTMP5 = %ymm11
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  85) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  86) SHUF_00BA =	%ymm10 # shuffle xBxA -> 00BA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  87) SHUF_DC00 =	%ymm12 # shuffle xDxC -> DC00
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  88) BYTE_FLIP_MASK = %ymm13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  89) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  90) X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  91) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  92) NUM_BLKS = %rdx	# 3rd arg
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  93) INP	= %rsi  # 2nd arg
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  94) CTX	= %rdi	# 1st arg
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  95) c	= %ecx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  96) d	= %r8d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  97) e       = %edx	# clobbers NUM_BLKS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  98) y3	= %esi	# clobbers INP
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  99) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 100) SRND	= CTX	# SRND is same register as CTX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 101) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 102) a = %eax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 103) b = %ebx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 104) f = %r9d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 105) g = %r10d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 106) h = %r11d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 107) old_h = %r11d	# alias of h; ROTATE_ARGS re-points it to the pre-rotation h each round
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 108) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 109) T1 = %r12d	# T1,y0..y2 use SysV callee-saved r12-r15 — assumed saved in the (unseen) prologue
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 110) y0 = %r13d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 111) y1 = %r14d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 112) y2 = %r15d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 113) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 114) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 115) _XFER_SIZE	= 2*64*4	# 2 blocks, 64 rounds, 4 bytes/round
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 116) _XMM_SAVE_SIZE	= 0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 117) _INP_END_SIZE	= 8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 118) _INP_SIZE	= 8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 119) _CTX_SIZE	= 8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 120) _RSP_SIZE	= 8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 121) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 122) _XFER		= 0	# stack-frame layout: offsets below are relative to %rsp after the prologue
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 123) _XMM_SAVE	= _XFER     + _XFER_SIZE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 124) _INP_END	= _XMM_SAVE + _XMM_SAVE_SIZE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 125) _INP		= _INP_END  + _INP_END_SIZE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 126) _CTX		= _INP      + _INP_SIZE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 127) _RSP		= _CTX      + _CTX_SIZE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 128) STACK_SIZE	= _RSP      + _RSP_SIZE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 129) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 130) # rotate_Xs
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 131) # Rotate values of symbols X0...X3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 132) .macro rotate_Xs	# assembly-time symbol renaming only; emits no instructions
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 133) 	X_ = X0		# X_ = temp; cycle X0 <- X1 <- X2 <- X3 <- old X0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 134) 	X0 = X1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 135) 	X1 = X2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 136) 	X2 = X3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 137) 	X3 = X_
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 138) .endm
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 139) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 140) # ROTATE_ARGS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 141) # Rotate values of symbols a...h
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 142) .macro ROTATE_ARGS	# assembly-time renaming of SHA-256 working vars a..h; emits no instructions
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 143) 	old_h = h	# keep a name for the pre-rotation h (used for the delayed adds in DO_4ROUNDS)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 144) 	TMP_ = h
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 145) 	h = g		# each var takes the register of its predecessor: h<-g<-f<-e<-d<-c<-b<-a<-old h
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 146) 	g = f
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 147) 	f = e
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 148) 	e = d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 149) 	d = c
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 150) 	c = b
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 151) 	b = a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 152) 	a = TMP_
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 153) .endm
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 154) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 155) .macro FOUR_ROUNDS_AND_SCHED disp	# 4 rounds interleaved with scheduling the next 4 W[] words; disp = byte offset of this group's K+W data in the stack _XFER area, indexed by SRND
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 156) ################################### RND N + 0 ############################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 157) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 158) 	mov	a, y3		# y3 = a                                # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 159) 	rorx	$25, e, y0	# y0 = e >> 25				# S1A
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 160) 	rorx	$11, e, y1	# y1 = e >> 11				# S1B
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 161) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 162) 	addl	\disp(%rsp, SRND), h		# h = k + w + h         # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 163) 	or	c, y3		# y3 = a|c                              # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 164) 	vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 165) 	mov	f, y2		# y2 = f                                # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 166) 	rorx	$13, a, T1	# T1 = a >> 13				# S0B
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 167) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 168) 	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 169) 	xor	g, y2		# y2 = f^g                              # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 170) 	vpaddd	X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 171) 	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 172) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 173) 	and	e, y2		# y2 = (f^g)&e                          # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 174) 	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 175) 	rorx	$22, a, y1	# y1 = a >> 22				# S0A
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 176) 	add	h, d		# d = k + w + h + d                     # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 177) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 178) 	and	b, y3		# y3 = (a|c)&b                          # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 179) 	vpalignr $4, X0, X1, XTMP1	# XTMP1 = W[-15]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 180) 	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 181) 	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 182) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 183) 	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 184) 	vpsrld	$7, XTMP1, XTMP2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 185) 	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 186) 	mov	a, T1		# T1 = a                                # MAJB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 187) 	and	c, T1		# T1 = a&c                              # MAJB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 188) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 189) 	add	y0, y2		# y2 = S1 + CH                          # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 190) 	vpslld	$(32-7), XTMP1, XTMP3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 191) 	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 192) 	add	y1, h		# h = k + w + h + S0                    # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 193) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 194) 	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 195) 	vpor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 196) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 197) 	vpsrld	$18, XTMP1, XTMP2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 198) 	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 199) 	add	y3, h		# h = t1 + S0 + MAJ                     # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 200) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 201) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 202) 	ROTATE_ARGS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 203) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 204) ################################### RND N + 1 ############################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 205) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 206) 	mov	a, y3		# y3 = a                                # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 207) 	rorx	$25, e, y0	# y0 = e >> 25				# S1A
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 208) 	rorx	$11, e, y1	# y1 = e >> 11				# S1B
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 209) 	offset = \disp + 1*4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 210) 	addl	offset(%rsp, SRND), h	# h = k + w + h         # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 211) 	or	c, y3		# y3 = a|c                              # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 212) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 213) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 214) 	vpsrld	$3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 215) 	mov	f, y2		# y2 = f                                # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 216) 	rorx	$13, a, T1	# T1 = a >> 13				# S0B
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 217) 	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 218) 	xor	g, y2		# y2 = f^g                              # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 219) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 220) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 221) 	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 222) 	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 223) 	rorx	$22, a, y1	# y1 = a >> 22				# S0A
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 224) 	and	e, y2		# y2 = (f^g)&e                          # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 225) 	add	h, d		# d = k + w + h + d                     # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 226) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 227) 	vpslld	$(32-18), XTMP1, XTMP1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 228) 	and	b, y3		# y3 = (a|c)&b                          # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 229) 	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 230) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 231) 	vpxor	XTMP1, XTMP3, XTMP3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 232) 	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 233) 	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 234) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 235) 	vpxor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 236) 	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 237) 	mov	a, T1		# T1 = a                                # MAJB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 238) 	and	c, T1		# T1 = a&c                              # MAJB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 239) 	add	y0, y2		# y2 = S1 + CH                          # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 240) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 241) 	vpxor	XTMP4, XTMP3, XTMP1	# XTMP1 = s0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 242) 	vpshufd	$0b11111010, X3, XTMP2	# XTMP2 = W[-2] {BBAA}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 243) 	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 244) 	add	y1, h		# h = k + w + h + S0                    # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 245) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 246) 	vpaddd	XTMP1, XTMP0, XTMP0	# XTMP0 = W[-16] + W[-7] + s0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 247) 	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 248) 	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 249) 	add	y3, h		# h = t1 + S0 + MAJ                     # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 250) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 251) 	vpsrld	$10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 252) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 253) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 254) 	ROTATE_ARGS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 255) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 256) ################################### RND N + 2 ############################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 257) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 258) 	mov	a, y3		# y3 = a                                # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 259) 	rorx	$25, e, y0	# y0 = e >> 25				# S1A
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 260) 	offset = \disp + 2*4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 261) 	addl	offset(%rsp, SRND), h	# h = k + w + h         # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 262) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 263) 	vpsrlq	$19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 264) 	rorx	$11, e, y1	# y1 = e >> 11				# S1B
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 265) 	or	c, y3		# y3 = a|c                              # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 266) 	mov	f, y2		# y2 = f                                # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 267) 	xor	g, y2		# y2 = f^g                              # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 268) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 269) 	rorx	$13, a, T1	# T1 = a >> 13				# S0B
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 270) 	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 271) 	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xBxA}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 272) 	and	e, y2		# y2 = (f^g)&e                          # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 273) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 274) 	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 275) 	vpxor	XTMP3, XTMP2, XTMP2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 276) 	add	h, d		# d = k + w + h + d                     # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 277) 	and	b, y3		# y3 = (a|c)&b                          # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 278) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 279) 	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 280) 	rorx	$22, a, y1	# y1 = a >> 22				# S0A
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 281) 	vpxor	XTMP2, XTMP4, XTMP4	# XTMP4 = s1 {xBxA}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 282) 	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 283) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 284) 	vpshufb	SHUF_00BA, XTMP4, XTMP4	# XTMP4 = s1 {00BA}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 285) 	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 286) 	rorx	$2, a ,T1	# T1 = (a >> 2)				# S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 287) 	vpaddd	XTMP4, XTMP0, XTMP0	# XTMP0 = {..., ..., W[1], W[0]}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 288) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 289) 	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 290) 	mov	a, T1		# T1 = a                                # MAJB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 291) 	and	c, T1		# T1 = a&c                              # MAJB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 292) 	add	y0, y2		# y2 = S1 + CH                          # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 293) 	vpshufd	$0b01010000, XTMP0, XTMP2	# XTMP2 = W[-2] {DDCC}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 294) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 295) 	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 296) 	add	y1,h		# h = k + w + h + S0                    # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 297) 	add	y2,d		# d = k + w + h + d + S1 + CH = d + t1  # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 298) 	add	y2,h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 299) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 300) 	add	y3,h		# h = t1 + S0 + MAJ                     # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 301) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 302) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 303) 	ROTATE_ARGS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 304) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 305) ################################### RND N + 3 ############################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 306) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 307) 	mov	a, y3		# y3 = a                                # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 308) 	rorx	$25, e, y0	# y0 = e >> 25				# S1A
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 309) 	rorx	$11, e, y1	# y1 = e >> 11				# S1B
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 310) 	offset = \disp + 3*4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 311) 	addl	offset(%rsp, SRND), h	# h = k + w + h         # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 312) 	or	c, y3		# y3 = a|c                              # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 313) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 314) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 315) 	vpsrld	$10, XTMP2, XTMP5	# XTMP5 = W[-2] >> 10 {DDCC}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 316) 	mov	f, y2		# y2 = f                                # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 317) 	rorx	$13, a, T1	# T1 = a >> 13				# S0B
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 318) 	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 319) 	xor	g, y2		# y2 = f^g                              # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 320) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 321) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 322) 	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xDxC}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 323) 	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 324) 	and	e, y2		# y2 = (f^g)&e                          # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 325) 	add	h, d		# d = k + w + h + d                     # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 326) 	and	b, y3		# y3 = (a|c)&b                          # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 327) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 328) 	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xDxC}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 329) 	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 330) 	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 331) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 332) 	vpxor	XTMP3, XTMP2, XTMP2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 333) 	rorx	$22, a, y1	# y1 = a >> 22				# S0A
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 334) 	add	y0, y2		# y2 = S1 + CH                          # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 335) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 336) 	vpxor	XTMP2, XTMP5, XTMP5	# XTMP5 = s1 {xDxC}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 337) 	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 338) 	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 339) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 340) 	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 341) 	vpshufb	SHUF_DC00, XTMP5, XTMP5	# XTMP5 = s1 {DC00}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 342) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 343) 	vpaddd	XTMP0, XTMP5, X0	# X0 = {W[3], W[2], W[1], W[0]}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 344) 	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 345) 	mov	a, T1		# T1 = a                                # MAJB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 346) 	and	c, T1		# T1 = a&c                              # MAJB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 347) 	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 348) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 349) 	add	y1, h		# h = k + w + h + S0                    # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 350) 	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 351) 	add	y3, h		# h = t1 + S0 + MAJ                     # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 352) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 353) 	ROTATE_ARGS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 354) 	rotate_Xs
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 355) .endm
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 356) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 357) .macro DO_4ROUNDS disp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 358) ################################### RND N + 0 ###########################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 359) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 360) 	mov	f, y2		# y2 = f                                # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 361) 	rorx	$25, e, y0	# y0 = e >> 25				# S1A
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 362) 	rorx	$11, e, y1	# y1 = e >> 11				# S1B
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 363) 	xor	g, y2		# y2 = f^g                              # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 364) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 365) 	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 366) 	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 367) 	and	e, y2		# y2 = (f^g)&e                          # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 368) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 369) 	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 370) 	rorx	$13, a, T1	# T1 = a >> 13				# S0B
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 371) 	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 372) 	rorx	$22, a, y1	# y1 = a >> 22				# S0A
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 373) 	mov	a, y3		# y3 = a                                # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 374) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 375) 	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 376) 	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 377) 	addl	\disp(%rsp, SRND), h		# h = k + w + h # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 378) 	or	c, y3		# y3 = a|c                              # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 379) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 380) 	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 381) 	mov	a, T1		# T1 = a                                # MAJB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 382) 	and	b, y3		# y3 = (a|c)&b                          # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 383) 	and	c, T1		# T1 = a&c                              # MAJB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 384) 	add	y0, y2		# y2 = S1 + CH                          # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 385) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 386) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 387) 	add	h, d		# d = k + w + h + d                     # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 388) 	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c)             # MAJ
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 389) 	add	y1, h		# h = k + w + h + S0                    # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 390) 	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 391) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 392) 	ROTATE_ARGS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 393) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 394) ################################### RND N + 1 ###########################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 395) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 396) 	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 397) 	mov	f, y2		# y2 = f                                # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 398) 	rorx	$25, e, y0	# y0 = e >> 25				# S1A
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 399) 	rorx	$11, e, y1	# y1 = e >> 11				# S1B
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 400) 	xor	g, y2		# y2 = f^g                              # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 401) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 402) 	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 403) 	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 404) 	and	e, y2		# y2 = (f^g)&e                          # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 405) 	add	y3, old_h	# h = t1 + S0 + MAJ                     # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 406) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 407) 	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 408) 	rorx	$13, a, T1	# T1 = a >> 13				# S0B
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 409) 	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 410) 	rorx	$22, a, y1	# y1 = a >> 22				# S0A
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 411) 	mov	a, y3		# y3 = a                                # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 412) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 413) 	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 414) 	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 415) 	offset = 4*1 + \disp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 416) 	addl	offset(%rsp, SRND), h		# h = k + w + h # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 417) 	or	c, y3		# y3 = a|c                              # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 418) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 419) 	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 420) 	mov	a, T1		# T1 = a                                # MAJB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 421) 	and	b, y3		# y3 = (a|c)&b                          # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 422) 	and	c, T1		# T1 = a&c                              # MAJB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 423) 	add	y0, y2		# y2 = S1 + CH                          # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 424) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 425) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 426) 	add	h, d		# d = k + w + h + d                     # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 427) 	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c)             # MAJ
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 428) 	add	y1, h		# h = k + w + h + S0                    # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 429) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 430) 	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 431) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 432) 	ROTATE_ARGS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 433) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 434) ################################### RND N + 2 ##############################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 435) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 436) 	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 437) 	mov	f, y2		# y2 = f                                # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 438) 	rorx	$25, e, y0	# y0 = e >> 25				# S1A
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 439) 	rorx	$11, e, y1	# y1 = e >> 11				# S1B
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 440) 	xor	g, y2		# y2 = f^g                              # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 441) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 442) 	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 443) 	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 444) 	and	e, y2		# y2 = (f^g)&e                          # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 445) 	add	y3, old_h	# h = t1 + S0 + MAJ                     # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 446) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 447) 	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 448) 	rorx	$13, a, T1	# T1 = a >> 13				# S0B
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 449) 	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 450) 	rorx	$22, a, y1	# y1 = a >> 22				# S0A
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 451) 	mov	a, y3		# y3 = a                                # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 452) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 453) 	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 454) 	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 455) 	offset = 4*2 + \disp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 456) 	addl	offset(%rsp, SRND), h		# h = k + w + h # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 457) 	or	c, y3		# y3 = a|c                              # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 458) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 459) 	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 460) 	mov	a, T1		# T1 = a                                # MAJB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 461) 	and	b, y3		# y3 = (a|c)&b                          # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 462) 	and	c, T1		# T1 = a&c                              # MAJB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 463) 	add	y0, y2		# y2 = S1 + CH                          # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 464) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 465) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 466) 	add	h, d		# d = k + w + h + d                     # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 467) 	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c)             # MAJ
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 468) 	add	y1, h		# h = k + w + h + S0                    # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 469) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 470) 	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 471) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 472) 	ROTATE_ARGS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 473) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 474) ################################### RND N + 3 ###########################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 475) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 476) 	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 477) 	mov	f, y2		# y2 = f                                # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 478) 	rorx	$25, e, y0	# y0 = e >> 25				# S1A
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 479) 	rorx	$11, e, y1	# y1 = e >> 11				# S1B
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 480) 	xor	g, y2		# y2 = f^g                              # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 481) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 482) 	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 483) 	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 484) 	and	e, y2		# y2 = (f^g)&e                          # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 485) 	add	y3, old_h	# h = t1 + S0 + MAJ                     # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 486) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 487) 	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 488) 	rorx	$13, a, T1	# T1 = a >> 13				# S0B
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 489) 	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 490) 	rorx	$22, a, y1	# y1 = a >> 22				# S0A
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 491) 	mov	a, y3		# y3 = a                                # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 492) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 493) 	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 494) 	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 495) 	offset = 4*3 + \disp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 496) 	addl	offset(%rsp, SRND), h		# h = k + w + h # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 497) 	or	c, y3		# y3 = a|c                              # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 498) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 499) 	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 500) 	mov	a, T1		# T1 = a                                # MAJB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 501) 	and	b, y3		# y3 = (a|c)&b                          # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 502) 	and	c, T1		# T1 = a&c                              # MAJB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 503) 	add	y0, y2		# y2 = S1 + CH                          # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 504) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 505) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 506) 	add	h, d		# d = k + w + h + d                     # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 507) 	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c)             # MAJ
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 508) 	add	y1, h		# h = k + w + h + S0                    # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 509) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 510) 	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 511) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 512) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 513) 	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 514) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 515) 	add	y3, h		# h = t1 + S0 + MAJ                     # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 516) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 517) 	ROTATE_ARGS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 518) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 519) .endm
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 520) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 521) ########################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 522) ## void sha256_transform_rorx(struct sha256_state *state, const u8 *data, int blocks)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 523) ## arg 1 : pointer to state
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 524) ## arg 2 : pointer to input data
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 525) ## arg 3 : Num blocks
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 526) ########################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 527) .text
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 528) SYM_FUNC_START(sha256_transform_rorx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 529) .align 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 530) 	pushq	%rbx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 531) 	pushq	%r12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 532) 	pushq	%r13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 533) 	pushq	%r14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 534) 	pushq	%r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 535) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 536) 	mov	%rsp, %rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 537) 	subq	$STACK_SIZE, %rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 538) 	and	$-32, %rsp	# align rsp to 32 byte boundary
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 539) 	mov	%rax, _RSP(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 540) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 541) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 542) 	shl	$6, NUM_BLKS	# convert to bytes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 543) 	jz	done_hash
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 544) 	lea	-64(INP, NUM_BLKS), NUM_BLKS # pointer to last block
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 545) 	mov	NUM_BLKS, _INP_END(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 546) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 547) 	cmp	NUM_BLKS, INP
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 548) 	je	only_one_block
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 549) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 550) 	## load initial digest
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 551) 	mov	(CTX), a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 552) 	mov	4*1(CTX), b
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 553) 	mov	4*2(CTX), c
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 554) 	mov	4*3(CTX), d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 555) 	mov	4*4(CTX), e
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 556) 	mov	4*5(CTX), f
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 557) 	mov	4*6(CTX), g
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 558) 	mov	4*7(CTX), h
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 559) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 560) 	vmovdqa  PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 561) 	vmovdqa  _SHUF_00BA(%rip), SHUF_00BA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 562) 	vmovdqa  _SHUF_DC00(%rip), SHUF_DC00
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 563) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 564) 	mov	CTX, _CTX(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 565) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 566) loop0:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 567) 	## Load first 16 dwords from two blocks
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 568) 	VMOVDQ	0*32(INP),XTMP0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 569) 	VMOVDQ	1*32(INP),XTMP1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 570) 	VMOVDQ	2*32(INP),XTMP2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 571) 	VMOVDQ	3*32(INP),XTMP3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 572) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 573) 	## byte swap data
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 574) 	vpshufb	BYTE_FLIP_MASK, XTMP0, XTMP0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 575) 	vpshufb	BYTE_FLIP_MASK, XTMP1, XTMP1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 576) 	vpshufb	BYTE_FLIP_MASK, XTMP2, XTMP2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 577) 	vpshufb	BYTE_FLIP_MASK, XTMP3, XTMP3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 578) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 579) 	## transpose data into high/low halves
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 580) 	vperm2i128	$0x20, XTMP2, XTMP0, X0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 581) 	vperm2i128	$0x31, XTMP2, XTMP0, X1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 582) 	vperm2i128	$0x20, XTMP3, XTMP1, X2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 583) 	vperm2i128	$0x31, XTMP3, XTMP1, X3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 584) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 585) last_block_enter:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 586) 	add	$64, INP
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 587) 	mov	INP, _INP(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 588) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 589) 	## schedule 48 input dwords, by doing 3 rounds of 12 each
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 590) 	xor	SRND, SRND
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 591) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 592) .align 16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 593) loop1:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 594) 	vpaddd	K256+0*32(SRND), X0, XFER
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 595) 	vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 596) 	FOUR_ROUNDS_AND_SCHED	_XFER + 0*32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 597) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 598) 	vpaddd	K256+1*32(SRND), X0, XFER
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 599) 	vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 600) 	FOUR_ROUNDS_AND_SCHED	_XFER + 1*32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 601) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 602) 	vpaddd	K256+2*32(SRND), X0, XFER
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 603) 	vmovdqa XFER, 2*32+_XFER(%rsp, SRND)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 604) 	FOUR_ROUNDS_AND_SCHED	_XFER + 2*32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 605) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 606) 	vpaddd	K256+3*32(SRND), X0, XFER
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 607) 	vmovdqa XFER, 3*32+_XFER(%rsp, SRND)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 608) 	FOUR_ROUNDS_AND_SCHED	_XFER + 3*32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 609) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 610) 	add	$4*32, SRND
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 611) 	cmp	$3*4*32, SRND
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 612) 	jb	loop1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 613) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 614) loop2:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 615) 	## Do last 16 rounds with no scheduling
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 616) 	vpaddd	K256+0*32(SRND), X0, XFER
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 617) 	vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 618) 	DO_4ROUNDS	_XFER + 0*32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 619) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 620) 	vpaddd	K256+1*32(SRND), X1, XFER
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 621) 	vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 622) 	DO_4ROUNDS	_XFER + 1*32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 623) 	add	$2*32, SRND
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 624) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 625) 	vmovdqa	X2, X0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 626) 	vmovdqa	X3, X1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 627) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 628) 	cmp	$4*4*32, SRND
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 629) 	jb	loop2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 630) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 631) 	mov	_CTX(%rsp), CTX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 632) 	mov	_INP(%rsp), INP
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 633) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 634) 	addm    (4*0)(CTX),a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 635) 	addm    (4*1)(CTX),b
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 636) 	addm    (4*2)(CTX),c
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 637) 	addm    (4*3)(CTX),d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 638) 	addm    (4*4)(CTX),e
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 639) 	addm    (4*5)(CTX),f
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 640) 	addm    (4*6)(CTX),g
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 641) 	addm    (4*7)(CTX),h
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 642) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 643) 	cmp	_INP_END(%rsp), INP
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 644) 	ja	done_hash
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 645) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 646) 	#### Do second block using previously scheduled results
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 647) 	xor	SRND, SRND
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 648) .align 16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 649) loop3:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 650) 	DO_4ROUNDS	 _XFER + 0*32 + 16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 651) 	DO_4ROUNDS	 _XFER + 1*32 + 16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 652) 	add	$2*32, SRND
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 653) 	cmp	$4*4*32, SRND
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 654) 	jb	loop3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 655) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 656) 	mov	_CTX(%rsp), CTX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 657) 	mov	_INP(%rsp), INP
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 658) 	add	$64, INP
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 659) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 660) 	addm    (4*0)(CTX),a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 661) 	addm    (4*1)(CTX),b
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 662) 	addm    (4*2)(CTX),c
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 663) 	addm    (4*3)(CTX),d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 664) 	addm    (4*4)(CTX),e
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 665) 	addm    (4*5)(CTX),f
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 666) 	addm    (4*6)(CTX),g
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 667) 	addm    (4*7)(CTX),h
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 668) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 669) 	cmp	_INP_END(%rsp), INP
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 670) 	jb	loop0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 671) 	ja	done_hash
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 672) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 673) do_last_block:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 674) 	VMOVDQ	0*16(INP),XWORD0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 675) 	VMOVDQ	1*16(INP),XWORD1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 676) 	VMOVDQ	2*16(INP),XWORD2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 677) 	VMOVDQ	3*16(INP),XWORD3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 678) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 679) 	vpshufb	X_BYTE_FLIP_MASK, XWORD0, XWORD0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 680) 	vpshufb	X_BYTE_FLIP_MASK, XWORD1, XWORD1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 681) 	vpshufb	X_BYTE_FLIP_MASK, XWORD2, XWORD2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 682) 	vpshufb	X_BYTE_FLIP_MASK, XWORD3, XWORD3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 683) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 684) 	jmp	last_block_enter
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 685) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 686) only_one_block:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 687) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 688) 	## load initial digest
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 689) 	mov	(4*0)(CTX),a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 690) 	mov	(4*1)(CTX),b
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 691) 	mov	(4*2)(CTX),c
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 692) 	mov	(4*3)(CTX),d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 693) 	mov	(4*4)(CTX),e
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 694) 	mov	(4*5)(CTX),f
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 695) 	mov	(4*6)(CTX),g
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 696) 	mov	(4*7)(CTX),h
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 697) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 698) 	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 699) 	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 700) 	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 701) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 702) 	mov	CTX, _CTX(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 703) 	jmp	do_last_block
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 704) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 705) done_hash:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 706) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 707) 	mov	_RSP(%rsp), %rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 708) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 709) 	popq	%r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 710) 	popq	%r14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 711) 	popq	%r13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 712) 	popq	%r12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 713) 	popq	%rbx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 714) 	ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 715) SYM_FUNC_END(sha256_transform_rorx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 716) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 717) .section	.rodata.cst512.K256, "aM", @progbits, 512
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 718) .align 64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 719) K256:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 720) 	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 721) 	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 722) 	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 723) 	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 724) 	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 725) 	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 726) 	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 727) 	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 728) 	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 729) 	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 730) 	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 731) 	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 732) 	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 733) 	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 734) 	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 735) 	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 736) 	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 737) 	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 738) 	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 739) 	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 740) 	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 741) 	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 742) 	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 743) 	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 744) 	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 745) 	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 746) 	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 747) 	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 748) 	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 749) 	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 750) 	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 751) 	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 752) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 753) .section	.rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 754) .align 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 755) PSHUFFLE_BYTE_FLIP_MASK:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 756) 	.octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 757) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 758) # shuffle xBxA -> 00BA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 759) .section	.rodata.cst32._SHUF_00BA, "aM", @progbits, 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 760) .align 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 761) _SHUF_00BA:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 762) 	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 763) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 764) # shuffle xDxC -> DC00
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 765) .section	.rodata.cst32._SHUF_DC00, "aM", @progbits, 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 766) .align 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 767) _SHUF_DC00:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 768) 	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF