Orange Pi5 kernel

Deprecated Linux kernel 5.10.110 for OrangePi 5/5B/5+ boards

3 Commits   0 Branches   0 Tags
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   1) ########################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   2) # Implement fast SHA-512 with AVX2 instructions. (x86_64)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   3) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   4) # Copyright (C) 2013 Intel Corporation.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   5) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   6) # Authors:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   7) #     James Guilford <james.guilford@intel.com>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   8) #     Kirk Yap <kirk.s.yap@intel.com>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   9) #     David Cote <david.m.cote@intel.com>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  10) #     Tim Chen <tim.c.chen@linux.intel.com>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  11) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  12) # This software is available to you under a choice of one of two
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  13) # licenses.  You may choose to be licensed under the terms of the GNU
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  14) # General Public License (GPL) Version 2, available from the file
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  15) # COPYING in the main directory of this source tree, or the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  16) # OpenIB.org BSD license below:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  17) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  18) #     Redistribution and use in source and binary forms, with or
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  19) #     without modification, are permitted provided that the following
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  20) #     conditions are met:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  21) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  22) #      - Redistributions of source code must retain the above
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  23) #        copyright notice, this list of conditions and the following
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  24) #        disclaimer.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  25) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  26) #      - Redistributions in binary form must reproduce the above
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  27) #        copyright notice, this list of conditions and the following
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  28) #        disclaimer in the documentation and/or other materials
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  29) #        provided with the distribution.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  30) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  31) # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  32) # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  33) # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  34) # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  35) # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  36) # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  37) # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  38) # SOFTWARE.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  39) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  40) ########################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  41) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  42) # This code is described in an Intel White-Paper:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  43) # "Fast SHA-512 Implementations on Intel Architecture Processors"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  44) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  45) # To find it, surf to http://www.intel.com/p/en_US/embedded
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  46) # and search for that title.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  47) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  48) ########################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  49) # This code schedules 1 block at a time, with 4 lanes per block
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  50) ########################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  51) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  52) #include <linux/linkage.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  53) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  54) .text
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  55) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  56) # Virtual registers: Y_0..Y_3 are assembler symbols that rotate_Ys re-binds; YTMP0..YTMP4 are temporaries (XFER aliases YTMP0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  57) Y_0 = %ymm4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  58) Y_1 = %ymm5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  59) Y_2 = %ymm6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  60) Y_3 = %ymm7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  61) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  62) YTMP0 = %ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  63) YTMP1 = %ymm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  64) YTMP2 = %ymm2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  65) YTMP3 = %ymm3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  66) YTMP4 = %ymm8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  67) XFER  = YTMP0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  68) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  69) BYTE_FLIP_MASK  = %ymm9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  70) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  71) # 1st arg is %rdi, which is saved to the stack and accessed later via %r12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  72) CTX1        = %rdi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  73) CTX2        = %r12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  74) # 2nd arg (%rsi is also bound to y3 below, so INP and y3 share one register)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  75) INP         = %rsi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  76) # 3rd arg (%rdx is also bound to e below, so NUM_BLKS and e share one register)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  77) NUM_BLKS    = %rdx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  78) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  79) c           = %rcx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  80) d           = %r8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  81) e           = %rdx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  82) y3          = %rsi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  83) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  84) TBL   = %rdi # clobbers CTX1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  85) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  86) a     = %rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  87) b     = %rbx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  88) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  89) f     = %r9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  90) g     = %r10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  91) h     = %r11
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  92) old_h = %r11
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  93) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  94) T1    = %r12 # clobbers CTX2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  95) y0    = %r13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  96) y1    = %r14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  97) y2    = %r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  98) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  99) # Local variables (stack frame): the SIZE symbols are byte counts; each frame_ symbol is a byte offset (used as a displacement off %rsp) formed by summing the sizes before it
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 100) XFER_SIZE = 4*8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 101) SRND_SIZE = 1*8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 102) INP_SIZE = 1*8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 103) INPEND_SIZE = 1*8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 104) CTX_SIZE = 1*8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 105) RSPSAVE_SIZE = 1*8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 106) GPRSAVE_SIZE = 5*8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 107) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 108) frame_XFER = 0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 109) frame_SRND = frame_XFER + XFER_SIZE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 110) frame_INP = frame_SRND + SRND_SIZE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 111) frame_INPEND = frame_INP + INP_SIZE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 112) frame_CTX = frame_INPEND + INPEND_SIZE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 113) frame_RSPSAVE = frame_CTX + CTX_SIZE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 114) frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 115) frame_size = frame_GPRSAVE + GPRSAVE_SIZE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 116) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 117) ## assume buffers not aligned, so vector loads go through the unaligned-move mnemonic selected on the next line
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 118) #define	VMOVDQ vmovdqu
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 119) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 120) # addm [mem], reg
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 121) # Add mem into reg, then store reg back to mem: both operands end up holding the sum (reg is modified too)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 122) .macro addm p1 p2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 123) 	add	\p1, \p2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 124) 	mov	\p2, \p1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 125) .endm
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 126) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 127) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 128) # COPY_YMM_AND_BSWAP ymm, [mem], byte_flip_mask
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 129) # Load ymm from mem, then shuffle its bytes in place with vpshufb under byte_flip_mask. NOTE(review): this comment used to say each dword is swapped; the swap width is set by the mask, which is defined outside this view - confirm
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 130) .macro COPY_YMM_AND_BSWAP p1 p2 p3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 131) 	VMOVDQ \p2, \p1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 132) 	vpshufb \p3, \p1, \p1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 133) .endm
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 134) # rotate_Ys
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 135) # Rotate the bindings of symbols Y_0...Y_3 by one place: Y_0 takes the old Y_1, Y_1 the old Y_2, Y_2 the old Y_3, Y_3 the old Y_0 (symbol assignments only, no instructions emitted)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 136) .macro rotate_Ys
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 137) 	Y_ = Y_0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 138) 	Y_0 = Y_1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 139) 	Y_1 = Y_2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 140) 	Y_2 = Y_3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 141) 	Y_3 = Y_
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 142) .endm
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 143) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 144) # RotateState: re-bind the working-variable symbols a..h after a round (symbol assignments only, no instructions emitted)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 145) .macro RotateState
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 146) 	# Rotate symbols a..h right: a takes the old h, b the old a, ..., h the old g; old_h records the register h was bound to before the rotation
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 147) 	old_h  = h
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 148) 	TMP_   = h
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 149) 	h      = g
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 150) 	g      = f
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 151) 	f      = e
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 152) 	e      = d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 153) 	d      = c
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 154) 	c      = b
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 155) 	b      = a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 156) 	a      = TMP_
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 157) .endm
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 158) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 159) # macro MY_VPALIGNR	YDST, YSRC1, YSRC2, RVAL
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 160) # YDST = {YSRC1, YSRC2} >> RVAL*8, i.e. a right shift by RVAL bytes; YDST is written before YSRC2 is last read, so YSRC2 must not be the same register as YDST
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 161) .macro MY_VPALIGNR YDST YSRC1 YSRC2 RVAL
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 162) 	vperm2f128      $0x3, \YSRC2, \YSRC1, \YDST     # YDST = {YS1_LO, YS2_HI}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 163) 	vpalignr        $\RVAL, \YSRC2, \YDST, \YDST    # YDST = {YDST, YS2} >> RVAL*8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 164) .endm
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 165) # FOUR_ROUNDS_AND_SCHED: run four scalar rounds, each adding one qword of frame_XFER(%rsp) into h, interleaved with the vector message schedule that overwrites Y_0 with the next four W values; a..h are re-bound after every round and Y_0..Y_3 once at the end
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 166) .macro FOUR_ROUNDS_AND_SCHED
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 167) ################################### RND N + 0 #########################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 168) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 169) 	# Extract w[t-7]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 170) 	MY_VPALIGNR	YTMP0, Y_3, Y_2, 8		# YTMP0 = W[-7]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 171) 	# Calculate w[t-16] + w[t-7]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 172) 	vpaddq		Y_0, YTMP0, YTMP0		# YTMP0 = W[-7] + W[-16]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 173) 	# Extract w[t-15]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 174) 	MY_VPALIGNR	YTMP1, Y_1, Y_0, 8		# YTMP1 = W[-15]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 175) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 176) 	# Calculate sigma0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 177) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 178) 	# Calculate w[t-15] ror 1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 179) 	vpsrlq		$1, YTMP1, YTMP2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 180) 	vpsllq		$(64-1), YTMP1, YTMP3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 181) 	vpor		YTMP2, YTMP3, YTMP3		# YTMP3 = W[-15] ror 1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 182) 	# Calculate w[t-15] shr 7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 183) 	vpsrlq		$7, YTMP1, YTMP4		# YTMP4 = W[-15] >> 7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 184) # Scalar round follows. In the scalar comments of all four rounds, >> applied to e or a stands for rotate right (the instruction is rorx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 185) 	mov	a, y3		# y3 = a                                # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 186) 	rorx	$41, e, y0	# y0 = e >> 41				# S1A
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 187) 	rorx	$18, e, y1	# y1 = e >> 18				# S1B
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 188) 	add	frame_XFER(%rsp),h		# h = k + w + h         # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 189) 	or	c, y3		# y3 = a|c                              # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 190) 	mov	f, y2		# y2 = f                                # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 191) 	rorx	$34, a, T1	# T1 = a >> 34				# S0B
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 192) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 193) 	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 194) 	xor	g, y2		# y2 = f^g                              # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 195) 	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 196) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 197) 	and	e, y2		# y2 = (f^g)&e                          # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 198) 	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 199) 	rorx	$39, a, y1	# y1 = a >> 39				# S0A
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 200) 	add	h, d		# d = k + w + h + d                     # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 201) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 202) 	and	b, y3		# y3 = (a|c)&b                          # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 203) 	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 204) 	rorx	$28, a, T1	# T1 = (a >> 28)			# S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 205) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 206) 	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 207) 	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 208) 	mov	a, T1		# T1 = a                                # MAJB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 209) 	and	c, T1		# T1 = a&c                              # MAJB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 210) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 211) 	add	y0, y2		# y2 = S1 + CH                          # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 212) 	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 213) 	add	y1, h		# h = k + w + h + S0                    # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 214) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 215) 	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 216) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 217) 	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 218) 	add	y3, h		# h = t1 + S0 + MAJ                     # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 219) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 220) 	RotateState
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 221) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 222) ################################### RND N + 1 #########################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 223) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 224) 	# Calculate w[t-15] ror 8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 225) 	vpsrlq		$8, YTMP1, YTMP2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 226) 	vpsllq		$(64-8), YTMP1, YTMP1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 227) 	vpor		YTMP2, YTMP1, YTMP1		# YTMP1 = W[-15] ror 8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 228) 	# XOR the three components
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 229) 	vpxor		YTMP4, YTMP3, YTMP3		# YTMP3 = W[-15] ror 1 ^ W[-15] >> 7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 230) 	vpxor		YTMP1, YTMP3, YTMP1		# YTMP1 = s0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 231) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 232) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 233) 	# Add three components, w[t-16], w[t-7] and sigma0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 234) 	vpaddq		YTMP1, YTMP0, YTMP0		# YTMP0 = W[-16] + W[-7] + s0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 235) 	# Move to appropriate lanes for calculating w[16] and w[17]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 236) 	vperm2f128	$0x0, YTMP0, YTMP0, Y_0		# Y_0 = W[-16] + W[-7] + s0 {BABA}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 237) 	# Move to appropriate lanes for calculating w[18] and w[19]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 238) 	vpand		MASK_YMM_LO(%rip), YTMP0, YTMP0	# YTMP0 = W[-16] + W[-7] + s0 {DC00}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 239) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 240) 	# Calculate w[16] and w[17] in both 128 bit lanes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 241) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 242) 	# Calculate sigma1 for w[16] and w[17] on both 128 bit lanes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 243) 	vperm2f128	$0x11, Y_3, Y_3, YTMP2		# YTMP2 = W[-2] {BABA}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 244) 	vpsrlq		$6, YTMP2, YTMP4		# YTMP4 = W[-2] >> 6 {BABA}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 245) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 246) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 247) 	mov	a, y3		# y3 = a                                # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 248) 	rorx	$41, e, y0	# y0 = e >> 41				# S1A
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 249) 	rorx	$18, e, y1	# y1 = e >> 18				# S1B
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 250) 	add	1*8+frame_XFER(%rsp), h		# h = k + w + h         # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 251) 	or	c, y3		# y3 = a|c                              # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 252) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 253) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 254) 	mov	f, y2		# y2 = f                                # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 255) 	rorx	$34, a, T1	# T1 = a >> 34				# S0B
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 256) 	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 257) 	xor	g, y2		# y2 = f^g                              # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 258) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 259) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 260) 	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 261) 	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 262) 	rorx	$39, a, y1	# y1 = a >> 39				# S0A
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 263) 	and	e, y2		# y2 = (f^g)&e                          # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 264) 	add	h, d		# d = k + w + h + d                     # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 265) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 266) 	and	b, y3		# y3 = (a|c)&b                          # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 267) 	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 268) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 269) 	rorx	$28, a, T1	# T1 = (a >> 28)			# S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 270) 	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 271) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 272) 	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 273) 	mov	a, T1		# T1 = a                                # MAJB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 274) 	and	c, T1		# T1 = a&c                              # MAJB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 275) 	add	y0, y2		# y2 = S1 + CH                          # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 276) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 277) 	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 278) 	add	y1, h		# h = k + w + h + S0                    # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 279) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 280) 	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 281) 	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 282) 	add	y3, h		# h = t1 + S0 + MAJ                     # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 283) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 284) 	RotateState
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 285) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 286) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 287) ################################### RND N + 2 #########################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 288) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 289) 	vpsrlq		$19, YTMP2, YTMP3		# YTMP3 = W[-2] >> 19 {BABA}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 290) 	vpsllq		$(64-19), YTMP2, YTMP1		# YTMP1 = W[-2] << (64-19) {BABA}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 291) 	vpor		YTMP1, YTMP3, YTMP3		# YTMP3 = W[-2] ror 19 {BABA}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 292) 	vpxor		YTMP3, YTMP4, YTMP4		# YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 293) 	vpsrlq		$61, YTMP2, YTMP3		# YTMP3 = W[-2] >> 61 {BABA}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 294) 	vpsllq		$(64-61), YTMP2, YTMP1		# YTMP1 = W[-2] << (64-61) {BABA}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 295) 	vpor		YTMP1, YTMP3, YTMP3		# YTMP3 = W[-2] ror 61 {BABA}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 296) 	vpxor		YTMP3, YTMP4, YTMP4		# YTMP4 = s1 = (W[-2] ror 19) ^
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 297) 							#  (W[-2] ror 61) ^ (W[-2] >> 6) {BABA}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 298) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 299) 	# Add sigma1 to the other components to get w[16] and w[17]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 300) 	vpaddq		YTMP4, Y_0, Y_0			# Y_0 = {W[1], W[0], W[1], W[0]}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 301) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 302) 	# Calculate sigma1 for w[18] and w[19] for upper 128 bit lane
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 303) 	vpsrlq		$6, Y_0, YTMP4			# YTMP4 = W[-2] >> 6 {DC--}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 304) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 305) 	mov	a, y3		# y3 = a                                # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 306) 	rorx	$41, e, y0	# y0 = e >> 41				# S1A
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 307) 	add	2*8+frame_XFER(%rsp), h		# h = k + w + h         # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 308) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 309) 	rorx	$18, e, y1	# y1 = e >> 18				# S1B
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 310) 	or	c, y3		# y3 = a|c                              # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 311) 	mov	f, y2		# y2 = f                                # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 312) 	xor	g, y2		# y2 = f^g                              # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 313) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 314) 	rorx	$34, a, T1	# T1 = a >> 34				# S0B
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 315) 	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 316) 	and	e, y2		# y2 = (f^g)&e                          # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 317) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 318) 	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 319) 	add	h, d		# d = k + w + h + d                     # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 320) 	and	b, y3		# y3 = (a|c)&b                          # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 321) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 322) 	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 323) 	rorx	$39, a, y1	# y1 = a >> 39				# S0A
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 324) 	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 325) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 326) 	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 327) 	rorx	$28, a, T1	# T1 = (a >> 28)			# S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 328) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 329) 	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 330) 	mov	a, T1		# T1 = a                                # MAJB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 331) 	and	c, T1		# T1 = a&c                              # MAJB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 332) 	add	y0, y2		# y2 = S1 + CH                          # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 333) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 334) 	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 335) 	add	y1, h		# h = k + w + h + S0                    # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 336) 	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 337) 	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 338) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 339) 	add	y3, h		# h = t1 + S0 + MAJ                     # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 340) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 341) 	RotateState
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 342) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 343) ################################### RND N + 3 #########################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 344) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 345) 	vpsrlq		$19, Y_0, YTMP3			# YTMP3 = W[-2] >> 19 {DC--}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 346) 	vpsllq		$(64-19), Y_0, YTMP1		# YTMP1 = W[-2] << (64-19) {DC--}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 347) 	vpor		YTMP1, YTMP3, YTMP3		# YTMP3 = W[-2] ror 19 {DC--}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 348) 	vpxor		YTMP3, YTMP4, YTMP4		# YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 349) 	vpsrlq		$61, Y_0, YTMP3			# YTMP3 = W[-2] >> 61 {DC--}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 350) 	vpsllq		$(64-61), Y_0, YTMP1		# YTMP1 = W[-2] << (64-61) {DC--}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 351) 	vpor		YTMP1, YTMP3, YTMP3		# YTMP3 = W[-2] ror 61 {DC--}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 352) 	vpxor		YTMP3, YTMP4, YTMP4		# YTMP4 = s1 = (W[-2] ror 19) ^
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 353) 							#  (W[-2] ror 61) ^ (W[-2] >> 6) {DC--}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 354) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 355) 	# Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 356) 	# to newly calculated sigma1 to get w[18] and w[19]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 357) 	vpaddq		YTMP4, YTMP0, YTMP2		# YTMP2 = {W[3], W[2], --, --}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 358) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 359) 	# Form w[19], w[18], w[17], w[16]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 360) 	vpblendd		$0xF0, YTMP2, Y_0, Y_0		# Y_0 = {W[3], W[2], W[1], W[0]}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 361) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 362) 	mov	a, y3		# y3 = a                                # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 363) 	rorx	$41, e, y0	# y0 = e >> 41				# S1A
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 364) 	rorx	$18, e, y1	# y1 = e >> 18				# S1B
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 365) 	add	3*8+frame_XFER(%rsp), h		# h = k + w + h         # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 366) 	or	c, y3		# y3 = a|c                              # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 367) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 368) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 369) 	mov	f, y2		# y2 = f                                # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 370) 	rorx	$34, a, T1	# T1 = a >> 34				# S0B
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 371) 	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 372) 	xor	g, y2		# y2 = f^g                              # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 373) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 374) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 375) 	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 376) 	and	e, y2		# y2 = (f^g)&e                          # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 377) 	add	h, d		# d = k + w + h + d                     # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 378) 	and	b, y3		# y3 = (a|c)&b                          # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 379) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 380) 	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 381) 	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 382) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 383) 	rorx	$39, a, y1	# y1 = a >> 39				# S0A
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 384) 	add	y0, y2		# y2 = S1 + CH                          # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 385) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 386) 	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 387) 	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 388) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 389) 	rorx	$28, a, T1	# T1 = (a >> 28)			# S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 390) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 391) 	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 392) 	mov	a, T1		# T1 = a                                # MAJB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 393) 	and	c, T1		# T1 = a&c                              # MAJB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 394) 	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 395) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 396) 	add	y1, h		# h = k + w + h + S0                    # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 397) 	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 398) 	add	y3, h		# h = t1 + S0 + MAJ                     # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 399) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 400) 	RotateState
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 401) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 402) 	rotate_Ys
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 403) .endm
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 404) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 405) .macro DO_4ROUNDS
# DO_4ROUNDS: four consecutive SHA-512 compression rounds without message
# scheduling. The per-round k + w sums are read from the four qwords at
# frame_XFER(%rsp), which the caller stores right before invoking the macro.
#
# Notation used in the per-line comments below:
#   - "x >> n" on a rorx line denotes a rotate right by n (rorx rotates).
#   - "t1" is the sum k + w + h + S1 + CH.
#
# Scheduling detail: for rounds N+0 .. N+2 the last two additions into h
# (S1 + CH, then MAJ) are deferred past RotateState and issued early in the
# following round through old_h. Round N+3 has no successor inside the macro,
# so it completes h before its final RotateState.
#
# Scratch names written: y0, y1, y2, y3, T1. These, a..h and old_h are aliases
# defined earlier in the file, outside this excerpt. old_h presumably names
# the register that held h before RotateState - TODO confirm.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 406) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 407) ################################### RND N + 0 #########################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 408) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 409) 	mov	f, y2		# y2 = f                                # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 410) 	rorx	$41, e, y0	# y0 = e >> 41				# S1A
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 411) 	rorx	$18, e, y1	# y1 = e >> 18				# S1B
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 412) 	xor	g, y2		# y2 = f^g                              # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 413) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 414) 	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 415) 	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 416) 	and	e, y2		# y2 = (f^g)&e                          # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 417) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 418) 	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 419) 	rorx	$34, a, T1	# T1 = a >> 34				# S0B
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 420) 	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 421) 	rorx	$39, a, y1	# y1 = a >> 39				# S0A
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 422) 	mov	a, y3		# y3 = a                                # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 423) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 424) 	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 425) 	rorx	$28, a, T1	# T1 = (a >> 28)			# S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 426) 	add	frame_XFER(%rsp), h		# h = k + w + h         # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 427) 	or	c, y3		# y3 = a|c                              # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 428) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 429) 	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 430) 	mov	a, T1		# T1 = a                                # MAJB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 431) 	and	b, y3		# y3 = (a|c)&b                          # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 432) 	and	c, T1		# T1 = a&c                              # MAJB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 433) 	add	y0, y2		# y2 = S1 + CH                          # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 434) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 435) 	add	h, d		# d = k + w + h + d                     # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 436) 	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 437) 	add	y1, h		# h = k + w + h + S0                    # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 438) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 439) 	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 440) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 441) 	RotateState
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 442) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 443) ################################### RND N + 1 #########################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 444) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 445) 	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 446) 	mov	f, y2		# y2 = f                                # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 447) 	rorx	$41, e, y0	# y0 = e >> 41				# S1A
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 448) 	rorx	$18, e, y1	# y1 = e >> 18				# S1B
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 449) 	xor	g, y2		# y2 = f^g                              # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 450) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 451) 	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 452) 	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 453) 	and	e, y2		# y2 = (f^g)&e                          # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 454) 	add	y3, old_h	# h = t1 + S0 + MAJ                     # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 455) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 456) 	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 457) 	rorx	$34, a, T1	# T1 = a >> 34				# S0B
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 458) 	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 459) 	rorx	$39, a, y1	# y1 = a >> 39				# S0A
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 460) 	mov	a, y3		# y3 = a                                # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 461) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 462) 	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 463) 	rorx	$28, a, T1	# T1 = (a >> 28)			# S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 464) 	add	8*1+frame_XFER(%rsp), h		# h = k + w + h         # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 465) 	or	c, y3		# y3 = a|c                              # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 466) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 467) 	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 468) 	mov	a, T1		# T1 = a                                # MAJB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 469) 	and	b, y3		# y3 = (a|c)&b                          # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 470) 	and	c, T1		# T1 = a&c                              # MAJB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 471) 	add	y0, y2		# y2 = S1 + CH                          # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 472) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 473) 	add	h, d		# d = k + w + h + d                     # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 474) 	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 475) 	add	y1, h		# h = k + w + h + S0                    # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 476) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 477) 	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 478) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 479) 	RotateState
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 480) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 481) ################################### RND N + 2 #########################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 482) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 483) 	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 484) 	mov	f, y2		# y2 = f                                # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 485) 	rorx	$41, e, y0	# y0 = e >> 41				# S1A
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 486) 	rorx	$18, e, y1	# y1 = e >> 18				# S1B
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 487) 	xor	g, y2		# y2 = f^g                              # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 488) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 489) 	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 490) 	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 491) 	and	e, y2		# y2 = (f^g)&e                          # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 492) 	add	y3, old_h	# h = t1 + S0 + MAJ                     # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 493) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 494) 	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 495) 	rorx	$34, a, T1	# T1 = a >> 34				# S0B
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 496) 	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 497) 	rorx	$39, a, y1	# y1 = a >> 39				# S0A
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 498) 	mov	a, y3		# y3 = a                                # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 499) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 500) 	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 501) 	rorx	$28, a, T1	# T1 = (a >> 28)			# S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 502) 	add	8*2+frame_XFER(%rsp), h		# h = k + w + h         # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 503) 	or	c, y3		# y3 = a|c                              # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 504) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 505) 	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 506) 	mov	a, T1		# T1 = a                                # MAJB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 507) 	and	b, y3		# y3 = (a|c)&b                          # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 508) 	and	c, T1		# T1 = a&c                              # MAJB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 509) 	add	y0, y2		# y2 = S1 + CH                          # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 510) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 511) 	add	h, d		# d = k + w + h + d                     # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 512) 	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 513) 	add	y1, h		# h = k + w + h + S0                    # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 514) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 515) 	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 516) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 517) 	RotateState
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 518) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 519) ################################### RND N + 3 #########################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 520) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 521) 	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 522) 	mov	f, y2		# y2 = f                                # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 523) 	rorx	$41, e, y0	# y0 = e >> 41				# S1A
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 524) 	rorx	$18, e, y1	# y1 = e >> 18				# S1B
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 525) 	xor	g, y2		# y2 = f^g                              # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 526) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 527) 	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 528) 	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 529) 	and	e, y2		# y2 = (f^g)&e                          # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 530) 	add	y3, old_h	# h = t1 + S0 + MAJ                     # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 531) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 532) 	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 533) 	rorx	$34, a, T1	# T1 = a >> 34				# S0B
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 534) 	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 535) 	rorx	$39, a, y1	# y1 = a >> 39				# S0A
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 536) 	mov	a, y3		# y3 = a                                # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 537) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 538) 	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 539) 	rorx	$28, a, T1	# T1 = (a >> 28)			# S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 540) 	add	8*3+frame_XFER(%rsp), h		# h = k + w + h         # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 541) 	or	c, y3		# y3 = a|c                              # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 542) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 543) 	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 544) 	mov	a, T1		# T1 = a                                # MAJB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 545) 	and	b, y3		# y3 = (a|c)&b                          # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 546) 	and	c, T1		# T1 = a&c                              # MAJB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 547) 	add	y0, y2		# y2 = S1 + CH                          # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 548) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 549) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 550) 	add	h, d		# d = k + w + h + d                     # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 551) 	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 552) 	add	y1, h		# h = k + w + h + S0                    # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 553) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 554) 	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 555) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 556) 	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 557) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 558) 	add	y3, h		# h = t1 + S0 + MAJ                     # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 559) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 560) 	RotateState
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 561) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 562) .endm
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 563) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 564) ########################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 565) # void sha512_transform_rorx(sha512_state *state, const u8 *data, int blocks)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 566) # Purpose: Updates the SHA512 digest stored at "state" with the message
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 567) # stored in "data".
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 568) # The size of the message pointed to by "data" must be an integer multiple
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 569) # of SHA512 message blocks.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 570) # "blocks" is the message length in 128-byte SHA512 blocks (0 is a no-op).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 571) ########################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 572) SYM_FUNC_START(sha512_transform_rorx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 573) 	# Allocate a 32-byte aligned stack frame and keep the entry %rsp in it
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 574) 	mov	%rsp, %rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 575) 	sub	$frame_size, %rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 576) 	and	$~(0x20 - 1), %rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 577) 	mov	%rax, frame_RSPSAVE(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 578) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 579) 	# Save callee-saved GPRs (restored at .Ldone_hash)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 580) 	mov	%rbx, 8*0+frame_GPRSAVE(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 581) 	mov	%r12, 8*1+frame_GPRSAVE(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 582) 	mov	%r13, 8*2+frame_GPRSAVE(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 583) 	mov	%r14, 8*3+frame_GPRSAVE(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 584) 	mov	%r15, 8*4+frame_GPRSAVE(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 585) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 586) 	shl	$7, NUM_BLKS	# convert to bytes (128 per block)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 587) 	jz	.Ldone_hash	# zero blocks: nothing to hash
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 588) 	add	INP, NUM_BLKS	# pointer to end of data
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 589) 	mov	NUM_BLKS, frame_INPEND(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 590) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 591) 	## load initial digest
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 592) 	mov	8*0(CTX1), a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 593) 	mov	8*1(CTX1), b
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 594) 	mov	8*2(CTX1), c
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 595) 	mov	8*3(CTX1), d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 596) 	mov	8*4(CTX1), e
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 597) 	mov	8*5(CTX1), f
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 598) 	mov	8*6(CTX1), g
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 599) 	mov	8*7(CTX1), h
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 600) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 601) 	# save %rdi (CTX) before it gets clobbered
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 602) 	mov	%rdi, frame_CTX(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 603) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 604) 	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 605) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 606) .Lloop0:	# one iteration per 128-byte message block
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 607) 	lea	K512(%rip), TBL
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 608) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 609) 	## load the block (16 qwords) into Y_0..Y_3, byte-swapping each qword
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 610) 	COPY_YMM_AND_BSWAP	Y_0, (INP), BYTE_FLIP_MASK
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 611) 	COPY_YMM_AND_BSWAP	Y_1, 1*32(INP), BYTE_FLIP_MASK
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 612) 	COPY_YMM_AND_BSWAP	Y_2, 2*32(INP), BYTE_FLIP_MASK
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 613) 	COPY_YMM_AND_BSWAP	Y_3, 3*32(INP), BYTE_FLIP_MASK
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 614) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 615) 	mov	INP, frame_INP(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 616) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 617) 	## rounds 0-63: 4 iterations of 16 rounds, each with message scheduling
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 618) 	movq	$4, frame_SRND(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 619) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 620) .align 16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 621) .Lloop1:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 622) 	vpaddq	(TBL), Y_0, XFER
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 623) 	vmovdqa XFER, frame_XFER(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 624) 	FOUR_ROUNDS_AND_SCHED
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 625) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 626) 	vpaddq	1*32(TBL), Y_0, XFER
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 627) 	vmovdqa XFER, frame_XFER(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 628) 	FOUR_ROUNDS_AND_SCHED
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 629) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 630) 	vpaddq	2*32(TBL), Y_0, XFER
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 631) 	vmovdqa XFER, frame_XFER(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 632) 	FOUR_ROUNDS_AND_SCHED
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 633) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 634) 	vpaddq	3*32(TBL), Y_0, XFER
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 635) 	vmovdqa XFER, frame_XFER(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 636) 	add	$(4*32), TBL
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 637) 	FOUR_ROUNDS_AND_SCHED
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 638) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 639) 	subq	$1, frame_SRND(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 640) 	jne	.Lloop1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 641) 	## rounds 64-79: 2 iterations of 8 rounds, no further scheduling
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 642) 	movq	$2, frame_SRND(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 643) .Lloop2:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 644) 	vpaddq	(TBL), Y_0, XFER
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 645) 	vmovdqa XFER, frame_XFER(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 646) 	DO_4ROUNDS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 647) 	vpaddq	1*32(TBL), Y_1, XFER
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 648) 	vmovdqa XFER, frame_XFER(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 649) 	add	$(2*32), TBL
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 650) 	DO_4ROUNDS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 651) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 652) 	vmovdqa	Y_2, Y_0	# the second pass consumes Y_2 and Y_3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 653) 	vmovdqa	Y_3, Y_1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 654) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 655) 	subq	$1, frame_SRND(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 656) 	jne	.Lloop2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 657) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 658) 	mov	frame_CTX(%rsp), CTX2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 659) 	addm	8*0(CTX2), a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 660) 	addm	8*1(CTX2), b
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 661) 	addm	8*2(CTX2), c
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 662) 	addm	8*3(CTX2), d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 663) 	addm	8*4(CTX2), e
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 664) 	addm	8*5(CTX2), f
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 665) 	addm	8*6(CTX2), g
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 666) 	addm	8*7(CTX2), h
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 667) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 668) 	mov	frame_INP(%rsp), INP
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 669) 	add	$128, INP
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 670) 	cmp	frame_INPEND(%rsp), INP
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 671) 	jne	.Lloop0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 672) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 673) .Ldone_hash:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 674) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 675) # Restore GPRs
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 676) 	mov	8*0+frame_GPRSAVE(%rsp), %rbx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 677) 	mov	8*1+frame_GPRSAVE(%rsp), %r12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 678) 	mov	8*2+frame_GPRSAVE(%rsp), %r13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 679) 	mov	8*3+frame_GPRSAVE(%rsp), %r14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 680) 	mov	8*4+frame_GPRSAVE(%rsp), %r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 681) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 682) 	# Restore Stack Pointer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 683) 	mov	frame_RSPSAVE(%rsp), %rsp
	vzeroupper		# ymm registers were used: avoid the AVX-SSE transition penalty in later SSE code
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 684) 	ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 685) SYM_FUNC_END(sha512_transform_rorx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 686) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 687) ########################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 688) ### Binary Data
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 689) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 690) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 691) # Mergeable 640-byte rodata section. This allows the linker to merge the table
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 692) # with another, byte-identical 640-byte fragment of another rodata section
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 693) # (if such a section exists).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 694) .section	.rodata.cst640.K512, "aM", @progbits, 640
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 695) .align 64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 696) # K[t] used in SHA512 hashing: 80 qword round constants (80 * 8 = 640 bytes)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 697) K512:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 698) 	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 699) 	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 700) 	.quad	0x3956c25bf348b538,0x59f111f1b605d019
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 701) 	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 702) 	.quad	0xd807aa98a3030242,0x12835b0145706fbe
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 703) 	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 704) 	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 705) 	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 706) 	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 707) 	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 708) 	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 709) 	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 710) 	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 711) 	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 712) 	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 713) 	.quad	0x06ca6351e003826f,0x142929670a0e6e70
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 714) 	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 715) 	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 716) 	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 717) 	.quad	0x81c2c92e47edaee6,0x92722c851482353b
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 718) 	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 719) 	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 720) 	.quad	0xd192e819d6ef5218,0xd69906245565a910
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 721) 	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 722) 	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 723) 	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 724) 	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 725) 	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 726) 	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 727) 	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 728) 	.quad	0x90befffa23631e28,0xa4506cebde82bde9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 729) 	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 730) 	.quad	0xca273eceea26619c,0xd186b8c721c0c207
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 731) 	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 732) 	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 733) 	.quad	0x113f9804bef90dae,0x1b710b35131c471b
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 734) 	.quad	0x28db77f523047d84,0x32caab7b40c72493
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 735) 	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 736) 	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 737) 	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 738) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 739) .section	.rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 740) .align 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 741) # 32-byte (v)pshufb mask that reverses the bytes of each qword, two qwords per 16-byte lane.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 742) PSHUFFLE_BYTE_FLIP_MASK:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 743) 	.octa 0x08090a0b0c0d0e0f0001020304050607	# low 16-byte lane
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 744) 	.octa 0x18191a1b1c1d1e1f1011121314151617	# high 16-byte lane (vpshufb uses only the low 4 index bits within a lane)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 745) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 746) .section	.rodata.cst32.MASK_YMM_LO, "aM", @progbits, 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 747) .align 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 748) MASK_YMM_LO:	# 32-byte mask, not referenced in this excerpt - presumably ANDed to clear the low 128 bits of a ymm value, TODO confirm
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 749) 	.octa 0x00000000000000000000000000000000	# bytes 0-15: all zero
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 750) 	.octa 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF	# bytes 16-31: all ones