Orange Pi5 kernel

Deprecated Linux kernel 5.10.110 for OrangePi 5/5B/5+ boards

3 Commits   0 Branches   0 Tags
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   1) ########################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   2) # Implement fast SHA-256 with SSSE3 instructions. (x86_64)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   3) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   4) # Copyright (C) 2013 Intel Corporation.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   5) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   6) # Authors:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   7) #     James Guilford <james.guilford@intel.com>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   8) #     Kirk Yap <kirk.s.yap@intel.com>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   9) #     Tim Chen <tim.c.chen@linux.intel.com>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  10) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  11) # This software is available to you under a choice of one of two
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  12) # licenses.  You may choose to be licensed under the terms of the GNU
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  13) # General Public License (GPL) Version 2, available from the file
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  14) # COPYING in the main directory of this source tree, or the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  15) # OpenIB.org BSD license below:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  16) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  17) #     Redistribution and use in source and binary forms, with or
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  18) #     without modification, are permitted provided that the following
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  19) #     conditions are met:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  20) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  21) #      - Redistributions of source code must retain the above
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  22) #        copyright notice, this list of conditions and the following
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  23) #        disclaimer.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  24) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  25) #      - Redistributions in binary form must reproduce the above
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  26) #        copyright notice, this list of conditions and the following
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  27) #        disclaimer in the documentation and/or other materials
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  28) #        provided with the distribution.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  29) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  30) # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  31) # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  32) # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  33) # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  34) # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  35) # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  36) # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  37) # SOFTWARE.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  38) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  39) ########################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  40) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  41) # This code is described in an Intel White-Paper:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  42) # "Fast SHA-256 Implementations on Intel Architecture Processors"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  43) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  44) # To find it, surf to http://www.intel.com/p/en_US/embedded
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  45) # and search for that title.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  46) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  47) ########################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  48) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  49) #include <linux/linkage.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  50) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  51) ## assume buffers not aligned
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  52) #define    MOVDQ movdqu
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  53) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  54) ################################ Define Macros
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  55) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  56) # addm [mem], reg
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  57) # reg += [mem], then store the sum back to [mem]; reg keeps the sum
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  58) .macro addm p1 p2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  59)         add     \p1, \p2                # p2 (reg) += p1 (mem)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  60)         mov     \p2, \p1                # p1 (mem) = updated reg
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  61) .endm
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  62) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  63) ################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  64) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  65) # COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  66) # Load xmm with mem (unaligned-safe load) and byte swap each dword via the mask
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  67) .macro COPY_XMM_AND_BSWAP p1 p2 p3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  68)         MOVDQ \p2, \p1                  # p1 (xmm) = 16 bytes at p2 (mem)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  69)         pshufb \p3, \p1                 # reorder the bytes of p1 using mask p3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  70) .endm
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  71) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  72) ################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  73) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  74) X0 = %xmm4              # X0..X3: message schedule dwords, 4 per register;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  75) X1 = %xmm5              # these symbols are re-assigned by rotate_Xs
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  76) X2 = %xmm6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  77) X3 = %xmm7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  78) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  79) XTMP0 = %xmm0           # XTMP0..XTMP4: scratch for FOUR_ROUNDS_AND_SCHED
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  80) XTMP1 = %xmm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  81) XTMP2 = %xmm2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  82) XTMP3 = %xmm3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  83) XTMP4 = %xmm8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  84) XFER = %xmm9            # K + W for 4 rounds, staged to _XFER(%rsp) in loop1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  85) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  86) SHUF_00BA = %xmm10      # shuffle xBxA -> 00BA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  87) SHUF_DC00 = %xmm11      # shuffle xDxC -> DC00
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  88) BYTE_FLIP_MASK = %xmm12 # pshufb mask handed to COPY_XMM_AND_BSWAP
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  89) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  90) NUM_BLKS = %rdx   # 3rd arg; %rdx is reused as e once the end pointer is stored
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  91) INP = %rsi        # 2nd arg; %rsi is reused as SRND once INP is saved on the stack
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  92) CTX = %rdi        # 1st arg
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  93) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  94) SRND = %rsi       # clobbers INP
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  95) c = %ecx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  96) d = %r8d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  97) e = %edx          # low half of the register that carries NUM_BLKS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  98) TBL = %r12        # loaded with the address of K256 in loop0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  99) a = %eax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 100) b = %ebx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 101) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 102) f = %r9d          # a..h are re-assigned by ROTATE_ARGS after every round
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 103) g = %r10d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 104) h = %r11d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 105) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 106) y0 = %r13d        # y0..y2: scalar scratch within one round (S1, S0, CH, MAJ)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 107) y1 = %r14d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 108) y2 = %r15d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 109) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 110) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 111) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 112) _INP_END_SIZE = 8       # one 64-bit pointer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 113) _INP_SIZE = 8           # one 64-bit pointer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 114) _XFER_SIZE = 16         # one xmm register
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 115) _XMM_SAVE_SIZE = 0      # no xmm save area is reserved
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 116) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 117) _INP_END = 0                                    # end-of-data pointer slot
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 118) _INP            = _INP_END  + _INP_END_SIZE     # = 8, saved INP
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 119) _XFER           = _INP      + _INP_SIZE         # = 16, keeps the movdqa slot 16-byte aligned off the aligned rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 120) _XMM_SAVE       = _XFER     + _XFER_SIZE        # = 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 121) STACK_SIZE      = _XMM_SAVE + _XMM_SAVE_SIZE    # = 32 bytes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 122) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 123) # rotate_Xs
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 124) # Rename symbols at assembly time: X0<-X1, X1<-X2, X2<-X3, X3<-old X0 (emits no instructions)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 125) .macro rotate_Xs
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 126) X_ = X0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 127) X0 = X1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 128) X1 = X2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 129) X2 = X3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 130) X3 = X_
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 131) .endm
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 132) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 133) # ROTATE_ARGS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 134) # Rename symbols at assembly time: a<-old h, b<-a, c<-b, ... h<-g (emits no instructions)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 135) .macro ROTATE_ARGS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 136) TMP_ = h
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 137) h = g
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 138) g = f
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 139) f = e
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 140) e = d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 141) d = c
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 142) c = b
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 143) b = a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 144) a = TMP_
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 145) .endm
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 146) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 147) .macro FOUR_ROUNDS_AND_SCHED
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 148) 	## 4 rounds + schedule 4 new W dwords: s0 four at a time, s1 two at a time
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 149) 	## (scalar comments write rotates as >>); compute W[-16] + W[-7] 4 at a time
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 150) 	movdqa  X3, XTMP0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 151) 	mov     e, y0			# y0 = e
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 152) 	ror     $(25-11), y0            # y0 = e >> (25-11)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 153) 	mov     a, y1                   # y1 = a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 154) 	palignr $4, X2, XTMP0           # XTMP0 = W[-7]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 155) 	ror     $(22-13), y1            # y1 = a >> (22-13)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 156) 	xor     e, y0                   # y0 = e ^ (e >> (25-11))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 157) 	mov     f, y2                   # y2 = f
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 158) 	ror     $(11-6), y0             # y0 = (e >> (11-6)) ^ (e >> (25-6))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 159) 	movdqa  X1, XTMP1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 160) 	xor     a, y1                   # y1 = a ^ (a >> (22-13))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 161) 	xor     g, y2                   # y2 = f^g
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 162) 	paddd   X0, XTMP0               # XTMP0 = W[-7] + W[-16]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 163) 	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 164) 	and     e, y2                   # y2 = (f^g)&e
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 165) 	ror     $(13-2), y1             # y1 = (a >> (13-2)) ^ (a >> (22-2))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 166) 	## compute s0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 167) 	palignr $4, X0, XTMP1           # XTMP1 = W[-15]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 168) 	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 169) 	ror     $6, y0                  # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 170) 	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 171) 	movdqa  XTMP1, XTMP2            # XTMP2 = W[-15]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 172) 	ror     $2, y1                  # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 173) 	add     y0, y2                  # y2 = S1 + CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 174) 	add     _XFER(%rsp) , y2        # y2 = k + w + S1 + CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 175) 	movdqa  XTMP1, XTMP3            # XTMP3 = W[-15]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 176) 	mov     a, y0                   # y0 = a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 177) 	add     y2, h                   # h = h + S1 + CH + k + w
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 178) 	mov     a, y2                   # y2 = a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 179) 	pslld   $(32-7), XTMP1          # XTMP1 = W[-15] << (32-7)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 180) 	or      c, y0                   # y0 = a|c
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 181) 	add     h, d                    # d = d + h + S1 + CH + k + w
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 182) 	and     c, y2                   # y2 = a&c
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 183) 	psrld   $7, XTMP2               # XTMP2 = W[-15] >> 7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 184) 	and     b, y0                   # y0 = (a|c)&b
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 185) 	add     y1, h                   # h = h + S1 + CH + k + w + S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 186) 	por     XTMP2, XTMP1            # XTMP1 = W[-15] ror 7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 187) 	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 188) 	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 189) 					#
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 190) 	ROTATE_ARGS                     #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 191) 	movdqa  XTMP3, XTMP2            # XTMP2 = W[-15]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 192) 	mov     e, y0                   # y0 = e
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 193) 	mov     a, y1                   # y1 = a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 194) 	movdqa  XTMP3, XTMP4            # XTMP4 = W[-15]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 195) 	ror     $(25-11), y0            # y0 = e >> (25-11)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 196) 	xor     e, y0                   # y0 = e ^ (e >> (25-11))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 197) 	mov     f, y2                   # y2 = f
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 198) 	ror     $(22-13), y1            # y1 = a >> (22-13)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 199) 	pslld   $(32-18), XTMP3         # XTMP3 = W[-15] << (32-18)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 200) 	xor     a, y1                   # y1 = a ^ (a >> (22-13))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 201) 	ror     $(11-6), y0             # y0 = (e >> (11-6)) ^ (e >> (25-6))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 202) 	xor     g, y2                   # y2 = f^g
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 203) 	psrld   $18, XTMP2              # XTMP2 = W[-15] >> 18
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 204) 	ror     $(13-2), y1             # y1 = (a >> (13-2)) ^ (a >> (22-2))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 205) 	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 206) 	and     e, y2                   # y2 = (f^g)&e
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 207) 	ror     $6, y0                  # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 208) 	pxor    XTMP3, XTMP1            # XTMP1 = W[-15] ror 7 ^ W[-15] << (32-18)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 209) 	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 210) 	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 211) 	psrld   $3, XTMP4               # XTMP4 = W[-15] >> 3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 212) 	add     y0, y2                  # y2 = S1 + CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 213) 	add     (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 214) 	ror     $2, y1                  # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 215) 	pxor    XTMP2, XTMP1            # XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 216) 	mov     a, y0                   # y0 = a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 217) 	add     y2, h                   # h = h + S1 + CH + k + w
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 218) 	mov     a, y2                   # y2 = a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 219) 	pxor    XTMP4, XTMP1            # XTMP1 = s0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 220) 	or      c, y0                   # y0 = a|c
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 221) 	add     h, d                    # d = d + h + S1 + CH + k + w
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 222) 	and     c, y2                   # y2 = a&c
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 223) 	## compute low s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 224) 	pshufd  $0b11111010, X3, XTMP2   # XTMP2 = W[-2] {BBAA}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 225) 	and     b, y0			# y0 = (a|c)&b
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 226) 	add     y1, h                   # h = h + S1 + CH + k + w + S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 227) 	paddd   XTMP1, XTMP0            # XTMP0 = W[-16] + W[-7] + s0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 228) 	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 229) 	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 230) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 231) 	ROTATE_ARGS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 232) 	movdqa  XTMP2, XTMP3            # XTMP3 = W[-2] {BBAA}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 233) 	mov     e, y0                   # y0 = e
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 234) 	mov     a, y1                   # y1 = a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 235) 	ror     $(25-11), y0            # y0 = e >> (25-11)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 236) 	movdqa  XTMP2, XTMP4            # XTMP4 = W[-2] {BBAA}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 237) 	xor     e, y0                   # y0 = e ^ (e >> (25-11))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 238) 	ror     $(22-13), y1            # y1 = a >> (22-13)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 239) 	mov     f, y2                   # y2 = f
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 240) 	xor     a, y1                   # y1 = a ^ (a >> (22-13))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 241) 	ror     $(11-6), y0             # y0 = (e >> (11-6)) ^ (e >> (25-6))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 242) 	psrlq   $17, XTMP2              # XTMP2 = W[-2] ror 17 {xBxA}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 243) 	xor     g, y2                   # y2 = f^g
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 244) 	psrlq   $19, XTMP3              # XTMP3 = W[-2] ror 19 {xBxA}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 245) 	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 246) 	and     e, y2                   # y2 = (f^g)&e
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 247) 	psrld   $10, XTMP4              # XTMP4 = W[-2] >> 10 {BBAA}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 248) 	ror     $(13-2), y1             # y1 = (a >> (13-2)) ^ (a >> (22-2))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 249) 	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 250) 	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 251) 	ror     $6, y0                  # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 252) 	pxor    XTMP3, XTMP2            # XTMP2 = W[-2] ror 17 ^ W[-2] ror 19 {xBxA}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 253) 	add     y0, y2                  # y2 = S1 + CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 254) 	ror     $2, y1                  # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 255) 	add     (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 256) 	pxor    XTMP2, XTMP4            # XTMP4 = s1 {xBxA}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 257) 	mov     a, y0                   # y0 = a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 258) 	add     y2, h                   # h = h + S1 + CH + k + w
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 259) 	mov     a, y2                   # y2 = a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 260) 	pshufb  SHUF_00BA, XTMP4        # XTMP4 = s1 {00BA}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 261) 	or      c, y0                   # y0 = a|c
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 262) 	add     h, d                    # d = d + h + S1 + CH + k + w
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 263) 	and     c, y2                   # y2 = a&c
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 264) 	paddd   XTMP4, XTMP0            # XTMP0 = {..., ..., W[1], W[0]}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 265) 	and     b, y0                   # y0 = (a|c)&b
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 266) 	add     y1, h                   # h = h + S1 + CH + k + w + S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 267) 	## compute high s1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 268) 	pshufd  $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 269) 	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 270) 	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 271) 					#
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 272) 	ROTATE_ARGS                     #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 273) 	movdqa  XTMP2, XTMP3            # XTMP3 = W[-2] {DDCC}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 274) 	mov     e, y0                   # y0 = e
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 275) 	ror     $(25-11), y0            # y0 = e >> (25-11)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 276) 	mov     a, y1                   # y1 = a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 277) 	movdqa  XTMP2, X0               # X0    = W[-2] {DDCC}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 278) 	ror     $(22-13), y1            # y1 = a >> (22-13)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 279) 	xor     e, y0                   # y0 = e ^ (e >> (25-11))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 280) 	mov     f, y2                   # y2 = f
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 281) 	ror     $(11-6), y0             # y0 = (e >> (11-6)) ^ (e >> (25-6))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 282) 	psrlq   $17, XTMP2              # XTMP2 = W[-2] ror 17 {xDxC}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 283) 	xor     a, y1                   # y1 = a ^ (a >> (22-13))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 284) 	xor     g, y2                   # y2 = f^g
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 285) 	psrlq   $19, XTMP3              # XTMP3 = W[-2] ror 19 {xDxC}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 286) 	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 287) 	and     e, y2                   # y2 = (f^g)&e
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 288) 	ror     $(13-2), y1             # y1 = (a >> (13-2)) ^ (a >> (22-2))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 289) 	psrld   $10, X0                 # X0 = W[-2] >> 10 {DDCC}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 290) 	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 291) 	ror     $6, y0                  # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 292) 	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 293) 	pxor    XTMP3, XTMP2            # XTMP2 = W[-2] ror 17 ^ W[-2] ror 19 {xDxC}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 294) 	ror     $2, y1                  # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 295) 	add     y0, y2                  # y2 = S1 + CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 296) 	add     (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 297) 	pxor    XTMP2, X0               # X0 = s1 {xDxC}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 298) 	mov     a, y0                   # y0 = a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 299) 	add     y2, h                   # h = h + S1 + CH + k + w
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 300) 	mov     a, y2                   # y2 = a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 301) 	pshufb  SHUF_DC00, X0           # X0 = s1 {DC00}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 302) 	or      c, y0                   # y0 = a|c
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 303) 	add     h, d                    # d = d + h + S1 + CH + k + w
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 304) 	and     c, y2                   # y2 = a&c
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 305) 	paddd   XTMP0, X0               # X0 = {W[3], W[2], W[1], W[0]}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 306) 	and     b, y0                   # y0 = (a|c)&b
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 307) 	add     y1, h                   # h = h + S1 + CH + k + w + S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 308) 	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 309) 	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 310) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 311) 	ROTATE_ARGS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 312) 	rotate_Xs
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 313) .endm
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 314) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 315) ## one round without scheduling; input is [rsp + _XFER + round * 4]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 316) .macro DO_ROUND round
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 317) 	mov     e, y0                 # y0 = e
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 318) 	ror     $(25-11), y0          # y0 = e >> (25-11)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 319) 	mov     a, y1                 # y1 = a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 320) 	xor     e, y0                 # y0 = e ^ (e >> (25-11))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 321) 	ror     $(22-13), y1          # y1 = a >> (22-13)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 322) 	mov     f, y2                 # y2 = f
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 323) 	xor     a, y1                 # y1 = a ^ (a >> (22-13))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 324) 	ror     $(11-6), y0           # y0 = (e >> (11-6)) ^ (e >> (25-6))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 325) 	xor     g, y2                 # y2 = f^g
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 326) 	xor     e, y0                 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 327) 	ror     $(13-2), y1           # y1 = (a >> (13-2)) ^ (a >> (22-2))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 328) 	and     e, y2                 # y2 = (f^g)&e
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 329) 	xor     a, y1                 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 330) 	ror     $6, y0                # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 331) 	xor     g, y2                 # y2 = CH = ((f^g)&e)^g
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 332) 	add     y0, y2                # y2 = S1 + CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 333) 	ror     $2, y1                # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 334) 	offset = \round * 4 + _XFER   # stack offset of the k + w dword for this round
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 335) 	add     offset(%rsp), y2      # y2 = k + w + S1 + CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 336) 	mov     a, y0                 # y0 = a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 337) 	add     y2, h                 # h = h + S1 + CH + k + w
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 338) 	mov     a, y2                 # y2 = a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 339) 	or      c, y0                 # y0 = a|c
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 340) 	add     h, d                  # d = d + h + S1 + CH + k + w
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 341) 	and     c, y2                 # y2 = a&c
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 342) 	and     b, y0                 # y0 = (a|c)&b
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 343) 	add     y1, h                 # h = h + S1 + CH + k + w + S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 344) 	or      y2, y0		      # y0 = MAJ = ((a|c)&b)|(a&c)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 345) 	add     y0, h		      # h = h + S1 + CH + k + w + S0 + MAJ
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 346) 	ROTATE_ARGS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 347) .endm
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 348) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 349) ########################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 350) ## void sha256_transform_ssse3(struct sha256_state *state, const u8 *data,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 351) ##			       int blocks);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 352) ## arg 1 : pointer to state
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 353) ##	   (struct sha256_state is assumed to begin with u32 state[8])
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 354) ## arg 2 : pointer to input data
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 355) ## arg 3 : Num blocks
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 356) ########################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 357) .text
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 358) SYM_FUNC_START(sha256_transform_ssse3)
## Flow of this routine, as far as this listing shows:
##   1. save the callee-saved registers and carve out a 16-byte aligned frame
##   2. turn the block count into an end-of-input pointer
##      (a count of zero goes straight to the epilogue)
##   3. load the eight 32-bit state words from CTX into a..h
##   4. for every 64-byte block: 48 rounds with message scheduling (loop1),
##      16 rounds without scheduling (loop2), then fold a..h back into the
##      state at CTX
##   5. restore stack and registers, return
## NUM_BLKS, INP, CTX, TBL, SRND, a..h, X0..X3, XFER, the shuffle-mask
## registers, STACK_SIZE, the _XFER/_INP/_INP_END frame offsets and the macros
## COPY_XMM_AND_BSWAP, FOUR_ROUNDS_AND_SCHED, DO_ROUND and addm are all
## defined earlier in the file, outside this listing.
##
## NOTE(review): the alignment directive below comes after the entry label
## (assuming SYM_FUNC_START emits that label), so any padding it produces sits
## inside the function body instead of aligning the entry point -- confirm
## whether that is intended.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 359) .align 32
	## Prologue: rbx, r12-r15 and rbp are callee-saved in the SysV AMD64 ABI.
	## rbp then keeps the pre-frame rsp so the epilogue can undo both the
	## subtraction and the alignment mask with a single move.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 360) 	pushq   %rbx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 361) 	pushq   %r12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 362) 	pushq   %r13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 363) 	pushq   %r14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 364) 	pushq   %r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 365) 	pushq   %rbp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 366) 	mov	%rsp, %rbp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 367) 
	## Reserve the local frame and round rsp down to a multiple of 16.
	## The movdqa stores to _XFER(%rsp) below fault on unaligned addresses
	## (this assumes _XFER itself is a multiple of 16 -- TODO confirm).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 368) 	subq    $STACK_SIZE, %rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 369) 	and	$~15, %rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 370) 
	## NUM_BLKS *= 64 (one block is 64 bytes). The shift leaves ZF set when
	## the result is zero, so an empty request skips to the epilogue.
	## NOTE(review): the whole NUM_BLKS register takes part in the shift;
	## confirm the caller passes a value with clean upper bits.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 371) 	shl     $6, NUM_BLKS		 # convert to bytes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 372) 	jz      done_hash
	## NUM_BLKS = INP + byte count, kept in the frame as the loop bound.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 373) 	add     INP, NUM_BLKS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 374) 	mov     NUM_BLKS, _INP_END(%rsp) # pointer to end of data
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 375) 
	## a..h <- the eight dwords at CTX+0 .. CTX+28.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 376) 	## load initial digest
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 377) 	mov     4*0(CTX), a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 378) 	mov     4*1(CTX), b
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 379) 	mov     4*2(CTX), c
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 380) 	mov     4*3(CTX), d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 381) 	mov     4*4(CTX), e
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 382) 	mov     4*5(CTX), f
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 383) 	mov     4*6(CTX), g
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 384) 	mov     4*7(CTX), h
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 385) 
	## Load the three 16-byte shuffle masks from .rodata once, before the
	## per-block loop (presumably consumed by COPY_XMM_AND_BSWAP and
	## FOUR_ROUNDS_AND_SCHED -- verify against the macro definitions).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 386) 	movdqa  PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 387) 	movdqa  _SHUF_00BA(%rip), SHUF_00BA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 388) 	movdqa  _SHUF_DC00(%rip), SHUF_DC00
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 389) 
	## Top of the per-block loop. TBL is reset to the start of the K256
	## constant table for every block.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 390) loop0:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 391) 	lea     K256(%rip), TBL
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 392) 
	## X0..X3 <- the four 16-byte quarters of the block at INP.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 393) 	## byte swap first 16 dwords
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 394) 	COPY_XMM_AND_BSWAP      X0, 0*16(INP), BYTE_FLIP_MASK
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 395) 	COPY_XMM_AND_BSWAP      X1, 1*16(INP), BYTE_FLIP_MASK
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 396) 	COPY_XMM_AND_BSWAP      X2, 2*16(INP), BYTE_FLIP_MASK
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 397) 	COPY_XMM_AND_BSWAP      X3, 3*16(INP), BYTE_FLIP_MASK
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 398) 
	## Spill the input pointer; it is reloaded from this slot after the
	## rounds (presumably its register is reused in between).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 399) 	mov     INP, _INP(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 400) 
	## loop1 runs 3 passes. Each pass has four groups of
	##   XFER = 16 bytes of K256 + X0, stored to _XFER(%rsp),
	##   followed by FOUR_ROUNDS_AND_SCHED,
	## i.e. 3 x 4 x 4 = 48 rounds. X0 is named in every group; presumably
	## FOUR_ROUNDS_AND_SCHED rotates the X0..X3 names -- confirm against the
	## macro definition.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 401) 	## schedule 48 input dwords, by doing 3 rounds of 16 each
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 402) 	mov     $3, SRND
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 403) .align 16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 404) loop1:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 405) 	movdqa  (TBL), XFER
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 406) 	paddd   X0, XFER
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 407) 	movdqa  XFER, _XFER(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 408) 	FOUR_ROUNDS_AND_SCHED
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 409) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 410) 	movdqa  1*16(TBL), XFER
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 411) 	paddd   X0, XFER
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 412) 	movdqa  XFER, _XFER(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 413) 	FOUR_ROUNDS_AND_SCHED
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 414) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 415) 	movdqa  2*16(TBL), XFER
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 416) 	paddd   X0, XFER
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 417) 	movdqa  XFER, _XFER(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 418) 	FOUR_ROUNDS_AND_SCHED
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 419) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 420) 	movdqa  3*16(TBL), XFER
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 421) 	paddd   X0, XFER
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 422) 	movdqa  XFER, _XFER(%rsp)
	## Advance TBL by 64 bytes (16 constants) for the next pass; the
	## constants for this last group are already stored in _XFER(%rsp).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 423) 	add     $4*16, TBL
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 424) 	FOUR_ROUNDS_AND_SCHED
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 425) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 426) 	sub     $1, SRND
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 427) 	jne     loop1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 428) 
	## loop2 runs 2 passes of 8 rounds each (16 rounds, no scheduling).
	## The K256 chunk is added straight into X0, then X1, and the sum is
	## stored to _XFER(%rsp) before each group of four DO_ROUND calls (the
	## DO_ROUND argument presumably selects the dword within that slot).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 429) 	mov     $2, SRND
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 430) loop2:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 431) 	paddd   (TBL), X0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 432) 	movdqa  X0, _XFER(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 433) 	DO_ROUND        0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 434) 	DO_ROUND        1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 435) 	DO_ROUND        2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 436) 	DO_ROUND        3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 437) 	paddd   1*16(TBL), X1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 438) 	movdqa  X1, _XFER(%rsp)
	## TBL moves on by the 32 bytes consumed in this pass.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 439) 	add     $2*16, TBL
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 440) 	DO_ROUND        0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 441) 	DO_ROUND        1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 442) 	DO_ROUND        2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 443) 	DO_ROUND        3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 444) 
	## Move X2/X3 into X0/X1 so the second pass can run the same code on
	## the remaining two vectors.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 445) 	movdqa  X2, X0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 446) 	movdqa  X3, X1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 447) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 448) 	sub     $1, SRND
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 449) 	jne     loop2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 450) 
	## Fold the working values into the state words at CTX. addm is defined
	## outside this listing; presumably it adds the memory operand into the
	## register and writes the sum back to memory, which would leave a..h
	## holding the updated state for the next block -- confirm.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 451) 	addm    (4*0)(CTX),a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 452) 	addm    (4*1)(CTX),b
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 453) 	addm    (4*2)(CTX),c
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 454) 	addm    (4*3)(CTX),d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 455) 	addm    (4*4)(CTX),e
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 456) 	addm    (4*5)(CTX),f
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 457) 	addm    (4*6)(CTX),g
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 458) 	addm    (4*7)(CTX),h
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 459) 
	## Reload the spilled input pointer, step to the next 64-byte block and
	## repeat until it equals the end pointer computed in the prologue.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 460) 	mov     _INP(%rsp), INP
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 461) 	add     $64, INP
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 462) 	cmp     _INP_END(%rsp), INP
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 463) 	jne     loop0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 464) 
	## Epilogue, also the target of the zero-block early exit. Restoring
	## rsp from rbp drops the aligned frame; the pops mirror the prologue
	## pushes in reverse order.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 465) done_hash:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 466) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 467) 	mov	%rbp, %rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 468) 	popq	%rbp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 469) 	popq    %r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 470) 	popq    %r14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 471) 	popq    %r13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 472) 	popq    %r12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 473) 	popq    %rbx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 474) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 475) 	ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 476) SYM_FUNC_END(sha256_transform_ssse3)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 477) 
## K256: the 64 SHA-256 round constants (FIPS 180-4), one 32-bit word per
## round, stored in round order as 16 rows of four dwords (256 bytes).
## sha256_transform_ssse3 points TBL at this table and reads it 16 bytes at a
## time with movdqa/paddd, which need 16-byte aligned memory operands; the
## 64-byte alignment below satisfies that.
## Section flags "aM" with entity size 256: allocatable, mergeable constants.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 478) .section	.rodata.cst256.K256, "aM", @progbits, 256
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 479) .align 64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 480) K256:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 481)         .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 482)         .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 483)         .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 484)         .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 485)         .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 486)         .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 487)         .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 488)         .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 489)         .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 490)         .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 491)         .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 492)         .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 493)         .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 494)         .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 495)         .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 496)         .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 497) 
## PSHUFFLE_BYTE_FLIP_MASK: 16-byte shuffle control whose bytes, from lowest
## address up, are 03 02 01 00 07 06 05 04 0b 0a 09 08 0f 0e 0d 0c. Used as a
## pshufb control it reverses the byte order inside each of the four dwords.
## sha256_transform_ssse3 loads it into BYTE_FLIP_MASK and passes it to
## COPY_XMM_AND_BSWAP (macro defined outside this listing; presumably it
## applies pshufb with this mask -- confirm).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 498) .section	.rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 499) .align 16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 500) PSHUFFLE_BYTE_FLIP_MASK:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 501) 	.octa 0x0c0d0e0f08090a0b0405060700010203
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 502) 
## _SHUF_00BA: 16-byte shuffle control. Low 8 bytes are 00 01 02 03 08 09 0a 0b
## and the high 8 bytes are all FF. Used as a pshufb control it packs source
## dwords 0 and 2 into destination dwords 0 and 1 and zeroes the upper two
## dwords (an index byte with its top bit set yields zero).
## Loaded into SHUF_00BA by sha256_transform_ssse3; the consumer is presumably
## FOUR_ROUNDS_AND_SCHED, defined outside this listing -- confirm.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 503) .section	.rodata.cst16._SHUF_00BA, "aM", @progbits, 16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 504) .align 16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 505) # shuffle xBxA -> 00BA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 506) _SHUF_00BA:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 507) 	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 508) 
## _SHUF_DC00: 16-byte shuffle control. Low 8 bytes are all FF and the high
## 8 bytes are 00 01 02 03 08 09 0a 0b. Used as a pshufb control it zeroes the
## lower two dwords and packs source dwords 0 and 2 into destination dwords 2
## and 3 (an index byte with its top bit set yields zero).
## Loaded into SHUF_DC00 by sha256_transform_ssse3; the consumer is presumably
## FOUR_ROUNDS_AND_SCHED, defined outside this listing -- confirm.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 509) .section	.rodata.cst16._SHUF_DC00, "aM", @progbits, 16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 510) .align 16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 511) # shuffle xDxC -> DC00
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 512) _SHUF_DC00:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 513) 	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF