Orange Pi 5 kernel

Deprecated Linux kernel 5.10.110 for OrangePi 5/5B/5+ boards

########################################################################
# Implement fast SHA-256 with AVX1 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <james.guilford@intel.com>
#     Kirk Yap <kirk.s.yap@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################
# This code schedules 1 block at a time, with 4 lanes per block
########################################################################
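
# For reference, the schedule below implements the FIPS 180-4 message
# expansion on 32-bit words (one dword per lane):
#
#	W[i]      = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16]
#	sigma0(x) = (x ror 7)  ^ (x ror 18) ^ (x >> 3)
#	sigma1(x) = (x ror 17) ^ (x ror 19) ^ (x >> 10)
#
# Each FOUR_ROUNDS_AND_SCHED invocation produces four such W values
# while retiring four rounds of the compression function.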

#include <linux/linkage.h>

## assume buffers not aligned
#define    VMOVDQ vmovdqu

################################ Define Macros

# addm [mem], reg
# Add reg to mem using reg-mem add and store
.macro addm p1 p2
	add     \p1, \p2
	mov     \p2, \p1
.endm
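
# Example: with a bound to %eax (its binding on entry, and again after
# any multiple of eight ROTATE_ARGS), "addm (4*0)(CTX), a" expands to:
#
#	add	(4*0)(CTX), %eax	# reg += mem
#	mov	%eax, (4*0)(CTX)	# mem  = reg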


.macro MY_ROR p1 p2
	shld    $(32-(\p1)), \p2, \p2
.endm
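
# MY_ROR emulates a rotate-right with shld: shifting a register left by
# (32 - n) while its own value shifts in from the right rotates it left
# by (32 - n), which equals rotating it right by n.  For example,
# "MY_ROR 6, %eax" expands to "shld $26, %eax, %eax", giving the same
# result as "ror $6, %eax".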

################################

# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
# Load xmm with mem and byte swap each dword
.macro COPY_XMM_AND_BSWAP p1 p2 p3
	VMOVDQ \p2, \p1
	vpshufb \p3, \p1, \p1
.endm
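
# Example: with the register assignments below (X0 = %xmm4,
# BYTE_FLIP_MASK = %xmm13), "COPY_XMM_AND_BSWAP X0, 0*16(INP),
# BYTE_FLIP_MASK" expands to:
#
#	vmovdqu	0*16(INP), %xmm4	# unaligned 16-byte load
#	vpshufb	%xmm13, %xmm4, %xmm4	# byte-swap each dword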

################################

X0 = %xmm4
X1 = %xmm5
X2 = %xmm6
X3 = %xmm7

XTMP0 = %xmm0
XTMP1 = %xmm1
XTMP2 = %xmm2
XTMP3 = %xmm3
XTMP4 = %xmm8
XFER = %xmm9
XTMP5 = %xmm11

SHUF_00BA = %xmm10      # shuffle xBxA -> 00BA
SHUF_DC00 = %xmm12      # shuffle xDxC -> DC00
BYTE_FLIP_MASK = %xmm13

NUM_BLKS = %rdx   # 3rd arg
INP = %rsi        # 2nd arg
CTX = %rdi        # 1st arg

SRND = %rsi       # clobbers INP
c = %ecx
d = %r8d
e = %edx
TBL = %r12
a = %eax
b = %ebx

f = %r9d
g = %r10d
h = %r11d

y0 = %r13d
y1 = %r14d
y2 = %r15d


_INP_END_SIZE = 8
_INP_SIZE = 8
_XFER_SIZE = 16
_XMM_SAVE_SIZE = 0

_INP_END = 0
_INP            = _INP_END  + _INP_END_SIZE
_XFER           = _INP      + _INP_SIZE
_XMM_SAVE       = _XFER     + _XFER_SIZE
STACK_SIZE      = _XMM_SAVE + _XMM_SAVE_SIZE
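
# Resulting frame layout relative to %rsp (STACK_SIZE = 32 bytes):
#
#	_INP_END	bytes  0..7	pointer to end of input data
#	_INP		bytes  8..15	saved input block pointer
#	_XFER		bytes 16..31	current K[t] + W[t] quadruple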

# rotate_Xs
# Rotate values of symbols X0...X3
.macro rotate_Xs
X_ = X0
X0 = X1
X1 = X2
X2 = X3
X3 = X_
.endm

# ROTATE_ARGS
# Rotate values of symbols a...h
.macro ROTATE_ARGS
TMP_ = h
h = g
g = f
f = e
e = d
d = c
c = b
b = a
a = TMP_
.endm
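
# a..h are assembler symbols rather than fixed registers, so the
# SHA-256 state rotation costs zero instructions: ROTATE_ARGS renames
# the symbols (new a = old h, new b = old a, and so on), and eight
# rotations bind every symbol back to its original register.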

.macro FOUR_ROUNDS_AND_SCHED
	## compute s0 four at a time and s1 two at a time
	## compute W[-16] + W[-7] 4 at a time

	mov     e, y0			# y0 = e
	MY_ROR  (25-11), y0             # y0 = e >> (25-11)
	mov     a, y1                   # y1 = a
	vpalignr $4, X2, X3, XTMP0      # XTMP0 = W[-7]
	MY_ROR  (22-13), y1             # y1 = a >> (22-13)
	xor     e, y0                   # y0 = e ^ (e >> (25-11))
	mov     f, y2                   # y2 = f
	MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor     a, y1                   # y1 = a ^ (a >> (22-13))
	xor     g, y2                   # y2 = f^g
	vpaddd  X0, XTMP0, XTMP0        # XTMP0 = W[-7] + W[-16]
	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and     e, y2                   # y2 = (f^g)&e
	MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
	## compute s0
	vpalignr $4, X0, X1, XTMP1      # XTMP1 = W[-15]
	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	MY_ROR  6, y0                   # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
	MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add     y0, y2                  # y2 = S1 + CH
	add     _XFER(%rsp), y2         # y2 = k + w + S1 + CH
	mov     a, y0                   # y0 = a
	add     y2, h                   # h = h + S1 + CH + k + w
	mov     a, y2                   # y2 = a
	vpsrld  $7, XTMP1, XTMP2
	or      c, y0                   # y0 = a|c
	add     h, d                    # d = d + h + S1 + CH + k + w
	and     c, y2                   # y2 = a&c
	vpslld  $(32-7), XTMP1, XTMP3
	and     b, y0                   # y0 = (a|c)&b
	add     y1, h                   # h = h + S1 + CH + k + w + S0
	vpor    XTMP2, XTMP3, XTMP3     # XTMP3 = W[-15] MY_ROR 7
	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
	mov     e, y0                   # y0 = e
	mov     a, y1                   # y1 = a
	MY_ROR  (25-11), y0             # y0 = e >> (25-11)
	xor     e, y0                   # y0 = e ^ (e >> (25-11))
	mov     f, y2                   # y2 = f
	MY_ROR  (22-13), y1             # y1 = a >> (22-13)
	vpsrld  $18, XTMP1, XTMP2
	xor     a, y1                   # y1 = a ^ (a >> (22-13))
	MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor     g, y2                   # y2 = f^g
	vpsrld  $3, XTMP1, XTMP4        # XTMP4 = W[-15] >> 3
	MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and     e, y2                   # y2 = (f^g)&e
	MY_ROR  6, y0                   # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	vpslld  $(32-18), XTMP1, XTMP1
	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
	vpxor   XTMP1, XTMP3, XTMP3
	add     y0, y2                  # y2 = S1 + CH
	add     (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
	MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	vpxor   XTMP2, XTMP3, XTMP3     # XTMP3 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18
	mov     a, y0                   # y0 = a
	add     y2, h                   # h = h + S1 + CH + k + w
	mov     a, y2                   # y2 = a
	vpxor   XTMP4, XTMP3, XTMP1     # XTMP1 = s0
	or      c, y0                   # y0 = a|c
	add     h, d                    # d = d + h + S1 + CH + k + w
	and     c, y2                   # y2 = a&c
	## compute low s1
	vpshufd $0b11111010, X3, XTMP2  # XTMP2 = W[-2] {BBAA}
	and     b, y0                   # y0 = (a|c)&b
	add     y1, h                   # h = h + S1 + CH + k + w + S0
	vpaddd  XTMP1, XTMP0, XTMP0     # XTMP0 = W[-16] + W[-7] + s0
	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
	mov     e, y0                   # y0 = e
	mov     a, y1                   # y1 = a
	MY_ROR  (25-11), y0             # y0 = e >> (25-11)
	xor     e, y0                   # y0 = e ^ (e >> (25-11))
	MY_ROR  (22-13), y1             # y1 = a >> (22-13)
	mov     f, y2                   # y2 = f
	xor     a, y1                   # y1 = a ^ (a >> (22-13))
	MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
	vpsrld  $10, XTMP2, XTMP4       # XTMP4 = W[-2] >> 10 {BBAA}
	xor     g, y2                   # y2 = f^g
	vpsrlq  $19, XTMP2, XTMP3       # XTMP3 = W[-2] MY_ROR 19 {xBxA}
	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and     e, y2                   # y2 = (f^g)&e
	vpsrlq  $17, XTMP2, XTMP2       # XTMP2 = W[-2] MY_ROR 17 {xBxA}
	MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
	MY_ROR  6, y0                   # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	vpxor   XTMP3, XTMP2, XTMP2
	add     y0, y2                  # y2 = S1 + CH
	MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add     (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
	vpxor   XTMP2, XTMP4, XTMP4     # XTMP4 = s1 {xBxA}
	mov     a, y0                   # y0 = a
	add     y2, h                   # h = h + S1 + CH + k + w
	mov     a, y2                   # y2 = a
	vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA}
	or      c, y0                   # y0 = a|c
	add     h, d                    # d = d + h + S1 + CH + k + w
	and     c, y2                   # y2 = a&c
	vpaddd  XTMP4, XTMP0, XTMP0     # XTMP0 = {..., ..., W[1], W[0]}
	and     b, y0                   # y0 = (a|c)&b
	add     y1, h                   # h = h + S1 + CH + k + w + S0
	## compute high s1
	vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
	mov     e, y0                   # y0 = e
	MY_ROR  (25-11), y0             # y0 = e >> (25-11)
	mov     a, y1                   # y1 = a
	MY_ROR  (22-13), y1             # y1 = a >> (22-13)
	xor     e, y0                   # y0 = e ^ (e >> (25-11))
	mov     f, y2                   # y2 = f
	MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
	vpsrld  $10, XTMP2, XTMP5       # XTMP5 = W[-2] >> 10 {DDCC}
	xor     a, y1                   # y1 = a ^ (a >> (22-13))
	xor     g, y2                   # y2 = f^g
	vpsrlq  $19, XTMP2, XTMP3       # XTMP3 = W[-2] MY_ROR 19 {xDxC}
	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and     e, y2                   # y2 = (f^g)&e
	MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
	vpsrlq  $17, XTMP2, XTMP2       # XTMP2 = W[-2] MY_ROR 17 {xDxC}
	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	MY_ROR  6, y0                   # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
	vpxor   XTMP3, XTMP2, XTMP2
	MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add     y0, y2                  # y2 = S1 + CH
	add     (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
	vpxor   XTMP2, XTMP5, XTMP5     # XTMP5 = s1 {xDxC}
	mov     a, y0                   # y0 = a
	add     y2, h                   # h = h + S1 + CH + k + w
	mov     a, y2                   # y2 = a
	vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00}
	or      c, y0                   # y0 = a|c
	add     h, d                    # d = d + h + S1 + CH + k + w
	and     c, y2                   # y2 = a&c
	vpaddd  XTMP0, XTMP5, X0        # X0 = {W[3], W[2], W[1], W[0]}
	and     b, y0                   # y0 = (a|c)&b
	add     y1, h                   # h = h + S1 + CH + k + w + S0
	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
	rotate_Xs
.endm
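
# Net effect of one FOUR_ROUNDS_AND_SCHED: four rounds are retired
# using the K+W values staged at _XFER(%rsp), four new schedule words
# are computed into X0, and rotate_Xs renames the X registers so the
# newest words sit in X3 and X0..X3 stay the oldest-to-newest 16-dword
# window.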

## input is [rsp + _XFER + \round * 4]
.macro DO_ROUND round
	mov     e, y0			# y0 = e
	MY_ROR  (25-11), y0             # y0 = e >> (25-11)
	mov     a, y1                   # y1 = a
	xor     e, y0                   # y0 = e ^ (e >> (25-11))
	MY_ROR  (22-13), y1             # y1 = a >> (22-13)
	mov     f, y2                   # y2 = f
	xor     a, y1                   # y1 = a ^ (a >> (22-13))
	MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor     g, y2                   # y2 = f^g
	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
	and     e, y2                   # y2 = (f^g)&e
	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	MY_ROR  6, y0                   # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
	add     y0, y2                  # y2 = S1 + CH
	MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	offset = \round * 4 + _XFER
	add     offset(%rsp), y2	# y2 = k + w + S1 + CH
	mov     a, y0			# y0 = a
	add     y2, h                   # h = h + S1 + CH + k + w
	mov     a, y2                   # y2 = a
	or      c, y0                   # y0 = a|c
	add     h, d                    # d = d + h + S1 + CH + k + w
	and     c, y2                   # y2 = a&c
	and     b, y0                   # y0 = (a|c)&b
	add     y1, h                   # h = h + S1 + CH + k + w + S0
	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
.endm
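
# DO_ROUND is one scalar SHA-256 round (FIPS 180-4), with its steps
# interleaved above for instruction-level parallelism:
#
#	S1  = (e ror 6) ^ (e ror 11) ^ (e ror 25)
#	CH  = (e & f) ^ (~e & g)	# computed as ((f^g) & e) ^ g
#	S0  = (a ror 2) ^ (a ror 13) ^ (a ror 22)
#	MAJ = (a&b) | (a&c) | (b&c)	# computed as ((a|c) & b) | (a&c)
#	h += S1 + CH + K[t] + W[t];  d += h;  h += S0 + MAJ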

########################################################################
## void sha256_transform_avx(struct sha256_state *state, const u8 *data, int blocks)
## arg 1 : pointer to state
## arg 2 : pointer to input data
## arg 3 : Num blocks
########################################################################
.text
SYM_FUNC_START(sha256_transform_avx)
.align 32
	pushq   %rbx
	pushq   %r12
	pushq   %r13
	pushq   %r14
	pushq   %r15
	pushq	%rbp
	movq	%rsp, %rbp

	subq    $STACK_SIZE, %rsp	# allocate stack space
	and	$~15, %rsp		# align stack pointer

	shl     $6, NUM_BLKS		# convert to bytes
	jz      done_hash
	add     INP, NUM_BLKS		# pointer to end of data
	mov     NUM_BLKS, _INP_END(%rsp)

	## load initial digest
	mov     4*0(CTX), a
	mov     4*1(CTX), b
	mov     4*2(CTX), c
	mov     4*3(CTX), d
	mov     4*4(CTX), e
	mov     4*5(CTX), f
	mov     4*6(CTX), g
	mov     4*7(CTX), h

	vmovdqa  PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa  _SHUF_00BA(%rip), SHUF_00BA
	vmovdqa  _SHUF_DC00(%rip), SHUF_DC00
loop0:
	lea     K256(%rip), TBL

	## byte swap first 16 dwords
	COPY_XMM_AND_BSWAP      X0, 0*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP      X1, 1*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP      X2, 2*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP      X3, 3*16(INP), BYTE_FLIP_MASK

	mov     INP, _INP(%rsp)

	## schedule 48 input dwords, by doing 3 rounds of 16 each
	mov     $3, SRND
.align 16
loop1:
	vpaddd  (TBL), X0, XFER
	vmovdqa XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddd  1*16(TBL), X0, XFER
	vmovdqa XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddd  2*16(TBL), X0, XFER
	vmovdqa XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddd  3*16(TBL), X0, XFER
	vmovdqa XFER, _XFER(%rsp)
	add	$4*16, TBL
	FOUR_ROUNDS_AND_SCHED

	sub     $1, SRND
	jne     loop1

	mov     $2, SRND
loop2:
	vpaddd  (TBL), X0, XFER
	vmovdqa XFER, _XFER(%rsp)
	DO_ROUND        0
	DO_ROUND        1
	DO_ROUND        2
	DO_ROUND        3

	vpaddd  1*16(TBL), X1, XFER
	vmovdqa XFER, _XFER(%rsp)
	add     $2*16, TBL
	DO_ROUND        0
	DO_ROUND        1
	DO_ROUND        2
	DO_ROUND        3

	vmovdqa X2, X0
	vmovdqa X3, X1

	sub     $1, SRND
	jne     loop2

	addm    (4*0)(CTX), a
	addm    (4*1)(CTX), b
	addm    (4*2)(CTX), c
	addm    (4*3)(CTX), d
	addm    (4*4)(CTX), e
	addm    (4*5)(CTX), f
	addm    (4*6)(CTX), g
	addm    (4*7)(CTX), h

	mov     _INP(%rsp), INP
	add     $64, INP
	cmp     _INP_END(%rsp), INP
	jne     loop0

done_hash:

	mov	%rbp, %rsp
	popq	%rbp
	popq    %r15
	popq    %r14
	popq    %r13
	popq	%r12
	popq    %rbx
	ret
SYM_FUNC_END(sha256_transform_avx)

.section	.rodata.cst256.K256, "aM", @progbits, 256
.align 64
K256:
	.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

.section	.rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
.align 16
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203
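
# vpshufb with this mask reverses the four bytes of each dword (result
# byte i of a lane comes from source byte 3-i), converting big-endian
# message words to the host byte order the arithmetic expects.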

.section	.rodata.cst16._SHUF_00BA, "aM", @progbits, 16
.align 16
# shuffle xBxA -> 00BA
_SHUF_00BA:
	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100

.section	.rodata.cst16._SHUF_DC00, "aM", @progbits, 16
.align 16
# shuffle xDxC -> DC00
_SHUF_DC00:
	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF
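
# In vpshufb, a mask byte with its top bit set produces a zero result
# byte, so the 0xFF entries in these two masks clear the unused dwords
# while the remaining entries move the two valid s1 results into the
# positions vpaddd expects.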