^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1) ########################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2) # Implement fast SHA-256 with SSSE3 instructions. (x86_64)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4) # Copyright (C) 2013 Intel Corporation.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6) # Authors:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7) # James Guilford <james.guilford@intel.com>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8) # Kirk Yap <kirk.s.yap@intel.com>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9) # Tim Chen <tim.c.chen@linux.intel.com>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 10) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 11) # This software is available to you under a choice of one of two
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 12) # licenses. You may choose to be licensed under the terms of the GNU
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 13) # General Public License (GPL) Version 2, available from the file
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 14) # COPYING in the main directory of this source tree, or the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 15) # OpenIB.org BSD license below:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 16) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 17) # Redistribution and use in source and binary forms, with or
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 18) # without modification, are permitted provided that the following
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 19) # conditions are met:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 20) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 21) # - Redistributions of source code must retain the above
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 22) # copyright notice, this list of conditions and the following
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 23) # disclaimer.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 24) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 25) # - Redistributions in binary form must reproduce the above
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 26) # copyright notice, this list of conditions and the following
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 27) # disclaimer in the documentation and/or other materials
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 28) # provided with the distribution.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 29) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 30) # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 31) # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 32) # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 33) # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 34) # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 35) # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 36) # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 37) # SOFTWARE.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 38) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 39) ########################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 40) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 41) # This code is described in an Intel White-Paper:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 42) # "Fast SHA-256 Implementations on Intel Architecture Processors"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 43) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 44) # To find it, surf to http://www.intel.com/p/en_US/embedded
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 45) # and search for that title.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 46) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 47) ########################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 48)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 49) #include <linux/linkage.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 50)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 51) ## assume buffers not aligned
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 52) #define MOVDQ movdqu
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 53)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 54) ################################ Define Macros
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 55)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 56) # addm [mem], reg
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 57) # Add reg to mem using reg-mem add and store
.macro addm p1 p2
	add	\p1, \p2		# \p2 += [\p1]  (read-modify)
	mov	\p2, \p1		# [\p1] = \p2   (write back the sum)
.endm
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 62)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 63) ################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 64)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 65) # COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 66) # Load xmm with mem and byte swap each dword
.macro COPY_XMM_AND_BSWAP p1 p2 p3
	MOVDQ	\p2, \p1		# unaligned 16-byte load: \p1 = [\p2]
	pshufb	\p3, \p1		# byte-swap each dword using flip mask \p3
.endm
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 71)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 72) ################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 73)
## Message-schedule registers: X0..X3 hold the current 16 W dwords.
## The symbolic names (not the physical registers) are rotated by
## rotate_Xs after every FOUR_ROUNDS_AND_SCHED.
X0 = %xmm4
X1 = %xmm5
X2 = %xmm6
X3 = %xmm7

## Scratch registers for the vector schedule computation.
XTMP0 = %xmm0
XTMP1 = %xmm1
XTMP2 = %xmm2
XTMP3 = %xmm3
XTMP4 = %xmm8
XFER = %xmm9			# K[t]+W[t] staging before spill to _XFER(%rsp)

SHUF_00BA = %xmm10 # shuffle xBxA -> 00BA
SHUF_DC00 = %xmm11 # shuffle xDxC -> DC00
BYTE_FLIP_MASK = %xmm12		# pshufb mask for dword-wise byte swap

NUM_BLKS = %rdx # 3rd arg
INP = %rsi      # 2nd arg
CTX = %rdi      # 1st arg

SRND = %rsi     # clobbers INP (INP is spilled to _INP(%rsp) first)
c = %ecx
d = %r8d
e = %edx	# low half of NUM_BLKS; safe: NUM_BLKS is consumed before rounds
TBL = %r12
a = %eax
b = %ebx

f = %r9d
g = %r10d
h = %r11d

## Round temporaries (callee-saved GPRs, pushed in the prologue).
y0 = %r13d
y1 = %r14d
y2 = %r15d



## Stack frame layout (offsets relative to the re-aligned %rsp).
_INP_END_SIZE = 8
_INP_SIZE = 8
_XFER_SIZE = 16
_XMM_SAVE_SIZE = 0

_INP_END = 0				# pointer to end of input data
_INP = _INP_END + _INP_END_SIZE		# saved INP while %rsi serves as SRND
_XFER = _INP + _INP_SIZE		# 16 bytes: K[t]+W[t] for 4 rounds
_XMM_SAVE = _XFER + _XFER_SIZE		# (unused: no xmm save area needed)
STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 122)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 123) # rotate_Xs
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 124) # Rotate values of symbols X0...X3
.macro rotate_Xs
	## Cyclically rename X0..X3 (X0<-X1<-X2<-X3<-old X0) so that
	## FOUR_ROUNDS_AND_SCHED always addresses the schedule window
	## through the same symbolic names. Assembler-time only; emits
	## no instructions.
X_ = X0
X0 = X1
X1 = X2
X2 = X3
X3 = X_
.endm
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 132)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 133) # ROTATE_ARGS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 134) # Rotate values of symbols a...h
.macro ROTATE_ARGS
	## Cyclically rename the working variables a..h (h<-g<-...<-a<-old h),
	## implementing the SHA-256 per-round variable rotation without any
	## register moves. Assembler-time only; emits no instructions.
TMP_ = h
h = g
g = f
f = e
e = d
d = c
c = b
b = a
a = TMP_
.endm
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 146)
.macro FOUR_ROUNDS_AND_SCHED
	## Perform 4 SHA-256 rounds on a..h while computing the next four
	## message-schedule words W[i] = s1(W[i-2]) + W[i-7] + s0(W[i-15])
	## + W[i-16] in the X registers. Scalar round code is interleaved
	## with the SSE schedule code to hide latencies -- the instruction
	## ordering is deliberate; do not reorder.
	## compute s0 four at a time and s1 two at a time
	## compute W[-16] + W[-7] 4 at a time
	movdqa	X3, XTMP0
	mov	e, y0			# y0 = e
	ror	$(25-11), y0		# y0 = e >> (25-11)
	mov	a, y1			# y1 = a
	palignr	$4, X2, XTMP0		# XTMP0 = W[-7]
	ror	$(22-13), y1		# y1 = a >> (22-13)
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	mov	f, y2			# y2 = f
	ror	$(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	movdqa	X1, XTMP1
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	xor	g, y2			# y2 = f^g
	paddd	X0, XTMP0		# XTMP0 = W[-7] + W[-16]
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and	e, y2			# y2 = (f^g)&e
	ror	$(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	## compute s0
	palignr	$4, X0, XTMP1		# XTMP1 = W[-15]
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	ror	$6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	movdqa	XTMP1, XTMP2		# XTMP2 = W[-15]
	ror	$2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add	y0, y2			# y2 = S1 + CH
	add	_XFER(%rsp) , y2	# y2 = k + w + S1 + CH
	movdqa	XTMP1, XTMP3		# XTMP3 = W[-15]
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	pslld	$(32-7), XTMP1		# XTMP1 = W[-15] << 25
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	psrld	$7, XTMP2		# XTMP2 = W[-15] >> 7
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	por	XTMP2, XTMP1		# XTMP1 = W[-15] ror 7
	or	y2, y0			# y0 = MAJ = (a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
					#
	ROTATE_ARGS			#
	movdqa	XTMP3, XTMP2		# XTMP2 = W[-15]
	mov	e, y0			# y0 = e
	mov	a, y1			# y1 = a
	movdqa	XTMP3, XTMP4		# XTMP4 = W[-15]
	ror	$(25-11), y0		# y0 = e >> (25-11)
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	mov	f, y2			# y2 = f
	ror	$(22-13), y1		# y1 = a >> (22-13)
	pslld	$(32-18), XTMP3		# XTMP3 = W[-15] << 14
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	ror	$(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor	g, y2			# y2 = f^g
	psrld	$18, XTMP2		# XTMP2 = W[-15] >> 18
	ror	$(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and	e, y2			# y2 = (f^g)&e
	ror	$6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	pxor	XTMP3, XTMP1		# XTMP1 = W[-15] ror 7 ^ W[-15] << 14
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	psrld	$3, XTMP4		# XTMP4 = W[-15] >> 3
	add	y0, y2			# y2 = S1 + CH
	add	(1*4 + _XFER)(%rsp), y2	# y2 = k + w + S1 + CH
	ror	$2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	pxor	XTMP2, XTMP1		# XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	pxor	XTMP4, XTMP1		# XTMP1 = s0
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	## compute low s1
	pshufd	$0b11111010, X3, XTMP2	# XTMP2 = W[-2] {BBAA}
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	paddd	XTMP1, XTMP0		# XTMP0 = W[-16] + W[-7] + s0
	or	y2, y0			# y0 = MAJ = (a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ

	ROTATE_ARGS
	movdqa	XTMP2, XTMP3		# XTMP3 = W[-2] {BBAA}
	mov	e, y0			# y0 = e
	mov	a, y1			# y1 = a
	ror	$(25-11), y0		# y0 = e >> (25-11)
	movdqa	XTMP2, XTMP4		# XTMP4 = W[-2] {BBAA}
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	ror	$(22-13), y1		# y1 = a >> (22-13)
	mov	f, y2			# y2 = f
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	ror	$(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	psrlq	$17, XTMP2		# XTMP2 = W[-2] ror 17 {xBxA}
	xor	g, y2			# y2 = f^g
	psrlq	$19, XTMP3		# XTMP3 = W[-2] ror 19 {xBxA}
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and	e, y2			# y2 = (f^g)&e
	psrld	$10, XTMP4		# XTMP4 = W[-2] >> 10 {BBAA}
	ror	$(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	ror	$6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	pxor	XTMP3, XTMP2		# XTMP2 = W[-2] ror 17 ^ W[-2] ror 19 {xBxA}
	add	y0, y2			# y2 = S1 + CH
	ror	$2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add	(2*4 + _XFER)(%rsp), y2	# y2 = k + w + S1 + CH
	pxor	XTMP2, XTMP4		# XTMP4 = s1 {xBxA}
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	pshufb	SHUF_00BA, XTMP4	# XTMP4 = s1 {00BA}
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	paddd	XTMP4, XTMP0		# XTMP0 = {..., ..., W[1], W[0]}
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	## compute high s1
	pshufd	$0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
	or	y2, y0			# y0 = MAJ = (a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
					#
	ROTATE_ARGS			#
	movdqa	XTMP2, XTMP3		# XTMP3 = W[-2] {DDCC}
	mov	e, y0			# y0 = e
	ror	$(25-11), y0		# y0 = e >> (25-11)
	mov	a, y1			# y1 = a
	movdqa	XTMP2, X0		# X0 = W[-2] {DDCC}
	ror	$(22-13), y1		# y1 = a >> (22-13)
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	mov	f, y2			# y2 = f
	ror	$(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	psrlq	$17, XTMP2		# XTMP2 = W[-2] ror 17 {xDxC}
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	xor	g, y2			# y2 = f^g
	psrlq	$19, XTMP3		# XTMP3 = W[-2] ror 19 {xDxC}
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and	e, y2			# y2 = (f^g)&e
	ror	$(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	psrld	$10, X0			# X0 = W[-2] >> 10 {DDCC}
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	ror	$6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	pxor	XTMP3, XTMP2		# XTMP2 = W[-2] ror 17 ^ W[-2] ror 19 {xDxC}
	ror	$2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add	y0, y2			# y2 = S1 + CH
	add	(3*4 + _XFER)(%rsp), y2	# y2 = k + w + S1 + CH
	pxor	XTMP2, X0		# X0 = s1 {xDxC}
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	pshufb	SHUF_DC00, X0		# X0 = s1 {DC00}
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	paddd	XTMP0, X0		# X0 = {W[3], W[2], W[1], W[0]}
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	or	y2, y0			# y0 = MAJ = (a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ

	ROTATE_ARGS
	rotate_Xs
.endm
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 314)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 315) ## input is [rsp + _XFER + %1 * 4]
.macro DO_ROUND round
	## One plain SHA-256 round (no schedule update). The round's
	## precomputed K[t]+W[t] is read from _XFER + \round*4 on the stack;
	## the caller stores four of them per 16-byte XFER spill.
	mov	e, y0			# y0 = e
	ror	$(25-11), y0		# y0 = e >> (25-11)
	mov	a, y1			# y1 = a
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	ror	$(22-13), y1		# y1 = a >> (22-13)
	mov	f, y2			# y2 = f
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	ror	$(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor	g, y2			# y2 = f^g
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	ror	$(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	and	e, y2			# y2 = (f^g)&e
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	ror	$6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	add	y0, y2			# y2 = S1 + CH
	ror	$2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	offset = \round * 4 + _XFER
	add	offset(%rsp), y2	# y2 = k + w + S1 + CH
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	or	y2, y0			# y0 = MAJ = (a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
.endm
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 348)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 349) ########################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 350) ## void sha256_transform_ssse3(struct sha256_state *state, const u8 *data,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 351) ## int blocks);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 352) ## arg 1 : pointer to state
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 353) ## (struct sha256_state is assumed to begin with u32 state[8])
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 354) ## arg 2 : pointer to input data
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 355) ## arg 3 : Num blocks
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 356) ########################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 357) .text
SYM_FUNC_START(sha256_transform_ssse3)
.align 32
	## SysV AMD64: CTX=%rdi (state), INP=%rsi (data), NUM_BLKS=%rdx.
	## All callee-saved GPRs this code touches are saved here; %rbp
	## keeps the original %rsp because the frame is re-aligned below.
	pushq   %rbx
	pushq   %r12
	pushq   %r13
	pushq   %r14
	pushq   %r15
	pushq	%rbp
	mov	%rsp, %rbp

	subq    $STACK_SIZE, %rsp
	and	$~15, %rsp		# 16-byte align so movdqa to _XFER is legal

	shl     $6, NUM_BLKS		# convert to bytes (one block = 64 bytes)
	jz      done_hash		# zero blocks: nothing to do
	add     INP, NUM_BLKS
	mov     NUM_BLKS, _INP_END(%rsp) # pointer to end of data

	## load initial digest: a..h = state[0..7]
	mov     4*0(CTX), a
	mov     4*1(CTX), b
	mov     4*2(CTX), c
	mov     4*3(CTX), d
	mov     4*4(CTX), e
	mov     4*5(CTX), f
	mov     4*6(CTX), g
	mov     4*7(CTX), h

	## constants below are defined elsewhere in this file (not shown here)
	movdqa  PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	movdqa  _SHUF_00BA(%rip), SHUF_00BA
	movdqa  _SHUF_DC00(%rip), SHUF_DC00

loop0:					# once per 64-byte block
	lea     K256(%rip), TBL

	## byte swap first 16 dwords (message is big-endian per SHA-256)
	COPY_XMM_AND_BSWAP	X0, 0*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP	X1, 1*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP	X2, 2*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP	X3, 3*16(INP), BYTE_FLIP_MASK

	mov     INP, _INP(%rsp)		# spill INP: %rsi becomes SRND below

	## schedule 48 input dwords, by doing 3 rounds of 16 each
	mov     $3, SRND
.align 16
loop1:					# rounds 0..47 with schedule update
	movdqa  (TBL), XFER
	paddd   X0, XFER
	movdqa  XFER, _XFER(%rsp)	# stash K[t]+W[t] for the scalar rounds
	FOUR_ROUNDS_AND_SCHED

	movdqa  1*16(TBL), XFER
	paddd   X0, XFER
	movdqa  XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	movdqa  2*16(TBL), XFER
	paddd   X0, XFER
	movdqa  XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	movdqa  3*16(TBL), XFER
	paddd   X0, XFER
	movdqa  XFER, _XFER(%rsp)
	add	$4*16, TBL
	FOUR_ROUNDS_AND_SCHED

	sub     $1, SRND
	jne     loop1

	mov     $2, SRND
loop2:					# rounds 48..63: W already in X0..X3
	paddd   (TBL), X0
	movdqa  X0, _XFER(%rsp)
	DO_ROUND	0
	DO_ROUND	1
	DO_ROUND	2
	DO_ROUND	3
	paddd   1*16(TBL), X1
	movdqa  X1, _XFER(%rsp)
	add     $2*16, TBL
	DO_ROUND	0
	DO_ROUND	1
	DO_ROUND	2
	DO_ROUND	3

	movdqa  X2, X0			# shift remaining W words down for the
	movdqa  X3, X1			# second loop2 iteration

	sub     $1, SRND
	jne     loop2

	## add this block's working variables back into the digest state
	addm    (4*0)(CTX),a
	addm    (4*1)(CTX),b
	addm    (4*2)(CTX),c
	addm    (4*3)(CTX),d
	addm    (4*4)(CTX),e
	addm    (4*5)(CTX),f
	addm    (4*6)(CTX),g
	addm    (4*7)(CTX),h

	mov     _INP(%rsp), INP		# restore INP (%rsi was SRND)
	add     $64, INP
	cmp     _INP_END(%rsp), INP
	jne     loop0

done_hash:

	mov	%rbp, %rsp		# discard the re-aligned frame
	popq	%rbp
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx

	## NOTE(review): newer kernel trees require the RET macro here
	## (SLS/retpoline mitigations) instead of bare ret -- confirm
	## against this tree's <linux/linkage.h>.
	ret
SYM_FUNC_END(sha256_transform_ssse3)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 477)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 478) .section .rodata.cst256.K256, "aM", @progbits, 256
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 479) .align 64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 480) K256:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 481) .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 482) .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 483) .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 484) .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 485) .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 486) .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 487) .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 488) .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 489) .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 490) .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 491) .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 492) .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 493) .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 494) .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 495) .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 496) .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 497)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 498) .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 499) .align 16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 500) PSHUFFLE_BYTE_FLIP_MASK:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 501) .octa 0x0c0d0e0f08090a0b0405060700010203
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 502)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 503) .section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 504) .align 16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 505) # shuffle xBxA -> 00BA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 506) _SHUF_00BA:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 507) .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 508)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 509) .section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 510) .align 16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 511) # shuffle xDxC -> DC00
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 512) _SHUF_DC00:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 513) .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF