########################################################################
# Implement fast SHA-256 with AVX1 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <james.guilford@intel.com>
#     Kirk Yap <kirk.s.yap@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################
# This code schedules 1 block at a time, with 4 lanes per block
########################################################################
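#
# Per 64-byte block the transform proceeds in three phases:
#  1) load the 16 message dwords and byte-swap them to big-endian
#     (COPY_XMM_AND_BSWAP into X0..X3, four dwords per xmm register);
#  2) run 48 rounds (loop1: 3 iterations x 16 rounds), interleaving
#     the scalar rounds with vectorized message scheduling
#     (each FOUR_ROUNDS_AND_SCHED yields the next four W values);
#  3) run the final 16 rounds (loop2) on the already-scheduled W
#     values, then add the working variables back into the digest.
########################################################################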

#include <linux/linkage.h>

## assume buffers not aligned
#define	VMOVDQ vmovdqu

################################ Define Macros

# addm [mem], reg
# Add reg to mem using reg-mem add and store
.macro addm p1 p2
	add	\p1, \p2
	mov	\p2, \p1
.endm
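# e.g. "addm (4*0)(CTX), a" (used below to fold the block result back
# into the digest) computes a += mem and then stores a back to mem.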


.macro MY_ROR p1 p2
	shld	$(32-(\p1)), \p2, \p2
.endm
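# shld with identical source and destination rotates the register
# left, and a left rotate by (32 - p1) equals a right rotate by p1,
# so "MY_ROR p1 reg" behaves exactly like "ror $p1, reg".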

################################

# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
# Load xmm with mem and byte swap each dword
.macro COPY_XMM_AND_BSWAP p1 p2 p3
	VMOVDQ	\p2, \p1
	vpshufb	\p3, \p1, \p1
.endm

################################

X0 = %xmm4
X1 = %xmm5
X2 = %xmm6
X3 = %xmm7

XTMP0 = %xmm0
XTMP1 = %xmm1
XTMP2 = %xmm2
XTMP3 = %xmm3
XTMP4 = %xmm8
XFER  = %xmm9
XTMP5 = %xmm11

SHUF_00BA = %xmm10	# shuffle xBxA -> 00BA
SHUF_DC00 = %xmm12	# shuffle xDxC -> DC00
BYTE_FLIP_MASK = %xmm13

NUM_BLKS = %rdx		# 3rd arg
INP = %rsi		# 2nd arg
CTX = %rdi		# 1st arg

SRND = %rsi		# clobbers INP
c = %ecx
d = %r8d
e = %edx
TBL = %r12
a = %eax
b = %ebx

f = %r9d
g = %r10d
h = %r11d

y0 = %r13d
y1 = %r14d
y2 = %r15d


_INP_END_SIZE = 8
_INP_SIZE = 8
_XFER_SIZE = 16
_XMM_SAVE_SIZE = 0

_INP_END = 0
_INP     = _INP_END + _INP_END_SIZE
_XFER    = _INP + _INP_SIZE
_XMM_SAVE = _XFER + _XFER_SIZE
STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE
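# Frame layout relative to the aligned %rsp:
#	_INP_END (+0,  8 bytes): pointer just past the last input block
#	_INP     (+8,  8 bytes): saved input pointer for the current block
#	_XFER    (+16, 16 bytes): K[t] + W[t] values for the next 4 rounds
# (_XMM_SAVE is empty: this version spills no xmm registers.)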

# rotate_Xs
# Rotate values of symbols X0...X3
.macro rotate_Xs
X_ = X0
X0 = X1
X1 = X2
X2 = X3
X3 = X_
.endm

# ROTATE_ARGS
# Rotate values of symbols a...h
.macro ROTATE_ARGS
TMP_ = h
h = g
g = f
f = e
e = d
d = c
c = b
b = a
a = TMP_
.endm
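# Both macros rotate assembler symbols, not data: after ROTATE_ARGS
# the name "a" refers to the register that previously held "h", so
# every round is written identically without any mov between the
# working variables.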

.macro FOUR_ROUNDS_AND_SCHED
	## compute s0 four at a time and s1 two at a time
	## compute W[-16] + W[-7] 4 at a time

	mov	e, y0			# y0 = e
	MY_ROR	(25-11), y0		# y0 = e >> (25-11)
	mov	a, y1			# y1 = a
	vpalignr $4, X2, X3, XTMP0	# XTMP0 = W[-7]
	MY_ROR	(22-13), y1		# y1 = a >> (22-13)
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	mov	f, y2			# y2 = f
	MY_ROR	(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	xor	g, y2			# y2 = f^g
	vpaddd	X0, XTMP0, XTMP0	# XTMP0 = W[-7] + W[-16]
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and	e, y2			# y2 = (f^g)&e
	MY_ROR	(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	## compute s0
	vpalignr $4, X0, X1, XTMP1	# XTMP1 = W[-15]
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	MY_ROR	6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	MY_ROR	2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add	y0, y2			# y2 = S1 + CH
	add	_XFER(%rsp), y2		# y2 = k + w + S1 + CH
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	vpsrld	$7, XTMP1, XTMP2	# XTMP2 = W[-15] >> 7
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	vpslld	$(32-7), XTMP1, XTMP3	# XTMP3 = W[-15] << (32-7)
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	vpor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] MY_ROR 7
	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
	mov	e, y0			# y0 = e
	mov	a, y1			# y1 = a
	MY_ROR	(25-11), y0		# y0 = e >> (25-11)
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	mov	f, y2			# y2 = f
	MY_ROR	(22-13), y1		# y1 = a >> (22-13)
	vpsrld	$18, XTMP1, XTMP2	# XTMP2 = W[-15] >> 18
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	MY_ROR	(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor	g, y2			# y2 = f^g
	vpsrld	$3, XTMP1, XTMP4	# XTMP4 = W[-15] >> 3
	MY_ROR	(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and	e, y2			# y2 = (f^g)&e
	MY_ROR	6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	vpslld	$(32-18), XTMP1, XTMP1	# XTMP1 = W[-15] << (32-18)
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	vpxor	XTMP1, XTMP3, XTMP3	# XTMP3 = W[-15] MY_ROR 7 ^ W[-15] << (32-18)
	add	y0, y2			# y2 = S1 + CH
	add	(1*4 + _XFER)(%rsp), y2	# y2 = k + w + S1 + CH
	MY_ROR	2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	vpxor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	vpxor	XTMP4, XTMP3, XTMP1	# XTMP1 = s0
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	## compute low s1
	vpshufd	$0b11111010, X3, XTMP2	# XTMP2 = W[-2] {BBAA}
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	vpaddd	XTMP1, XTMP0, XTMP0	# XTMP0 = W[-16] + W[-7] + s0
	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
	mov	e, y0			# y0 = e
	mov	a, y1			# y1 = a
	MY_ROR	(25-11), y0		# y0 = e >> (25-11)
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	MY_ROR	(22-13), y1		# y1 = a >> (22-13)
	mov	f, y2			# y2 = f
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	MY_ROR	(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	vpsrld	$10, XTMP2, XTMP4	# XTMP4 = W[-2] >> 10 {BBAA}
	xor	g, y2			# y2 = f^g
	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] MY_ROR 19 {xBxA}
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and	e, y2			# y2 = (f^g)&e
	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] MY_ROR 17 {xBxA}
	MY_ROR	(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	MY_ROR	6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	vpxor	XTMP3, XTMP2, XTMP2	# XTMP2 = W[-2] MY_ROR 17 ^ W[-2] MY_ROR 19 {xBxA}
	add	y0, y2			# y2 = S1 + CH
	MY_ROR	2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add	(2*4 + _XFER)(%rsp), y2	# y2 = k + w + S1 + CH
	vpxor	XTMP2, XTMP4, XTMP4	# XTMP4 = s1 {xBxA}
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	vpshufb	SHUF_00BA, XTMP4, XTMP4	# XTMP4 = s1 {00BA}
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	vpaddd	XTMP4, XTMP0, XTMP0	# XTMP0 = {..., ..., W[1], W[0]}
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	## compute high s1
	vpshufd	$0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
	mov	e, y0			# y0 = e
	MY_ROR	(25-11), y0		# y0 = e >> (25-11)
	mov	a, y1			# y1 = a
	MY_ROR	(22-13), y1		# y1 = a >> (22-13)
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	mov	f, y2			# y2 = f
	MY_ROR	(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	vpsrld	$10, XTMP2, XTMP5	# XTMP5 = W[-2] >> 10 {DDCC}
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	xor	g, y2			# y2 = f^g
	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] MY_ROR 19 {xDxC}
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and	e, y2			# y2 = (f^g)&e
	MY_ROR	(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] MY_ROR 17 {xDxC}
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	MY_ROR	6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	vpxor	XTMP3, XTMP2, XTMP2	# XTMP2 = W[-2] MY_ROR 17 ^ W[-2] MY_ROR 19 {xDxC}
	MY_ROR	2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add	y0, y2			# y2 = S1 + CH
	add	(3*4 + _XFER)(%rsp), y2	# y2 = k + w + S1 + CH
	vpxor	XTMP2, XTMP5, XTMP5	# XTMP5 = s1 {xDxC}
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	vpshufb	SHUF_DC00, XTMP5, XTMP5	# XTMP5 = s1 {DC00}
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	vpaddd	XTMP0, XTMP5, X0	# X0 = {W[3], W[2], W[1], W[0]}
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
	rotate_Xs
.endm
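# For reference, the message schedule computed above is the FIPS 180-4
# recurrence:
#	s0(x) = (x ror 7) ^ (x ror 18) ^ (x >> 3)
#	s1(x) = (x ror 17) ^ (x ror 19) ^ (x >> 10)
#	W[t] = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]
# s0 is computed for all four lanes at once with shift/or pairs (AVX1
# has no packed dword rotate); s1 is computed two lanes at a time via
# 64-bit lane shifts (vpsrlq), and the {00BA}/{DC00} shuffles merge
# the two halves into X0.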

## input is [rsp + _XFER + \round * 4]
.macro DO_ROUND round
	mov	e, y0			# y0 = e
	MY_ROR	(25-11), y0		# y0 = e >> (25-11)
	mov	a, y1			# y1 = a
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	MY_ROR	(22-13), y1		# y1 = a >> (22-13)
	mov	f, y2			# y2 = f
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	MY_ROR	(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor	g, y2			# y2 = f^g
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	MY_ROR	(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	and	e, y2			# y2 = (f^g)&e
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	MY_ROR	6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	add	y0, y2			# y2 = S1 + CH
	MY_ROR	2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	offset = \round * 4 + _XFER
	add	offset(%rsp), y2	# y2 = k + w + S1 + CH
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
.endm
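# For reference, one FIPS 180-4 round is:
#	S1  = (e ror 6) ^ (e ror 11) ^ (e ror 25)
#	CH  = (e & f) ^ (~e & g)	(computed above as ((f^g)&e)^g)
#	S0  = (a ror 2) ^ (a ror 13) ^ (a ror 22)
#	MAJ = (a&b) ^ (a&c) ^ (b&c)	(computed above as ((a|c)&b)|(a&c))
#	T1 = h + S1 + CH + K[t] + W[t];  T2 = S0 + MAJ
#	h=g; g=f; f=e; e=d+T1; d=c; c=b; b=a; a=T1+T2
# The code folds T1 into h, adds h into d, adds S0 and MAJ into h,
# and lets ROTATE_ARGS rename the registers instead of moving them.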

########################################################################
## void sha256_transform_avx(struct sha256_state *state, const u8 *data, int blocks)
## arg 1 : pointer to state
## arg 2 : pointer to input data
## arg 3 : Num blocks
########################################################################
.text
SYM_FUNC_START(sha256_transform_avx)
.align 32
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	pushq	%rbp
	movq	%rsp, %rbp

	subq	$STACK_SIZE, %rsp	# allocate stack space
	and	$~15, %rsp		# align stack pointer

	shl	$6, NUM_BLKS		# convert to bytes
	jz	done_hash
	add	INP, NUM_BLKS		# pointer to end of data
	mov	NUM_BLKS, _INP_END(%rsp)

	## load initial digest
	mov	4*0(CTX), a
	mov	4*1(CTX), b
	mov	4*2(CTX), c
	mov	4*3(CTX), d
	mov	4*4(CTX), e
	mov	4*5(CTX), f
	mov	4*6(CTX), g
	mov	4*7(CTX), h

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00
loop0:
	lea	K256(%rip), TBL

	## byte swap first 16 dwords
	COPY_XMM_AND_BSWAP	X0, 0*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP	X1, 1*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP	X2, 2*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP	X3, 3*16(INP), BYTE_FLIP_MASK

	mov	INP, _INP(%rsp)

	## schedule 48 input dwords, by doing 3 rounds of 16 each
	mov	$3, SRND
.align 16
loop1:
	vpaddd	(TBL), X0, XFER
	vmovdqa	XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddd	1*16(TBL), X0, XFER
	vmovdqa	XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddd	2*16(TBL), X0, XFER
	vmovdqa	XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddd	3*16(TBL), X0, XFER
	vmovdqa	XFER, _XFER(%rsp)
	add	$4*16, TBL
	FOUR_ROUNDS_AND_SCHED

	sub	$1, SRND
	jne	loop1

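	## final 16 rounds: all W values are already scheduled, so just
	## consume them 8 at a time from X0/X1 (X2/X3 are copied down
	## after each pass, giving 2 iterations x 8 rounds)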
	mov	$2, SRND
loop2:
	vpaddd	(TBL), X0, XFER
	vmovdqa	XFER, _XFER(%rsp)
	DO_ROUND	0
	DO_ROUND	1
	DO_ROUND	2
	DO_ROUND	3

	vpaddd	1*16(TBL), X1, XFER
	vmovdqa	XFER, _XFER(%rsp)
	add	$2*16, TBL
	DO_ROUND	0
	DO_ROUND	1
	DO_ROUND	2
	DO_ROUND	3

	vmovdqa	X2, X0
	vmovdqa	X3, X1

	sub	$1, SRND
	jne	loop2

	addm	(4*0)(CTX), a
	addm	(4*1)(CTX), b
	addm	(4*2)(CTX), c
	addm	(4*3)(CTX), d
	addm	(4*4)(CTX), e
	addm	(4*5)(CTX), f
	addm	(4*6)(CTX), g
	addm	(4*7)(CTX), h

	mov	_INP(%rsp), INP
	add	$64, INP
	cmp	_INP_END(%rsp), INP
	jne	loop0

done_hash:

	mov	%rbp, %rsp
	popq	%rbp
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	ret
SYM_FUNC_END(sha256_transform_avx)
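# A minimal sketch of the C-side declaration a glue file would use to
# call this routine (assuming the usual layout with the eight digest
# dwords at offset 0 of the state struct, matching the loads above):
#
#	asmlinkage void sha256_transform_avx(struct sha256_state *state,
#					     const u8 *data, int blocks);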

.section .rodata.cst256.K256, "aM", @progbits, 256
.align 64
K256:
	.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
.align 16
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203
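# Lane 0 of the mask holds byte indices 3,2,1,0 (and similarly for
# the other lanes), so vpshufb reverses the bytes within each dword,
# converting little-endian input to the big-endian order SHA-256
# expects.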

.section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16
.align 16
# shuffle xBxA -> 00BA
_SHUF_00BA:
	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100

.section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16
.align 16
# shuffle xDxC -> DC00
_SHUF_DC00:
	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF