########################################################################
# Implement fast SHA-256 with AVX2 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <james.guilford@intel.com>
#     Kirk Yap <kirk.s.yap@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
# Redistribution and use in source and binary forms, with or
# without modification, are permitted provided that the following
# conditions are met:
#
#  - Redistributions of source code must retain the above
#    copyright notice, this list of conditions and the following
#    disclaimer.
#
#  - Redistributions in binary form must reproduce the above
#    copyright notice, this list of conditions and the following
#    disclaimer in the documentation and/or other materials
#    provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################
# This code schedules 2 blocks at a time, with 4 lanes per block
########################################################################
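#
# Two 64-byte blocks are processed per outer iteration.  Each ymm
# register X0..X3 holds four message dwords per block: the low 128-bit
# lane carries W[i..i+3] of block 1 and the high lane the same dwords
# of block 2, so every vector instruction advances both message
# schedules at once.  The scalar rounds below consume block 1's words
# directly; block 2's words are staged on the stack and replayed later
# (see loop3).
#
########################################################################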

#include <linux/linkage.h>

## assume buffers not aligned
#define VMOVDQ vmovdqu

################################ Define Macros

# addm [mem], reg
# Add reg to mem using reg-mem add and store
.macro addm p1 p2
	add	\p1, \p2
	mov	\p2, \p1
.endm
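
# Example: "addm (4*0)(CTX), a" computes a += *(u32 *)CTX and stores the
# sum back to the same memory slot; it is used below to fold the working
# variables back into the digest state.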

################################

X0 = %ymm4
X1 = %ymm5
X2 = %ymm6
X3 = %ymm7

# XMM versions of above
XWORD0 = %xmm4
XWORD1 = %xmm5
XWORD2 = %xmm6
XWORD3 = %xmm7

XTMP0 = %ymm0
XTMP1 = %ymm1
XTMP2 = %ymm2
XTMP3 = %ymm3
XTMP4 = %ymm8
XFER  = %ymm9
XTMP5 = %ymm11

SHUF_00BA = %ymm10		# shuffle xBxA -> 00BA
SHUF_DC00 = %ymm12		# shuffle xDxC -> DC00
BYTE_FLIP_MASK = %ymm13

X_BYTE_FLIP_MASK = %xmm13	# XMM version of BYTE_FLIP_MASK

NUM_BLKS = %rdx	# 3rd arg
INP	= %rsi	# 2nd arg
CTX	= %rdi	# 1st arg
c	= %ecx
d	= %r8d
e	= %edx	# clobbers NUM_BLKS
y3	= %esi	# clobbers INP

SRND	= CTX	# SRND is same register as CTX

a = %eax
b = %ebx
f = %r9d
g = %r10d
h = %r11d
old_h = %r11d

T1 = %r12d
y0 = %r13d
y1 = %r14d
y2 = %r15d


_XFER_SIZE	= 2*64*4	# 2 blocks, 64 rounds, 4 bytes/round
_XMM_SAVE_SIZE	= 0
_INP_END_SIZE	= 8
_INP_SIZE	= 8
_CTX_SIZE	= 8
_RSP_SIZE	= 8

_XFER		= 0
_XMM_SAVE	= _XFER     + _XFER_SIZE
_INP_END	= _XMM_SAVE + _XMM_SAVE_SIZE
_INP		= _INP_END  + _INP_END_SIZE
_CTX		= _INP      + _INP_SIZE
_RSP		= _CTX      + _CTX_SIZE
STACK_SIZE	= _RSP      + _RSP_SIZE
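
# Resulting frame layout (byte offsets from the aligned %rsp):
#   _XFER    [0..511]  staged W+K dwords: 2 blocks * 64 rounds * 4 bytes
#   _INP_END [512]     pointer to the last input block
#   _INP     [520]     current input pointer
#   _CTX     [528]     saved state pointer (CTX doubles as SRND)
#   _RSP     [536]     caller's %rsp, restored on exit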

# rotate_Xs
# Rotate values of symbols X0...X3
.macro rotate_Xs
X_ = X0
X0 = X1
X1 = X2
X2 = X3
X3 = X_
.endm

# ROTATE_ARGS
# Rotate values of symbols a...h
.macro ROTATE_ARGS
old_h = h
TMP_ = h
h = g
g = f
f = e
e = d
d = c
c = b
b = a
a = TMP_
.endm
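
# Both rotations happen at assembly time: the macros only reassign which
# register each symbolic name refers to, so the variable shuffle at the
# end of each SHA-256 round costs no run-time instructions.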

.macro FOUR_ROUNDS_AND_SCHED disp
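# Interleaves four scalar rounds with the vector computation of the next
# four message-schedule words (for both blocks at once).  The 'disp'
# argument is the byte offset of this group's staged W+K values within
# the _XFER area; SRND supplies the per-iteration stride.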
################################### RND N + 0 ############################

	mov	a, y3		# y3 = a			# MAJA
	rorx	$25, e, y0	# y0 = e >> 25			# S1A
	rorx	$11, e, y1	# y1 = e >> 11			# S1B

	addl	\disp(%rsp, SRND), h	# h = k + w + h		# --
	or	c, y3		# y3 = a|c			# MAJA
	vpalignr $4, X2, X3, XTMP0	# XTMP0 = W[-7]
	mov	f, y2		# y2 = f			# CH
	rorx	$13, a, T1	# T1 = a >> 13			# S0B

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)	# S1
	xor	g, y2		# y2 = f^g			# CH
	vpaddd	X0, XTMP0, XTMP0	# XTMP0 = W[-7] + W[-16]
	rorx	$6, e, y1	# y1 = (e >> 6)			# S1

	and	e, y2		# y2 = (f^g)&e			# CH
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22			# S0A
	add	h, d		# d = k + w + h + d		# --

	and	b, y3		# y3 = (a|c)&b			# MAJA
	vpalignr $4, X0, X1, XTMP1	# XTMP1 = W[-15]
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)	# S0
	rorx	$2, a, T1	# T1 = (a >> 2)			# S0

	xor	g, y2		# y2 = CH = ((f^g)&e)^g		# CH
	vpsrld	$7, XTMP1, XTMP2
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a			# MAJB
	and	c, T1		# T1 = a&c			# MAJB

	add	y0, y2		# y2 = S1 + CH			# --
	vpslld	$(32-7), XTMP1, XTMP3
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)	# MAJ
	add	y1, h		# h = k + w + h + S0		# --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --
	vpor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7

	vpsrld	$18, XTMP1, XTMP2
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0	# --
	add	y3, h		# h = t1 + S0 + MAJ		# --


	ROTATE_ARGS

################################### RND N + 1 ############################

	mov	a, y3		# y3 = a			# MAJA
	rorx	$25, e, y0	# y0 = e >> 25			# S1A
	rorx	$11, e, y1	# y1 = e >> 11			# S1B
	offset = \disp + 1*4
	addl	offset(%rsp, SRND), h	# h = k + w + h		# --
	or	c, y3		# y3 = a|c			# MAJA


	vpsrld	$3, XTMP1, XTMP4	# XTMP4 = W[-15] >> 3
	mov	f, y2		# y2 = f			# CH
	rorx	$13, a, T1	# T1 = a >> 13			# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)	# S1
	xor	g, y2		# y2 = f^g			# CH


	rorx	$6, e, y1	# y1 = (e >> 6)			# S1
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22			# S0A
	and	e, y2		# y2 = (f^g)&e			# CH
	add	h, d		# d = k + w + h + d		# --

	vpslld	$(32-18), XTMP1, XTMP1
	and	b, y3		# y3 = (a|c)&b			# MAJA
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)	# S0

	vpxor	XTMP1, XTMP3, XTMP3
	rorx	$2, a, T1	# T1 = (a >> 2)			# S0
	xor	g, y2		# y2 = CH = ((f^g)&e)^g		# CH

	vpxor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a			# MAJB
	and	c, T1		# T1 = a&c			# MAJB
	add	y0, y2		# y2 = S1 + CH			# --

	vpxor	XTMP4, XTMP3, XTMP1	# XTMP1 = s0
	vpshufd	$0b11111010, X3, XTMP2	# XTMP2 = W[-2] {BBAA}
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)	# MAJ
	add	y1, h		# h = k + w + h + S0		# --

	vpaddd	XTMP1, XTMP0, XTMP0	# XTMP0 = W[-16] + W[-7] + s0
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0	# --
	add	y3, h		# h = t1 + S0 + MAJ		# --

	vpsrld	$10, XTMP2, XTMP4	# XTMP4 = W[-2] >> 10 {BBAA}


	ROTATE_ARGS

################################### RND N + 2 ############################

	mov	a, y3		# y3 = a			# MAJA
	rorx	$25, e, y0	# y0 = e >> 25			# S1A
	offset = \disp + 2*4
	addl	offset(%rsp, SRND), h	# h = k + w + h		# --

	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xBxA}
	rorx	$11, e, y1	# y1 = e >> 11			# S1B
	or	c, y3		# y3 = a|c			# MAJA
	mov	f, y2		# y2 = f			# CH
	xor	g, y2		# y2 = f^g			# CH

	rorx	$13, a, T1	# T1 = a >> 13			# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)	# S1
	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xBxA}
	and	e, y2		# y2 = (f^g)&e			# CH

	rorx	$6, e, y1	# y1 = (e >> 6)			# S1
	vpxor	XTMP3, XTMP2, XTMP2
	add	h, d		# d = k + w + h + d		# --
	and	b, y3		# y3 = (a|c)&b			# MAJA

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22			# S0A
	vpxor	XTMP2, XTMP4, XTMP4	# XTMP4 = s1 {xBxA}
	xor	g, y2		# y2 = CH = ((f^g)&e)^g		# CH

	vpshufb	SHUF_00BA, XTMP4, XTMP4	# XTMP4 = s1 {00BA}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)	# S0
	rorx	$2, a, T1	# T1 = (a >> 2)			# S0
	vpaddd	XTMP4, XTMP0, XTMP0	# XTMP0 = {..., ..., W[1], W[0]}

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a			# MAJB
	and	c, T1		# T1 = a&c			# MAJB
	add	y0, y2		# y2 = S1 + CH			# --
	vpshufd	$0b01010000, XTMP0, XTMP2	# XTMP2 = W[-2] {DDCC}

	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)	# MAJ
	add	y1, h		# h = k + w + h + S0		# --
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0	# --

	add	y3, h		# h = t1 + S0 + MAJ		# --


	ROTATE_ARGS

################################### RND N + 3 ############################

	mov	a, y3		# y3 = a			# MAJA
	rorx	$25, e, y0	# y0 = e >> 25			# S1A
	rorx	$11, e, y1	# y1 = e >> 11			# S1B
	offset = \disp + 3*4
	addl	offset(%rsp, SRND), h	# h = k + w + h		# --
	or	c, y3		# y3 = a|c			# MAJA


	vpsrld	$10, XTMP2, XTMP5	# XTMP5 = W[-2] >> 10 {DDCC}
	mov	f, y2		# y2 = f			# CH
	rorx	$13, a, T1	# T1 = a >> 13			# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)	# S1
	xor	g, y2		# y2 = f^g			# CH


	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xDxC}
	rorx	$6, e, y1	# y1 = (e >> 6)			# S1
	and	e, y2		# y2 = (f^g)&e			# CH
	add	h, d		# d = k + w + h + d		# --
	and	b, y3		# y3 = (a|c)&b			# MAJA

	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xDxC}
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	xor	g, y2		# y2 = CH = ((f^g)&e)^g		# CH

	vpxor	XTMP3, XTMP2, XTMP2
	rorx	$22, a, y1	# y1 = a >> 22			# S0A
	add	y0, y2		# y2 = S1 + CH			# --

	vpxor	XTMP2, XTMP5, XTMP5	# XTMP5 = s1 {xDxC}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)	# S0
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --

	rorx	$2, a, T1	# T1 = (a >> 2)			# S0
	vpshufb	SHUF_DC00, XTMP5, XTMP5	# XTMP5 = s1 {DC00}

	vpaddd	XTMP0, XTMP5, X0	# X0 = {W[3], W[2], W[1], W[0]}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a			# MAJB
	and	c, T1		# T1 = a&c			# MAJB
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)	# MAJ

	add	y1, h		# h = k + w + h + S0		# --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0	# --
	add	y3, h		# h = t1 + S0 + MAJ		# --

	ROTATE_ARGS
	rotate_Xs
.endm
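
# For reference, a scalar C sketch of the message-schedule recurrence the
# vector code above implements (a sketch only, not part of the build;
# ror32() is a 32-bit right-rotate):
#
#	s0   = ror32(w[i-15], 7) ^ ror32(w[i-15], 18) ^ (w[i-15] >> 3);
#	s1   = ror32(w[i-2], 17) ^ ror32(w[i-2], 19) ^ (w[i-2] >> 10);
#	w[i] = w[i-16] + s0 + w[i-7] + s1;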

.macro DO_4ROUNDS disp
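# Four plain rounds with no message scheduling: the W+K values were staged
# in the _XFER area earlier.  Note the final h updates of rounds N+0..N+2
# are deferred into the following round (via old_h) to shorten the
# dependency chain; round N+3 completes its own h at the end.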
################################### RND N + 0 ###########################

	mov	f, y2		# y2 = f			# CH
	rorx	$25, e, y0	# y0 = e >> 25			# S1A
	rorx	$11, e, y1	# y1 = e >> 11			# S1B
	xor	g, y2		# y2 = f^g			# CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)	# S1
	rorx	$6, e, y1	# y1 = (e >> 6)			# S1
	and	e, y2		# y2 = (f^g)&e			# CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13			# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g		# CH
	rorx	$22, a, y1	# y1 = a >> 22			# S0A
	mov	a, y3		# y3 = a			# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)	# S0
	rorx	$2, a, T1	# T1 = (a >> 2)			# S0
	addl	\disp(%rsp, SRND), h	# h = k + w + h		# --
	or	c, y3		# y3 = a|c			# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a			# MAJB
	and	b, y3		# y3 = (a|c)&b			# MAJA
	and	c, T1		# T1 = a&c			# MAJB
	add	y0, y2		# y2 = S1 + CH			# --


	add	h, d		# d = k + w + h + d		# --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)	# MAJ
	add	y1, h		# h = k + w + h + S0		# --
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --

	ROTATE_ARGS

################################### RND N + 1 ###########################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0	# --
	mov	f, y2		# y2 = f			# CH
	rorx	$25, e, y0	# y0 = e >> 25			# S1A
	rorx	$11, e, y1	# y1 = e >> 11			# S1B
	xor	g, y2		# y2 = f^g			# CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)	# S1
	rorx	$6, e, y1	# y1 = (e >> 6)			# S1
	and	e, y2		# y2 = (f^g)&e			# CH
	add	y3, old_h	# h = t1 + S0 + MAJ		# --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13			# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g		# CH
	rorx	$22, a, y1	# y1 = a >> 22			# S0A
	mov	a, y3		# y3 = a			# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)	# S0
	rorx	$2, a, T1	# T1 = (a >> 2)			# S0
	offset = 4*1 + \disp
	addl	offset(%rsp, SRND), h	# h = k + w + h		# --
	or	c, y3		# y3 = a|c			# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a			# MAJB
	and	b, y3		# y3 = (a|c)&b			# MAJA
	and	c, T1		# T1 = a&c			# MAJB
	add	y0, y2		# y2 = S1 + CH			# --


	add	h, d		# d = k + w + h + d		# --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)	# MAJ
	add	y1, h		# h = k + w + h + S0		# --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --

	ROTATE_ARGS

################################### RND N + 2 ###########################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0	# --
	mov	f, y2		# y2 = f			# CH
	rorx	$25, e, y0	# y0 = e >> 25			# S1A
	rorx	$11, e, y1	# y1 = e >> 11			# S1B
	xor	g, y2		# y2 = f^g			# CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)	# S1
	rorx	$6, e, y1	# y1 = (e >> 6)			# S1
	and	e, y2		# y2 = (f^g)&e			# CH
	add	y3, old_h	# h = t1 + S0 + MAJ		# --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13			# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g		# CH
	rorx	$22, a, y1	# y1 = a >> 22			# S0A
	mov	a, y3		# y3 = a			# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)	# S0
	rorx	$2, a, T1	# T1 = (a >> 2)			# S0
	offset = 4*2 + \disp
	addl	offset(%rsp, SRND), h	# h = k + w + h		# --
	or	c, y3		# y3 = a|c			# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a			# MAJB
	and	b, y3		# y3 = (a|c)&b			# MAJA
	and	c, T1		# T1 = a&c			# MAJB
	add	y0, y2		# y2 = S1 + CH			# --


	add	h, d		# d = k + w + h + d		# --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)	# MAJ
	add	y1, h		# h = k + w + h + S0		# --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --

	ROTATE_ARGS

################################### RND N + 3 ###########################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0	# --
	mov	f, y2		# y2 = f			# CH
	rorx	$25, e, y0	# y0 = e >> 25			# S1A
	rorx	$11, e, y1	# y1 = e >> 11			# S1B
	xor	g, y2		# y2 = f^g			# CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)	# S1
	rorx	$6, e, y1	# y1 = (e >> 6)			# S1
	and	e, y2		# y2 = (f^g)&e			# CH
	add	y3, old_h	# h = t1 + S0 + MAJ		# --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13			# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g		# CH
	rorx	$22, a, y1	# y1 = a >> 22			# S0A
	mov	a, y3		# y3 = a			# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)	# S0
	rorx	$2, a, T1	# T1 = (a >> 2)			# S0
	offset = 4*3 + \disp
	addl	offset(%rsp, SRND), h	# h = k + w + h		# --
	or	c, y3		# y3 = a|c			# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a			# MAJB
	and	b, y3		# y3 = (a|c)&b			# MAJA
	and	c, T1		# T1 = a&c			# MAJB
	add	y0, y2		# y2 = S1 + CH			# --


	add	h, d		# d = k + w + h + d		# --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)	# MAJ
	add	y1, h		# h = k + w + h + S0		# --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --


	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0	# --

	add	y3, h		# h = t1 + S0 + MAJ		# --

	ROTATE_ARGS

.endm
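
# For reference, a scalar C sketch of one round as computed above (a
# sketch only, not part of the build; ror32() is a 32-bit right-rotate):
#
#	S1  = ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25);
#	S0  = ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22);
#	ch  = ((f ^ g) & e) ^ g;		/* Ch(e,f,g)  */
#	maj = ((a | c) & b) | (a & c);		/* Maj(a,b,c) */
#	t1  = h + S1 + ch + k[i] + w[i];
#	h = g; g = f; f = e; e = d + t1;
#	d = c; c = b; b = a; a = t1 + S0 + maj;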

########################################################################
## void sha256_transform_rorx(struct sha256_state *state, const u8 *data, int blocks)
## arg 1 : pointer to state
## arg 2 : pointer to input data
## arg 3 : Num blocks
########################################################################
.text
SYM_FUNC_START(sha256_transform_rorx)
.align 32
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	mov	%rsp, %rax
	subq	$STACK_SIZE, %rsp
	and	$-32, %rsp	# align rsp to 32 byte boundary
	mov	%rax, _RSP(%rsp)
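	# The caller's %rsp is carried in %rax across the 32-byte
	# alignment above and stashed at _RSP so it can be restored
	# verbatim in done_hash; the alignment allows vmovdqa to be
	# used on the _XFER area.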


	shl	$6, NUM_BLKS	# convert to bytes
	jz	done_hash
	lea	-64(INP, NUM_BLKS), NUM_BLKS	# pointer to last block
	mov	NUM_BLKS, _INP_END(%rsp)

	cmp	NUM_BLKS, INP
	je	only_one_block
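	# _INP_END now points at the start of the final block.  When the
	# input is exactly one block, INP already equals it and the
	# two-block main loop is skipped entirely.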

	## load initial digest
	mov	(CTX), a
	mov	4*1(CTX), b
	mov	4*2(CTX), c
	mov	4*3(CTX), d
	mov	4*4(CTX), e
	mov	4*5(CTX), f
	mov	4*6(CTX), g
	mov	4*7(CTX), h

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00

	mov	CTX, _CTX(%rsp)

loop0:
	## Load first 16 dwords from two blocks
	VMOVDQ	0*32(INP),XTMP0
	VMOVDQ	1*32(INP),XTMP1
	VMOVDQ	2*32(INP),XTMP2
	VMOVDQ	3*32(INP),XTMP3

	## byte swap data
	vpshufb	BYTE_FLIP_MASK, XTMP0, XTMP0
	vpshufb	BYTE_FLIP_MASK, XTMP1, XTMP1
	vpshufb	BYTE_FLIP_MASK, XTMP2, XTMP2
	vpshufb	BYTE_FLIP_MASK, XTMP3, XTMP3

	## transpose data into high/low halves
	vperm2i128	$0x20, XTMP2, XTMP0, X0
	vperm2i128	$0x31, XTMP2, XTMP0, X1
	vperm2i128	$0x20, XTMP3, XTMP1, X2
	vperm2i128	$0x31, XTMP3, XTMP1, X3
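	# After the transpose, each X register pairs the two blocks
	# lane-wise, e.g. X0 = { block2 W[3:0] | block1 W[3:0] }, so
	# one vector op advances the schedule of both blocks.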

last_block_enter:
	add	$64, INP
	mov	INP, _INP(%rsp)

	## schedule 48 input dwords, by doing 3 iterations of 16 rounds each
	xor	SRND, SRND
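	# SRND is a byte offset that advances by 4*32 per loop1 pass;
	# it indexes both the K256 table and the _XFER save area.  It
	# aliases CTX, which is why CTX was saved to _CTX above.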

	.align 16
loop1:
	vpaddd	K256+0*32(SRND), X0, XFER
	vmovdqa	XFER, 0*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 0*32

	vpaddd	K256+1*32(SRND), X0, XFER
	vmovdqa	XFER, 1*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 1*32

	vpaddd	K256+2*32(SRND), X0, XFER
	vmovdqa	XFER, 2*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 2*32

	vpaddd	K256+3*32(SRND), X0, XFER
	vmovdqa	XFER, 3*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 3*32

	add	$4*32, SRND
	cmp	$3*4*32, SRND
	jb	loop1

loop2:
	## Do last 16 rounds with no scheduling
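	# X0/X1 hold the already-scheduled words for rounds 48-55; after
	# consuming them, X2/X3 are slid down so a second pass covers
	# rounds 56-63.  Each pass also stages block 2's W+K in _XFER.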
	vpaddd	K256+0*32(SRND), X0, XFER
	vmovdqa	XFER, 0*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	_XFER + 0*32

	vpaddd	K256+1*32(SRND), X1, XFER
	vmovdqa	XFER, 1*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	_XFER + 1*32
	add	$2*32, SRND

	vmovdqa	X2, X0
	vmovdqa	X3, X1

	cmp	$4*4*32, SRND
	jb	loop2

	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP

	addm	(4*0)(CTX),a
	addm	(4*1)(CTX),b
	addm	(4*2)(CTX),c
	addm	(4*3)(CTX),d
	addm	(4*4)(CTX),e
	addm	(4*5)(CTX),f
	addm	(4*6)(CTX),g
	addm	(4*7)(CTX),h

	cmp	_INP_END(%rsp), INP
	ja	done_hash

	#### Do second block using previously scheduled results
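	# Every 32-byte _XFER entry holds block 1's four W+K dwords in
	# its low 16 bytes and block 2's in the high 16, so the "+ 16"
	# in the displacements below replays the schedule for block 2.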
	xor	SRND, SRND
	.align 16
loop3:
	DO_4ROUNDS	_XFER + 0*32 + 16
	DO_4ROUNDS	_XFER + 1*32 + 16
	add	$2*32, SRND
	cmp	$4*4*32, SRND
	jb	loop3

	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP
	add	$64, INP

	addm	(4*0)(CTX),a
	addm	(4*1)(CTX),b
	addm	(4*2)(CTX),c
	addm	(4*3)(CTX),d
	addm	(4*4)(CTX),e
	addm	(4*5)(CTX),f
	addm	(4*6)(CTX),g
	addm	(4*7)(CTX),h

	cmp	_INP_END(%rsp), INP
	jb	loop0
	ja	done_hash

do_last_block:
	VMOVDQ	0*16(INP),XWORD0
	VMOVDQ	1*16(INP),XWORD1
	VMOVDQ	2*16(INP),XWORD2
	VMOVDQ	3*16(INP),XWORD3

	vpshufb	X_BYTE_FLIP_MASK, XWORD0, XWORD0
	vpshufb	X_BYTE_FLIP_MASK, XWORD1, XWORD1
	vpshufb	X_BYTE_FLIP_MASK, XWORD2, XWORD2
	vpshufb	X_BYTE_FLIP_MASK, XWORD3, XWORD3

	jmp	last_block_enter

only_one_block:

	## load initial digest
	mov	(4*0)(CTX),a
	mov	(4*1)(CTX),b
	mov	(4*2)(CTX),c
	mov	(4*3)(CTX),d
	mov	(4*4)(CTX),e
	mov	(4*5)(CTX),f
	mov	(4*6)(CTX),g
	mov	(4*7)(CTX),h

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00

	mov	CTX, _CTX(%rsp)
	jmp	do_last_block

done_hash:

	mov	_RSP(%rsp), %rsp

	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	ret
SYM_FUNC_END(sha256_transform_rorx)

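# The round constants are replicated across both 128-bit lanes of each
# 32-byte group so that a single vpaddd adds the same four K values to
# block 1 (low lane) and block 2 (high lane) at once; hence each set of
# four constants appears twice below.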
.section	.rodata.cst512.K256, "aM", @progbits, 512
.align 64
K256:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

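# PSHUFFLE_BYTE_FLIP_MASK reverses the bytes within each dword, turning
# the little-endian loads into the big-endian word order SHA-256 expects.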
.section	.rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
.align 32
PSHUFFLE_BYTE_FLIP_MASK:
	.octa	0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203

# shuffle xBxA -> 00BA
.section	.rodata.cst32._SHUF_00BA, "aM", @progbits, 32
.align 32
_SHUF_00BA:
	.octa	0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100

# shuffle xDxC -> DC00
.section	.rodata.cst32._SHUF_DC00, "aM", @progbits, 32
.align 32
_SHUF_DC00:
	.octa	0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF