^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1) ########################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2) # Implement fast SHA-512 with AVX2 instructions. (x86_64)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4) # Copyright (C) 2013 Intel Corporation.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6) # Authors:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7) # James Guilford <james.guilford@intel.com>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8) # Kirk Yap <kirk.s.yap@intel.com>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9) # David Cote <david.m.cote@intel.com>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 10) # Tim Chen <tim.c.chen@linux.intel.com>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 11) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 12) # This software is available to you under a choice of one of two
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 13) # licenses. You may choose to be licensed under the terms of the GNU
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 14) # General Public License (GPL) Version 2, available from the file
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 15) # COPYING in the main directory of this source tree, or the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 16) # OpenIB.org BSD license below:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 17) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 18) # Redistribution and use in source and binary forms, with or
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 19) # without modification, are permitted provided that the following
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 20) # conditions are met:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 21) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 22) # - Redistributions of source code must retain the above
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 23) # copyright notice, this list of conditions and the following
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 24) # disclaimer.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 25) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 26) # - Redistributions in binary form must reproduce the above
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 27) # copyright notice, this list of conditions and the following
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 28) # disclaimer in the documentation and/or other materials
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 29) # provided with the distribution.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 30) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 31) # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 32) # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 33) # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 34) # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 35) # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 36) # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 37) # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 38) # SOFTWARE.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 39) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 40) ########################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 41) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 42) # This code is described in an Intel White-Paper:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 43) # "Fast SHA-512 Implementations on Intel Architecture Processors"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 44) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 45) # To find it, surf to http://www.intel.com/p/en_US/embedded
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 46) # and search for that title.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 47) #
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 48) ########################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 49) # This code schedules 1 block at a time, with 4 lanes per block
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 50) ########################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 51)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 52) #include <linux/linkage.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 53)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 54) .text
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 55)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 56) # Virtual Registers
# These are assembler symbols bound to physical registers. The macros
# rotate_Ys and RotateState rebind Y_0..Y_3 and a..h at assembly time,
# so the bindings listed here are only the initial ones.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 57) Y_0 = %ymm4 # schedule qwords W[-16..-13] on entry to FOUR_ROUNDS_AND_SCHED
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 58) Y_1 = %ymm5 # W[-12..-9]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 59) Y_2 = %ymm6 # W[-8..-5]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 60) Y_3 = %ymm7 # W[-4..-1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 61)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 62) YTMP0 = %ymm0 # vector scratch
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 63) YTMP1 = %ymm1 # vector scratch
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 64) YTMP2 = %ymm2 # vector scratch
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 65) YTMP3 = %ymm3 # vector scratch
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 66) YTMP4 = %ymm8 # vector scratch
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 67) XFER = YTMP0 # same register as YTMP0 (%ymm0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 68)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 69) BYTE_FLIP_MASK = %ymm9 # presumably the vpshufb mask for COPY_YMM_AND_BSWAP - usage not in view
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 70)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 71) # 1st arg is %rdi, which is saved to the stack and accessed later via %r12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 72) CTX1 = %rdi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 73) CTX2 = %r12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 74) # 2nd arg
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 75) INP = %rsi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 76) # 3rd arg
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 77) NUM_BLKS = %rdx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 78)
# Working variables and scalar scratch. Note that e and y3 reuse the
# registers of NUM_BLKS and INP, so those arguments are overwritten once
# the round macros run (NOTE(review): presumably they are spilled to
# frame_INP / frame_INPEND first - the spill code is not in view).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 79) c = %rcx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 80) d = %r8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 81) e = %rdx # same register as NUM_BLKS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 82) y3 = %rsi # same register as INP
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 83)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 84) TBL = %rdi # clobbers CTX1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 85)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 86) a = %rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 87) b = %rbx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 88)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 89) f = %r9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 90) g = %r10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 91) h = %r11
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 92) old_h = %r11 # starts equal to h; RotateState re-points it at the pre-rotation h
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 93)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 94) T1 = %r12 # clobbers CTX2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 95) y0 = %r13 # scalar scratch (S1 in the round macros)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 96) y1 = %r14 # scalar scratch (S1 partials, then S0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 97) y2 = %r15 # scalar scratch (CH, then S1 + CH)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 98)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 99) # Local variables (stack frame)
# Slot sizes in bytes, followed by the byte offset of each slot from the
# frame base. The round macros address the frame relative to %rsp.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 100) XFER_SIZE = 4*8 # four qwords, read by the round macros at frame_XFER + 0/8/16/24
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 101) SRND_SIZE = 1*8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 102) INP_SIZE = 1*8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 103) INPEND_SIZE = 1*8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 104) CTX_SIZE = 1*8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 105) RSPSAVE_SIZE = 1*8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 106) GPRSAVE_SIZE = 5*8 # five qword slots (presumably rbx, r12-r15 - save code not in view)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 107)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 108) frame_XFER = 0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 109) frame_SRND = frame_XFER + XFER_SIZE # = 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 110) frame_INP = frame_SRND + SRND_SIZE # = 40
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 111) frame_INPEND = frame_INP + INP_SIZE # = 48
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 112) frame_CTX = frame_INPEND + INPEND_SIZE # = 56
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 113) frame_RSPSAVE = frame_CTX + CTX_SIZE # = 64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 114) frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE # = 72
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 115) frame_size = frame_GPRSAVE + GPRSAVE_SIZE # = 112 bytes in total
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 116)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 117) ## Buffers are not assumed to be aligned, so VMOVDQ maps to the unaligned move
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 118) #define VMOVDQ vmovdqu
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 119)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 120) # addm [mem], reg
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 121) # Add reg to mem using reg-mem add and store
# p1 is the memory operand and p2 the register. On exit both hold the
# sum p1 + p2. The add instruction updates the arithmetic flags.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 122) .macro addm p1 p2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 123) add \p1, \p2 # reg = reg + mem (AT&T order: source first)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 124) mov \p2, \p1 # mem = reg, i.e. store the sum back
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 125) .endm
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 126)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 127)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 128) # COPY_YMM_AND_BSWAP ymm, [mem], byte_flip_mask
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 129) # Load ymm with mem and byte swap it according to byte_flip_mask
# p1 = destination ymm, p2 = memory source, p3 = vpshufb control mask.
# NOTE(review): this header used to say "each dword"; the actual swap
# granularity is whatever the mask in p3 encodes, and SHA-512 words are
# 64-bit - confirm against the mask definition, which is not in view.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 130) .macro COPY_YMM_AND_BSWAP p1 p2 p3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 131) VMOVDQ \p2, \p1 # VMOVDQ is vmovdqu (see the define), so mem may be unaligned
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 132) vpshufb \p3, \p1, \p1 # permute bytes in place, within each 128-bit lane
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 133) .endm
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 134) # rotate_Ys
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 135) # Rotate values of symbols Y0...Y3
# Assembly-time renaming only: no instructions are emitted. Afterwards
# Y_0..Y_2 name the registers formerly called Y_1..Y_3, and Y_3 names the
# register formerly called Y_0.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 136) .macro rotate_Ys
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 137) Y_ = Y_0 # temporary symbol holding the old Y_0 binding
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 138) Y_0 = Y_1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 139) Y_1 = Y_2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 140) Y_2 = Y_3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 141) Y_3 = Y_ # old Y_0 register becomes the newest slot
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 142) .endm
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 143)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 144) # RotateState
# Assembly-time renaming only: no instructions are emitted. Each of b..h
# takes over the register of its predecessor (b gets the old a, ..., h gets
# the old g) and a takes over the register that was h.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 145) .macro RotateState
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 146) # Rotate symbols a..h right
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 147) old_h = h # keep a name for the pre-rotation h; DO_4ROUNDS finishes its sums through old_h
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 148) TMP_ = h # temporary symbol holding the old h binding
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 149) h = g
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 150) g = f
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 151) f = e
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 152) e = d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 153) d = c
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 154) c = b
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 155) b = a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 156) a = TMP_ # old h register becomes a, so old_h and a now name the same register
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 157) .endm
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 158)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 159) # macro MY_VPALIGNR YDST, YSRC1, YSRC2, RVAL
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 160) # YDST = {YSRC1, YSRC2} >> RVAL*8
# Byte-wise right shift of the 512-bit concatenation (YSRC1 high, YSRC2 low),
# keeping the low 256 bits. Two steps are needed because vpalignr works on
# each 128-bit lane separately. RVAL is a byte count given without the
# leading dollar sign. YDST is written before YSRC2 is last read, so YDST
# must not be the same register as YSRC2.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 161) .macro MY_VPALIGNR YDST YSRC1 YSRC2 RVAL
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 162) vperm2f128 $0x3, \YSRC2, \YSRC1, \YDST # YDST = {YS1_LO, YS2_HI}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 163) vpalignr $\RVAL, \YSRC2, \YDST, \YDST # YDST = {YDST, YS2} >> RVAL*8, per 128-bit lane
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 164) .endm
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 165)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 166) .macro FOUR_ROUNDS_AND_SCHED
# Run four SHA-512 rounds on the working variables a..h and, mixed in with
# them, compute the next four message-schedule qwords into Y_0.
#
# Inputs:  Y_0..Y_3 hold the previous sixteen schedule qwords (Y_0 oldest).
#          frame_XFER(%rsp) + 0/8/16/24 supply the per-round addend that the
#          round comments call k + w (stored by code outside this macro).
# Outputs: a..h advanced by four rounds. Y_0 is overwritten with the four
#          new schedule qwords and then renamed to Y_3 by rotate_Ys.
# Scratch: y0, y1, y2, y3, T1, YTMP0..YTMP4 (YTMP0 is also XFER) and the
#          arithmetic flags.
#
# In the round comments ">>" on a rorx result means rotate right.
# Scalar round code and vector schedule code are mixed together, presumably
# for instruction-level parallelism; keep the instruction order when editing.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 167) ################################### RND N + 0 #########################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 168)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 169) # Extract w[t-7]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 170) MY_VPALIGNR YTMP0, Y_3, Y_2, 8 # YTMP0 = W[-7]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 171) # Calculate w[t-16] + w[t-7]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 172) vpaddq Y_0, YTMP0, YTMP0 # YTMP0 = W[-7] + W[-16]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 173) # Extract w[t-15]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 174) MY_VPALIGNR YTMP1, Y_1, Y_0, 8 # YTMP1 = W[-15]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 175)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 176) # Calculate sigma0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 177)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 178) # Calculate w[t-15] ror 1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 179) vpsrlq $1, YTMP1, YTMP2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 180) vpsllq $(64-1), YTMP1, YTMP3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 181) vpor YTMP2, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 182) # Calculate w[t-15] shr 7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 183) vpsrlq $7, YTMP1, YTMP4 # YTMP4 = W[-15] >> 7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 184)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 185) mov a, y3 # y3 = a # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 186) rorx $41, e, y0 # y0 = e >> 41 # S1A
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 187) rorx $18, e, y1 # y1 = e >> 18 # S1B
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 188) add frame_XFER(%rsp),h # h = k + w + h # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 189) or c, y3 # y3 = a|c # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 190) mov f, y2 # y2 = f # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 191) rorx $34, a, T1 # T1 = a >> 34 # S0B
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 192)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 193) xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 194) xor g, y2 # y2 = f^g # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 195) rorx $14, e, y1 # y1 = (e >> 14) # S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 196)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 197) and e, y2 # y2 = (f^g)&e # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 198) xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 199) rorx $39, a, y1 # y1 = a >> 39 # S0A
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 200) add h, d # d = k + w + h + d # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 201)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 202) and b, y3 # y3 = (a|c)&b # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 203) xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 204) rorx $28, a, T1 # T1 = (a >> 28) # S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 205)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 206) xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 207) xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 208) mov a, T1 # T1 = a # MAJB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 209) and c, T1 # T1 = a&c # MAJB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 210)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 211) add y0, y2 # y2 = S1 + CH # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 212) or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 213) add y1, h # h = k + w + h + S0 # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 214)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 215) add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 216)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 217) add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 218) add y3, h # h = t1 + S0 + MAJ # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 219)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 220) RotateState
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 221)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 222) ################################### RND N + 1 #########################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 223)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 224) # Calculate w[t-15] ror 8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 225) vpsrlq $8, YTMP1, YTMP2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 226) vpsllq $(64-8), YTMP1, YTMP1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 227) vpor YTMP2, YTMP1, YTMP1 # YTMP1 = W[-15] ror 8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 228) # XOR the three components
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 229) vpxor YTMP4, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1 ^ W[-15] >> 7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 230) vpxor YTMP1, YTMP3, YTMP1 # YTMP1 = s0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 231)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 232)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 233) # Add three components, w[t-16], w[t-7] and sigma0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 234) vpaddq YTMP1, YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 235) # Move to appropriate lanes for calculating w[16] and w[17]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 236) vperm2f128 $0x0, YTMP0, YTMP0, Y_0 # Y_0 = W[-16] + W[-7] + s0 {BABA}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 237) # Move to appropriate lanes for calculating w[18] and w[19]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 238) vpand MASK_YMM_LO(%rip), YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0 {DC00}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 239)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 240) # Calculate w[16] and w[17] in both 128 bit lanes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 241)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 242) # Calculate sigma1 for w[16] and w[17] on both 128 bit lanes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 243) vperm2f128 $0x11, Y_3, Y_3, YTMP2 # YTMP2 = W[-2] {BABA}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 244) vpsrlq $6, YTMP2, YTMP4 # YTMP4 = W[-2] >> 6 {BABA}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 245)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 246)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 247) mov a, y3 # y3 = a # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 248) rorx $41, e, y0 # y0 = e >> 41 # S1A
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 249) rorx $18, e, y1 # y1 = e >> 18 # S1B
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 250) add 1*8+frame_XFER(%rsp), h # h = k + w + h # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 251) or c, y3 # y3 = a|c # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 252)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 253)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 254) mov f, y2 # y2 = f # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 255) rorx $34, a, T1 # T1 = a >> 34 # S0B
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 256) xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 257) xor g, y2 # y2 = f^g # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 258)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 259)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 260) rorx $14, e, y1 # y1 = (e >> 14) # S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 261) xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 262) rorx $39, a, y1 # y1 = a >> 39 # S0A
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 263) and e, y2 # y2 = (f^g)&e # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 264) add h, d # d = k + w + h + d # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 265)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 266) and b, y3 # y3 = (a|c)&b # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 267) xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 268)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 269) rorx $28, a, T1 # T1 = (a >> 28) # S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 270) xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 271)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 272) xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 273) mov a, T1 # T1 = a # MAJB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 274) and c, T1 # T1 = a&c # MAJB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 275) add y0, y2 # y2 = S1 + CH # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 276)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 277) or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 278) add y1, h # h = k + w + h + S0 # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 279)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 280) add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 281) add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 282) add y3, h # h = t1 + S0 + MAJ # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 283)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 284) RotateState
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 285)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 286)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 287) ################################### RND N + 2 #########################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 288)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 289) vpsrlq $19, YTMP2, YTMP3 # YTMP3 = W[-2] >> 19 {BABA}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 290) vpsllq $(64-19), YTMP2, YTMP1 # YTMP1 = W[-2] << (64-19) {BABA}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 291) vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {BABA}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 292) vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 293) vpsrlq $61, YTMP2, YTMP3 # YTMP3 = W[-2] >> 61 {BABA}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 294) vpsllq $(64-61), YTMP2, YTMP1 # YTMP1 = W[-2] << (64-61) {BABA}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 295) vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {BABA}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 296) vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 297) # (W[-2] ror 61) ^ (W[-2] >> 6) {BABA}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 298)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 299) # Add sigma1 to the other components to get w[16] and w[17]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 300) vpaddq YTMP4, Y_0, Y_0 # Y_0 = {W[1], W[0], W[1], W[0]}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 301)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 302) # Calculate sigma1 for w[18] and w[19] for upper 128 bit lane
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 303) vpsrlq $6, Y_0, YTMP4 # YTMP4 = W[-2] >> 6 {DC--}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 304)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 305) mov a, y3 # y3 = a # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 306) rorx $41, e, y0 # y0 = e >> 41 # S1A
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 307) add 2*8+frame_XFER(%rsp), h # h = k + w + h # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 308)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 309) rorx $18, e, y1 # y1 = e >> 18 # S1B
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 310) or c, y3 # y3 = a|c # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 311) mov f, y2 # y2 = f # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 312) xor g, y2 # y2 = f^g # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 313)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 314) rorx $34, a, T1 # T1 = a >> 34 # S0B
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 315) xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 316) and e, y2 # y2 = (f^g)&e # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 317)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 318) rorx $14, e, y1 # y1 = (e >> 14) # S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 319) add h, d # d = k + w + h + d # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 320) and b, y3 # y3 = (a|c)&b # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 321)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 322) xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 323) rorx $39, a, y1 # y1 = a >> 39 # S0A
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 324) xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 325)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 326) xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 327) rorx $28, a, T1 # T1 = (a >> 28) # S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 328)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 329) xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 330) mov a, T1 # T1 = a # MAJB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 331) and c, T1 # T1 = a&c # MAJB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 332) add y0, y2 # y2 = S1 + CH # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 333)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 334) or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 335) add y1, h # h = k + w + h + S0 # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 336) add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 337) add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 338)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 339) add y3, h # h = t1 + S0 + MAJ # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 340)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 341) RotateState
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 342)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 343) ################################### RND N + 3 #########################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 344)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 345) vpsrlq $19, Y_0, YTMP3 # YTMP3 = W[-2] >> 19 {DC--}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 346) vpsllq $(64-19), Y_0, YTMP1 # YTMP1 = W[-2] << (64-19) {DC--}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 347) vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {DC--}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 348) vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 349) vpsrlq $61, Y_0, YTMP3 # YTMP3 = W[-2] >> 61 {DC--}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 350) vpsllq $(64-61), Y_0, YTMP1 # YTMP1 = W[-2] << (64-61) {DC--}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 351) vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {DC--}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 352) vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 353) # (W[-2] ror 61) ^ (W[-2] >> 6) {DC--}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 354)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 355) # Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 356) # to newly calculated sigma1 to get w[18] and w[19]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 357) vpaddq YTMP4, YTMP0, YTMP2 # YTMP2 = {W[3], W[2], --, --}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 358)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 359) # Form w[19], w[18], w[17], w[16]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 360) vpblendd $0xF0, YTMP2, Y_0, Y_0 # Y_0 = {W[3], W[2], W[1], W[0]}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 361)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 362) mov a, y3 # y3 = a # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 363) rorx $41, e, y0 # y0 = e >> 41 # S1A
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 364) rorx $18, e, y1 # y1 = e >> 18 # S1B
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 365) add 3*8+frame_XFER(%rsp), h # h = k + w + h # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 366) or c, y3 # y3 = a|c # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 367)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 368)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 369) mov f, y2 # y2 = f # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 370) rorx $34, a, T1 # T1 = a >> 34 # S0B
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 371) xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 372) xor g, y2 # y2 = f^g # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 373)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 374)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 375) rorx $14, e, y1 # y1 = (e >> 14) # S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 376) and e, y2 # y2 = (f^g)&e # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 377) add h, d # d = k + w + h + d # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 378) and b, y3 # y3 = (a|c)&b # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 379)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 380) xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 381) xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 382)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 383) rorx $39, a, y1 # y1 = a >> 39 # S0A
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 384) add y0, y2 # y2 = S1 + CH # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 385)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 386) xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 387) add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 388)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 389) rorx $28, a, T1 # T1 = (a >> 28) # S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 390)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 391) xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 392) mov a, T1 # T1 = a # MAJB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 393) and c, T1 # T1 = a&c # MAJB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 394) or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 395)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 396) add y1, h # h = k + w + h + S0 # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 397) add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 398) add y3, h # h = t1 + S0 + MAJ # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 399)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 400) RotateState
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 401)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 402) rotate_Ys
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 403) .endm
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 404)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 405) .macro DO_4ROUNDS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 406)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 407) ################################### RND N + 0 #########################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 408)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 409) mov f, y2 # y2 = f # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 410) rorx $41, e, y0 # y0 = e >> 41 # S1A
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 411) rorx $18, e, y1 # y1 = e >> 18 # S1B
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 412) xor g, y2 # y2 = f^g # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 413)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 414) xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 415) rorx $14, e, y1 # y1 = (e >> 14) # S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 416) and e, y2 # y2 = (f^g)&e # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 417)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 418) xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 419) rorx $34, a, T1 # T1 = a >> 34 # S0B
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 420) xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 421) rorx $39, a, y1 # y1 = a >> 39 # S0A
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 422) mov a, y3 # y3 = a # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 423)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 424) xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 425) rorx $28, a, T1 # T1 = (a >> 28) # S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 426) add frame_XFER(%rsp), h # h = k + w + h # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 427) or c, y3 # y3 = a|c # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 428)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 429) xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 430) mov a, T1 # T1 = a # MAJB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 431) and b, y3 # y3 = (a|c)&b # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 432) and c, T1 # T1 = a&c # MAJB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 433) add y0, y2 # y2 = S1 + CH # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 434)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 435) add h, d # d = k + w + h + d # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 436) or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 437) add y1, h # h = k + w + h + S0 # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 438)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 439) add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 440)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 441) RotateState
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 442)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 443) ################################### RND N + 1 #########################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 444)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 445) add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 446) mov f, y2 # y2 = f # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 447) rorx $41, e, y0 # y0 = e >> 41 # S1A
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 448) rorx $18, e, y1 # y1 = e >> 18 # S1B
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 449) xor g, y2 # y2 = f^g # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 450)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 451) xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 452) rorx $14, e, y1 # y1 = (e >> 14) # S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 453) and e, y2 # y2 = (f^g)&e # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 454) add y3, old_h # h = t1 + S0 + MAJ # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 455)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 456) xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 457) rorx $34, a, T1 # T1 = a >> 34 # S0B
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 458) xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 459) rorx $39, a, y1 # y1 = a >> 39 # S0A
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 460) mov a, y3 # y3 = a # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 461)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 462) xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 463) rorx $28, a, T1 # T1 = (a >> 28) # S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 464) add 8*1+frame_XFER(%rsp), h # h = k + w + h # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 465) or c, y3 # y3 = a|c # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 466)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 467) xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 468) mov a, T1 # T1 = a # MAJB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 469) and b, y3 # y3 = (a|c)&b # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 470) and c, T1 # T1 = a&c # MAJB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 471) add y0, y2 # y2 = S1 + CH # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 472)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 473) add h, d # d = k + w + h + d # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 474) or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 475) add y1, h # h = k + w + h + S0 # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 476)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 477) add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 478)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 479) RotateState
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 480)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 481) ################################### RND N + 2 #########################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 482)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 483) add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 484) mov f, y2 # y2 = f # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 485) rorx $41, e, y0 # y0 = e >> 41 # S1A
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 486) rorx $18, e, y1 # y1 = e >> 18 # S1B
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 487) xor g, y2 # y2 = f^g # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 488)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 489) xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 490) rorx $14, e, y1 # y1 = (e >> 14) # S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 491) and e, y2 # y2 = (f^g)&e # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 492) add y3, old_h # h = t1 + S0 + MAJ # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 493)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 494) xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 495) rorx $34, a, T1 # T1 = a >> 34 # S0B
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 496) xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 497) rorx $39, a, y1 # y1 = a >> 39 # S0A
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 498) mov a, y3 # y3 = a # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 499)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 500) xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 501) rorx $28, a, T1 # T1 = (a >> 28) # S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 502) add 8*2+frame_XFER(%rsp), h # h = k + w + h # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 503) or c, y3 # y3 = a|c # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 504)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 505) xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 506) mov a, T1 # T1 = a # MAJB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 507) and b, y3 # y3 = (a|c)&b # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 508) and c, T1 # T1 = a&c # MAJB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 509) add y0, y2 # y2 = S1 + CH # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 510)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 511) add h, d # d = k + w + h + d # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 512) or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 513) add y1, h # h = k + w + h + S0 # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 514)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 515) add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 516)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 517) RotateState
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 518)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 519) ################################### RND N + 3 #########################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 520)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 521) add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 522) mov f, y2 # y2 = f # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 523) rorx $41, e, y0 # y0 = e >> 41 # S1A
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 524) rorx $18, e, y1 # y1 = e >> 18 # S1B
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 525) xor g, y2 # y2 = f^g # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 526)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 527) xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 528) rorx $14, e, y1 # y1 = (e >> 14) # S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 529) and e, y2 # y2 = (f^g)&e # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 530) add y3, old_h # h = t1 + S0 + MAJ # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 531)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 532) xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 533) rorx $34, a, T1 # T1 = a >> 34 # S0B
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 534) xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 535) rorx $39, a, y1 # y1 = a >> 39 # S0A
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 536) mov a, y3 # y3 = a # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 537)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 538) xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 539) rorx $28, a, T1 # T1 = (a >> 28) # S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 540) add 8*3+frame_XFER(%rsp), h # h = k + w + h # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 541) or c, y3 # y3 = a|c # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 542)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 543) xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 544) mov a, T1 # T1 = a # MAJB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 545) and b, y3 # y3 = (a|c)&b # MAJA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 546) and c, T1 # T1 = a&c # MAJB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 547) add y0, y2 # y2 = S1 + CH # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 548)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 549)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 550) add h, d # d = k + w + h + d # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 551) or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 552) add y1, h # h = k + w + h + S0 # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 553)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 554) add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 555)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 556) add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 557)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 558) add y3, h # h = t1 + S0 + MAJ # --
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 559)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 560) RotateState
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 561)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 562) .endm
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 563)
########################################################################
# void sha512_transform_rorx(sha512_state *state, const u8 *data, int blocks)
# Purpose: Updates the SHA512 digest stored at "state" with the message
# stored in "data".
# The size of the message pointed to by "data" must be an integer multiple
# of SHA512 message blocks (128 bytes each).
# "blocks" is the message length in SHA512 blocks; 0 is accepted and
# leaves "state" untouched.
#
# Preserves:	%rbx, %r12-%r15 and %rsp (saved in / restored from the
#		local frame).
# On return:	the upper halves of the YMM registers are cleared with
#		vzeroupper, so that SSE code executed afterwards does not
#		pay the AVX->SSE transition penalty.
#
# NOTE(review): the C prototype declares "blocks" as int, while the byte
# count below is computed on NUM_BLKS as a whole register.  If NUM_BLKS is
# a 64-bit register, the caller is relied upon to pass a clean upper
# half -- confirm against the register definitions earlier in the file.
########################################################################
SYM_FUNC_START(sha512_transform_rorx)
	# Allocate Stack Space
	# The original %rsp is kept in %rax until it can be stored in the new
	# frame.  The frame is aligned down to 32 bytes because the round
	# input is written to it with aligned 256-bit stores (vmovdqa).
	mov	%rsp, %rax
	sub	$frame_size, %rsp
	and	$~(0x20 - 1), %rsp
	mov	%rax, frame_RSPSAVE(%rsp)

	# Save GPRs
	mov	%rbx, 8*0+frame_GPRSAVE(%rsp)
	mov	%r12, 8*1+frame_GPRSAVE(%rsp)
	mov	%r13, 8*2+frame_GPRSAVE(%rsp)
	mov	%r14, 8*3+frame_GPRSAVE(%rsp)
	mov	%r15, 8*4+frame_GPRSAVE(%rsp)

	shl	$7, NUM_BLKS			# convert to bytes (blocks * 128)
	jz	done_hash			# nothing to hash
	add	INP, NUM_BLKS			# pointer to end of data
	mov	NUM_BLKS, frame_INPEND(%rsp)

	## load initial digest
	mov	8*0(CTX1), a
	mov	8*1(CTX1), b
	mov	8*2(CTX1), c
	mov	8*3(CTX1), d
	mov	8*4(CTX1), e
	mov	8*5(CTX1), f
	mov	8*6(CTX1), g
	mov	8*7(CTX1), h

	# save %rdi (CTX) before it gets clobbered
	mov	%rdi, frame_CTX(%rsp)

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK

	# ---- outer loop: one iteration per 128-byte message block ----
loop0:
	lea	K512(%rip), TBL			# restart at K[0] for every block

	## load and byte swap the 16 message qwords of this block (4 x 32 bytes)
	COPY_YMM_AND_BSWAP	Y_0, (INP), BYTE_FLIP_MASK
	COPY_YMM_AND_BSWAP	Y_1, 1*32(INP), BYTE_FLIP_MASK
	COPY_YMM_AND_BSWAP	Y_2, 2*32(INP), BYTE_FLIP_MASK
	COPY_YMM_AND_BSWAP	Y_3, 3*32(INP), BYTE_FLIP_MASK

	mov	INP, frame_INP(%rsp)		# reloaded after the rounds below

	## rounds 0..63: 4 iterations x 4 FOUR_ROUNDS_AND_SCHED = 64 rounds.
	## Each step stores K[t..t+3] + W[t..t+3] to frame_XFER for the
	## scalar round code.  Every step reads Y_0, so FOUR_ROUNDS_AND_SCHED
	## (defined earlier in the file) is expected to rotate Y_0..Y_3 and
	## extend the message schedule -- confirm against the macro.
	movq	$4, frame_SRND(%rsp)

.align 16
loop1:
	vpaddq	(TBL), Y_0, XFER
	vmovdqa	XFER, frame_XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddq	1*32(TBL), Y_0, XFER
	vmovdqa	XFER, frame_XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddq	2*32(TBL), Y_0, XFER
	vmovdqa	XFER, frame_XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddq	3*32(TBL), Y_0, XFER
	vmovdqa	XFER, frame_XFER(%rsp)
	add	$(4*32), TBL			# advance to the next 16 constants
	FOUR_ROUNDS_AND_SCHED

	subq	$1, frame_SRND(%rsp)
	jne	loop1

	## rounds 64..79: 2 iterations x 2 DO_4ROUNDS = 16 rounds, no further
	## scheduling.  The first pass consumes Y_0/Y_1, the second pass
	## consumes what was in Y_2/Y_3.
	movq	$2, frame_SRND(%rsp)
loop2:
	vpaddq	(TBL), Y_0, XFER
	vmovdqa	XFER, frame_XFER(%rsp)
	DO_4ROUNDS
	vpaddq	1*32(TBL), Y_1, XFER
	vmovdqa	XFER, frame_XFER(%rsp)
	add	$(2*32), TBL			# advance to the next 8 constants
	DO_4ROUNDS

	vmovdqa	Y_2, Y_0
	vmovdqa	Y_3, Y_1

	subq	$1, frame_SRND(%rsp)
	jne	loop2

	## fold the working variables back into the digest at "state"
	mov	frame_CTX(%rsp), CTX2
	addm	8*0(CTX2), a
	addm	8*1(CTX2), b
	addm	8*2(CTX2), c
	addm	8*3(CTX2), d
	addm	8*4(CTX2), e
	addm	8*5(CTX2), f
	addm	8*6(CTX2), g
	addm	8*7(CTX2), h

	## advance to the next block; stop when the end pointer is reached
	mov	frame_INP(%rsp), INP
	add	$128, INP
	cmp	frame_INPEND(%rsp), INP
	jne	loop0

done_hash:

	# Restore GPRs
	mov	8*0+frame_GPRSAVE(%rsp), %rbx
	mov	8*1+frame_GPRSAVE(%rsp), %r12
	mov	8*2+frame_GPRSAVE(%rsp), %r13
	mov	8*3+frame_GPRSAVE(%rsp), %r14
	mov	8*4+frame_GPRSAVE(%rsp), %r15

	# Restore Stack Pointer
	mov	frame_RSPSAVE(%rsp), %rsp

	# This function dirtied the upper halves of the YMM registers; clear
	# them so that SSE code running after we return is not slowed down.
	# (Harmless on the blocks == 0 path, where no YMM register was used.)
	vzeroupper
	ret
SYM_FUNC_END(sha512_transform_rorx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 686)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 687) ########################################################################
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 688) ### Binary Data
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 689)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 690)
# K512 is placed in its own mergeable read-only section whose entity size
# equals the size of the whole table (80 constants * 8 bytes = 640 bytes).
# That lets the linker fold it together with a byte-identical 640-byte
# table coming from another rodata section, should one exist.
	.section	.rodata.cst640.K512, "aM", @progbits, 640
	.align 64
# SHA-512 round constants K[0..79].
# Each line below is one 32-byte row (four constants), i.e. exactly what a
# single "vpaddq n*32(TBL), Y_x, XFER" in sha512_transform_rorx consumes.
K512:
	.quad	0x428a2f98d728ae22,0x7137449123ef65cd,0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc	# K[ 0.. 3]
	.quad	0x3956c25bf348b538,0x59f111f1b605d019,0x923f82a4af194f9b,0xab1c5ed5da6d8118	# K[ 4.. 7]
	.quad	0xd807aa98a3030242,0x12835b0145706fbe,0x243185be4ee4b28c,0x550c7dc3d5ffb4e2	# K[ 8..11]
	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1,0x9bdc06a725c71235,0xc19bf174cf692694	# K[12..15]
	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3,0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65	# K[16..19]
	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483,0x5cb0a9dcbd41fbd4,0x76f988da831153b5	# K[20..23]
	.quad	0x983e5152ee66dfab,0xa831c66d2db43210,0xb00327c898fb213f,0xbf597fc7beef0ee4	# K[24..27]
	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725,0x06ca6351e003826f,0x142929670a0e6e70	# K[28..31]
	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926,0x4d2c6dfc5ac42aed,0x53380d139d95b3df	# K[32..35]
	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8,0x81c2c92e47edaee6,0x92722c851482353b	# K[36..39]
	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001,0xc24b8b70d0f89791,0xc76c51a30654be30	# K[40..43]
	.quad	0xd192e819d6ef5218,0xd69906245565a910,0xf40e35855771202a,0x106aa07032bbd1b8	# K[44..47]
	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53,0x2748774cdf8eeb99,0x34b0bcb5e19b48a8	# K[48..51]
	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb,0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3	# K[52..55]
	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60,0x84c87814a1f0ab72,0x8cc702081a6439ec	# K[56..59]
	.quad	0x90befffa23631e28,0xa4506cebde82bde9,0xbef9a3f7b2c67915,0xc67178f2e372532b	# K[60..63]
	.quad	0xca273eceea26619c,0xd186b8c721c0c207,0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178	# K[64..67]
	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6,0x113f9804bef90dae,0x1b710b35131c471b	# K[68..71]
	.quad	0x28db77f523047d84,0x32caab7b40c72493,0x3c9ebe0a15c9bebc,0x431d67c49c100d4c	# K[72..75]
	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a,0x5fcb6fab3ad6faec,0x6c44198c4a475817	# K[76..79]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 738)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 739) .section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 740) .align 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 741) # Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 742) PSHUFFLE_BYTE_FLIP_MASK:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 743) .octa 0x08090a0b0c0d0e0f0001020304050607
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 744) .octa 0x18191a1b1c1d1e1f1011121314151617
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 745)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 746) .section .rodata.cst32.MASK_YMM_LO, "aM", @progbits, 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 747) .align 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 748) MASK_YMM_LO:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 749) .octa 0x00000000000000000000000000000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 750) .octa 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF