/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * ChaCha 256-bit cipher algorithm, x64 AVX2 functions
 *
 * Copyright (C) 2015 Martin Willi
 */

#include <linux/linkage.h>

.section	.rodata.cst32.ROT8, "aM", @progbits, 32
.align 32
ROT8:   .octa 0x0e0d0c0f0a09080b0605040702010003
        .octa 0x0e0d0c0f0a09080b0605040702010003
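# ROT8 is a vpshufb byte-shuffle mask: within each 32-bit lane it moves byte n
# to position (n + 1) % 4, i.e. it rotates every dword left by 8 bits.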

.section	.rodata.cst32.ROT16, "aM", @progbits, 32
.align 32
ROT16:  .octa 0x0d0c0f0e09080b0a0504070601000302
        .octa 0x0d0c0f0e09080b0a0504070601000302
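# ROT16 likewise moves byte n of each dword to position (n + 2) % 4, rotating
# every 32-bit lane left by 16 bits.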

.section	.rodata.cst32.CTRINC, "aM", @progbits, 32
.align 32
CTRINC: .octa 0x00000003000000020000000100000000
        .octa 0x00000007000000060000000500000004
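# CTRINC holds the per-block counter offsets 0..7, one 32-bit lane per block;
# it is added to the broadcast block counter in the eight-block function.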

.section	.rodata.cst32.CTR2BL, "aM", @progbits, 32
.align 32
CTR2BL: .octa 0x00000000000000000000000000000000
        .octa 0x00000000000000000000000000000001
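# CTR2BL adds 0 to the block counter in the low 128-bit lane (block 0) and 1 in
# the high lane (block 1) of a two-block state register.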

.section	.rodata.cst32.CTR4BL, "aM", @progbits, 32
.align 32
CTR4BL: .octa 0x00000000000000000000000000000002
        .octa 0x00000000000000000000000000000003
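# CTR4BL does the same for the second register pair, giving blocks 2 and 3
# counter offsets 2 and 3.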

.text

SYM_FUNC_START(chacha_2block_xor_avx2)
        # %rdi: Input state matrix, s
        # %rsi: up to 2 data blocks output, o
        # %rdx: up to 2 data blocks input, i
        # %rcx: input/output length in bytes
        # %r8d: nrounds

        # This function encrypts two ChaCha blocks by loading the state
        # matrix twice across four AVX registers. It performs matrix operations
        # on four words in each matrix in parallel, but requires shuffling to
        # rearrange the words after each round.
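        #
        # For reference, the scalar ChaCha quarter-round that the vector code
        # below implements (C-style illustration only, not assembled; it
        # matches the per-instruction comments in .Ldoubleround):
        #
        #	a += b; d ^= a; d = rol32(d, 16);
        #	c += d; b ^= c; b = rol32(b, 12);
        #	a += b; d ^= a; d = rol32(d, 8);
        #	c += d; b ^= c; b = rol32(b, 7);
        #
        # A double round applies this to the four columns of the state and
        # then to the four diagonals; the vpshufd shuffles below rotate rows
        # x1..x3 so the diagonal step can reuse the column-step code.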

        vzeroupper

        # x0..3[0-1] = s0..3
        vbroadcasti128 0x00(%rdi),%ymm0
        vbroadcasti128 0x10(%rdi),%ymm1
        vbroadcasti128 0x20(%rdi),%ymm2
        vbroadcasti128 0x30(%rdi),%ymm3

        vpaddd CTR2BL(%rip),%ymm3,%ymm3

        vmovdqa %ymm0,%ymm8
        vmovdqa %ymm1,%ymm9
        vmovdqa %ymm2,%ymm10
        vmovdqa %ymm3,%ymm11

        vmovdqa ROT8(%rip),%ymm4
        vmovdqa ROT16(%rip),%ymm5

        mov %rcx,%rax

.Ldoubleround:

        # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
        vpaddd %ymm1,%ymm0,%ymm0
        vpxor %ymm0,%ymm3,%ymm3
        vpshufb %ymm5,%ymm3,%ymm3

        # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
        vpaddd %ymm3,%ymm2,%ymm2
        vpxor %ymm2,%ymm1,%ymm1
        vmovdqa %ymm1,%ymm6
        vpslld $12,%ymm6,%ymm6
        vpsrld $20,%ymm1,%ymm1
        vpor %ymm6,%ymm1,%ymm1

        # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
        vpaddd %ymm1,%ymm0,%ymm0
        vpxor %ymm0,%ymm3,%ymm3
        vpshufb %ymm4,%ymm3,%ymm3

        # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
        vpaddd %ymm3,%ymm2,%ymm2
        vpxor %ymm2,%ymm1,%ymm1
        vmovdqa %ymm1,%ymm7
        vpslld $7,%ymm7,%ymm7
        vpsrld $25,%ymm1,%ymm1
        vpor %ymm7,%ymm1,%ymm1

        # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
        vpshufd $0x39,%ymm1,%ymm1
        # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
        vpshufd $0x4e,%ymm2,%ymm2
        # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
        vpshufd $0x93,%ymm3,%ymm3

        # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
        vpaddd %ymm1,%ymm0,%ymm0
        vpxor %ymm0,%ymm3,%ymm3
        vpshufb %ymm5,%ymm3,%ymm3

        # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
        vpaddd %ymm3,%ymm2,%ymm2
        vpxor %ymm2,%ymm1,%ymm1
        vmovdqa %ymm1,%ymm6
        vpslld $12,%ymm6,%ymm6
        vpsrld $20,%ymm1,%ymm1
        vpor %ymm6,%ymm1,%ymm1

        # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
        vpaddd %ymm1,%ymm0,%ymm0
        vpxor %ymm0,%ymm3,%ymm3
        vpshufb %ymm4,%ymm3,%ymm3

        # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
        vpaddd %ymm3,%ymm2,%ymm2
        vpxor %ymm2,%ymm1,%ymm1
        vmovdqa %ymm1,%ymm7
        vpslld $7,%ymm7,%ymm7
        vpsrld $25,%ymm1,%ymm1
        vpor %ymm7,%ymm1,%ymm1

        # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
        vpshufd $0x93,%ymm1,%ymm1
        # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
        vpshufd $0x4e,%ymm2,%ymm2
        # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
        vpshufd $0x39,%ymm3,%ymm3

        sub $2,%r8d
        jnz .Ldoubleround

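        # Feed forward and xor: each 16-byte keystream chunk is built in
        # %xmm7 and xored with the input only if the requested length in
        # %rax covers it; otherwise we branch to .Lxorpart2, which handles
        # the trailing partial chunk still sitting in %xmm7.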
        # o0 = i0 ^ (x0 + s0)
        vpaddd %ymm8,%ymm0,%ymm7
        cmp $0x10,%rax
        jl .Lxorpart2
        vpxor 0x00(%rdx),%xmm7,%xmm6
        vmovdqu %xmm6,0x00(%rsi)
        vextracti128 $1,%ymm7,%xmm0
        # o1 = i1 ^ (x1 + s1)
        vpaddd %ymm9,%ymm1,%ymm7
        cmp $0x20,%rax
        jl .Lxorpart2
        vpxor 0x10(%rdx),%xmm7,%xmm6
        vmovdqu %xmm6,0x10(%rsi)
        vextracti128 $1,%ymm7,%xmm1
        # o2 = i2 ^ (x2 + s2)
        vpaddd %ymm10,%ymm2,%ymm7
        cmp $0x30,%rax
        jl .Lxorpart2
        vpxor 0x20(%rdx),%xmm7,%xmm6
        vmovdqu %xmm6,0x20(%rsi)
        vextracti128 $1,%ymm7,%xmm2
        # o3 = i3 ^ (x3 + s3)
        vpaddd %ymm11,%ymm3,%ymm7
        cmp $0x40,%rax
        jl .Lxorpart2
        vpxor 0x30(%rdx),%xmm7,%xmm6
        vmovdqu %xmm6,0x30(%rsi)
        vextracti128 $1,%ymm7,%xmm3

        # xor and write second block
        vmovdqa %xmm0,%xmm7
        cmp $0x50,%rax
        jl .Lxorpart2
        vpxor 0x40(%rdx),%xmm7,%xmm6
        vmovdqu %xmm6,0x40(%rsi)

        vmovdqa %xmm1,%xmm7
        cmp $0x60,%rax
        jl .Lxorpart2
        vpxor 0x50(%rdx),%xmm7,%xmm6
        vmovdqu %xmm6,0x50(%rsi)

        vmovdqa %xmm2,%xmm7
        cmp $0x70,%rax
        jl .Lxorpart2
        vpxor 0x60(%rdx),%xmm7,%xmm6
        vmovdqu %xmm6,0x60(%rsi)

        vmovdqa %xmm3,%xmm7
        cmp $0x80,%rax
        jl .Lxorpart2
        vpxor 0x70(%rdx),%xmm7,%xmm6
        vmovdqu %xmm6,0x70(%rsi)

.Ldone2:
        vzeroupper
        ret

.Lxorpart2:
        # xor remaining bytes from partial register into output
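        # %rax is rounded down to the last full 16-byte boundary; the
        # remaining %r9 (< 16) input bytes are copied to an aligned scratch
        # slot on the stack, xored there with the keystream chunk kept in
        # %xmm7, and the result is copied back to the output buffer.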
        mov %rax,%r9
        and $0x0f,%r9
        jz .Ldone2
        and $~0x0f,%rax

        mov %rsi,%r11

        lea 8(%rsp),%r10
        sub $0x10,%rsp
        and $~31,%rsp

        lea (%rdx,%rax),%rsi
        mov %rsp,%rdi
        mov %r9,%rcx
        rep movsb

        vpxor 0x00(%rsp),%xmm7,%xmm7
        vmovdqa %xmm7,0x00(%rsp)

        mov %rsp,%rsi
        lea (%r11,%rax),%rdi
        mov %r9,%rcx
        rep movsb

        lea -8(%r10),%rsp
        jmp .Ldone2

SYM_FUNC_END(chacha_2block_xor_avx2)

SYM_FUNC_START(chacha_4block_xor_avx2)
        # %rdi: Input state matrix, s
        # %rsi: up to 4 data blocks output, o
        # %rdx: up to 4 data blocks input, i
        # %rcx: input/output length in bytes
        # %r8d: nrounds

        # This function encrypts four ChaCha blocks by loading the state
        # matrix four times across eight AVX registers. It performs matrix
        # operations on four words in two matrices in parallel, sequentially
        # to the operations on the four words of the other two matrices. The
        # required word shuffling has a rather high latency, so we can do the
        # arithmetic on two matrix-pairs without much slowdown.
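        #
        # Register use (derived from the code below): ymm0..3 hold the state
        # rows for blocks 0/1 (low/high 128-bit lane), ymm4..7 the rows for
        # blocks 2/3, ymm11..15 keep the corresponding initial values for the
        # final feed-forward, and ymm8/ymm9 hold the ROT8/ROT16 masks.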

        vzeroupper

        # x0..3[0-3] = s0..3
        vbroadcasti128 0x00(%rdi),%ymm0
        vbroadcasti128 0x10(%rdi),%ymm1
        vbroadcasti128 0x20(%rdi),%ymm2
        vbroadcasti128 0x30(%rdi),%ymm3

        vmovdqa %ymm0,%ymm4
        vmovdqa %ymm1,%ymm5
        vmovdqa %ymm2,%ymm6
        vmovdqa %ymm3,%ymm7

        vpaddd CTR2BL(%rip),%ymm3,%ymm3
        vpaddd CTR4BL(%rip),%ymm7,%ymm7

        vmovdqa %ymm0,%ymm11
        vmovdqa %ymm1,%ymm12
        vmovdqa %ymm2,%ymm13
        vmovdqa %ymm3,%ymm14
        vmovdqa %ymm7,%ymm15

        vmovdqa ROT8(%rip),%ymm8
        vmovdqa ROT16(%rip),%ymm9

        mov %rcx,%rax

.Ldoubleround4:

        # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
        vpaddd %ymm1,%ymm0,%ymm0
        vpxor %ymm0,%ymm3,%ymm3
        vpshufb %ymm9,%ymm3,%ymm3

        vpaddd %ymm5,%ymm4,%ymm4
        vpxor %ymm4,%ymm7,%ymm7
        vpshufb %ymm9,%ymm7,%ymm7

        # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
        vpaddd %ymm3,%ymm2,%ymm2
        vpxor %ymm2,%ymm1,%ymm1
        vmovdqa %ymm1,%ymm10
        vpslld $12,%ymm10,%ymm10
        vpsrld $20,%ymm1,%ymm1
        vpor %ymm10,%ymm1,%ymm1

        vpaddd %ymm7,%ymm6,%ymm6
        vpxor %ymm6,%ymm5,%ymm5
        vmovdqa %ymm5,%ymm10
        vpslld $12,%ymm10,%ymm10
        vpsrld $20,%ymm5,%ymm5
        vpor %ymm10,%ymm5,%ymm5

        # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
        vpaddd %ymm1,%ymm0,%ymm0
        vpxor %ymm0,%ymm3,%ymm3
        vpshufb %ymm8,%ymm3,%ymm3

        vpaddd %ymm5,%ymm4,%ymm4
        vpxor %ymm4,%ymm7,%ymm7
        vpshufb %ymm8,%ymm7,%ymm7

        # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
        vpaddd %ymm3,%ymm2,%ymm2
        vpxor %ymm2,%ymm1,%ymm1
        vmovdqa %ymm1,%ymm10
        vpslld $7,%ymm10,%ymm10
        vpsrld $25,%ymm1,%ymm1
        vpor %ymm10,%ymm1,%ymm1

        vpaddd %ymm7,%ymm6,%ymm6
        vpxor %ymm6,%ymm5,%ymm5
        vmovdqa %ymm5,%ymm10
        vpslld $7,%ymm10,%ymm10
        vpsrld $25,%ymm5,%ymm5
        vpor %ymm10,%ymm5,%ymm5

        # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
        vpshufd $0x39,%ymm1,%ymm1
        vpshufd $0x39,%ymm5,%ymm5
        # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
        vpshufd $0x4e,%ymm2,%ymm2
        vpshufd $0x4e,%ymm6,%ymm6
        # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
        vpshufd $0x93,%ymm3,%ymm3
        vpshufd $0x93,%ymm7,%ymm7

        # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
        vpaddd %ymm1,%ymm0,%ymm0
        vpxor %ymm0,%ymm3,%ymm3
        vpshufb %ymm9,%ymm3,%ymm3

        vpaddd %ymm5,%ymm4,%ymm4
        vpxor %ymm4,%ymm7,%ymm7
        vpshufb %ymm9,%ymm7,%ymm7

        # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
        vpaddd %ymm3,%ymm2,%ymm2
        vpxor %ymm2,%ymm1,%ymm1
        vmovdqa %ymm1,%ymm10
        vpslld $12,%ymm10,%ymm10
        vpsrld $20,%ymm1,%ymm1
        vpor %ymm10,%ymm1,%ymm1

        vpaddd %ymm7,%ymm6,%ymm6
        vpxor %ymm6,%ymm5,%ymm5
        vmovdqa %ymm5,%ymm10
        vpslld $12,%ymm10,%ymm10
        vpsrld $20,%ymm5,%ymm5
        vpor %ymm10,%ymm5,%ymm5

        # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
        vpaddd %ymm1,%ymm0,%ymm0
        vpxor %ymm0,%ymm3,%ymm3
        vpshufb %ymm8,%ymm3,%ymm3

        vpaddd %ymm5,%ymm4,%ymm4
        vpxor %ymm4,%ymm7,%ymm7
        vpshufb %ymm8,%ymm7,%ymm7

        # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
        vpaddd %ymm3,%ymm2,%ymm2
        vpxor %ymm2,%ymm1,%ymm1
        vmovdqa %ymm1,%ymm10
        vpslld $7,%ymm10,%ymm10
        vpsrld $25,%ymm1,%ymm1
        vpor %ymm10,%ymm1,%ymm1

        vpaddd %ymm7,%ymm6,%ymm6
        vpxor %ymm6,%ymm5,%ymm5
        vmovdqa %ymm5,%ymm10
        vpslld $7,%ymm10,%ymm10
        vpsrld $25,%ymm5,%ymm5
        vpor %ymm10,%ymm5,%ymm5

        # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
        vpshufd $0x93,%ymm1,%ymm1
        vpshufd $0x93,%ymm5,%ymm5
        # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
        vpshufd $0x4e,%ymm2,%ymm2
        vpshufd $0x4e,%ymm6,%ymm6
        # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
        vpshufd $0x39,%ymm3,%ymm3
        vpshufd $0x39,%ymm7,%ymm7

        sub $2,%r8d
        jnz .Ldoubleround4

        # o0 = i0 ^ (x0 + s0), first block
        vpaddd %ymm11,%ymm0,%ymm10
        cmp $0x10,%rax
        jl .Lxorpart4
        vpxor 0x00(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0x00(%rsi)
        vextracti128 $1,%ymm10,%xmm0
        # o1 = i1 ^ (x1 + s1), first block
        vpaddd %ymm12,%ymm1,%ymm10
        cmp $0x20,%rax
        jl .Lxorpart4
        vpxor 0x10(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0x10(%rsi)
        vextracti128 $1,%ymm10,%xmm1
        # o2 = i2 ^ (x2 + s2), first block
        vpaddd %ymm13,%ymm2,%ymm10
        cmp $0x30,%rax
        jl .Lxorpart4
        vpxor 0x20(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0x20(%rsi)
        vextracti128 $1,%ymm10,%xmm2
        # o3 = i3 ^ (x3 + s3), first block
        vpaddd %ymm14,%ymm3,%ymm10
        cmp $0x40,%rax
        jl .Lxorpart4
        vpxor 0x30(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0x30(%rsi)
        vextracti128 $1,%ymm10,%xmm3

        # xor and write second block
        vmovdqa %xmm0,%xmm10
        cmp $0x50,%rax
        jl .Lxorpart4
        vpxor 0x40(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0x40(%rsi)

        vmovdqa %xmm1,%xmm10
        cmp $0x60,%rax
        jl .Lxorpart4
        vpxor 0x50(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0x50(%rsi)

        vmovdqa %xmm2,%xmm10
        cmp $0x70,%rax
        jl .Lxorpart4
        vpxor 0x60(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0x60(%rsi)

        vmovdqa %xmm3,%xmm10
        cmp $0x80,%rax
        jl .Lxorpart4
        vpxor 0x70(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0x70(%rsi)

        # o0 = i0 ^ (x0 + s0), third block
        vpaddd %ymm11,%ymm4,%ymm10
        cmp $0x90,%rax
        jl .Lxorpart4
        vpxor 0x80(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0x80(%rsi)
        vextracti128 $1,%ymm10,%xmm4
        # o1 = i1 ^ (x1 + s1), third block
        vpaddd %ymm12,%ymm5,%ymm10
        cmp $0xa0,%rax
        jl .Lxorpart4
        vpxor 0x90(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0x90(%rsi)
        vextracti128 $1,%ymm10,%xmm5
        # o2 = i2 ^ (x2 + s2), third block
        vpaddd %ymm13,%ymm6,%ymm10
        cmp $0xb0,%rax
        jl .Lxorpart4
        vpxor 0xa0(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0xa0(%rsi)
        vextracti128 $1,%ymm10,%xmm6
        # o3 = i3 ^ (x3 + s3), third block
        vpaddd %ymm15,%ymm7,%ymm10
        cmp $0xc0,%rax
        jl .Lxorpart4
        vpxor 0xb0(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0xb0(%rsi)
        vextracti128 $1,%ymm10,%xmm7

        # xor and write fourth block
        vmovdqa %xmm4,%xmm10
        cmp $0xd0,%rax
        jl .Lxorpart4
        vpxor 0xc0(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0xc0(%rsi)

        vmovdqa %xmm5,%xmm10
        cmp $0xe0,%rax
        jl .Lxorpart4
        vpxor 0xd0(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0xd0(%rsi)

        vmovdqa %xmm6,%xmm10
        cmp $0xf0,%rax
        jl .Lxorpart4
        vpxor 0xe0(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0xe0(%rsi)

        vmovdqa %xmm7,%xmm10
        cmp $0x100,%rax
        jl .Lxorpart4
        vpxor 0xf0(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0xf0(%rsi)

.Ldone4:
        vzeroupper
        ret

.Lxorpart4:
        # xor remaining bytes from partial register into output
        mov %rax,%r9
        and $0x0f,%r9
        jz .Ldone4
        and $~0x0f,%rax

        mov %rsi,%r11

        lea 8(%rsp),%r10
        sub $0x10,%rsp
        and $~31,%rsp

        lea (%rdx,%rax),%rsi
        mov %rsp,%rdi
        mov %r9,%rcx
        rep movsb

        vpxor 0x00(%rsp),%xmm10,%xmm10
        vmovdqa %xmm10,0x00(%rsp)

        mov %rsp,%rsi
        lea (%r11,%rax),%rdi
        mov %r9,%rcx
        rep movsb

        lea -8(%r10),%rsp
        jmp .Ldone4

SYM_FUNC_END(chacha_4block_xor_avx2)

SYM_FUNC_START(chacha_8block_xor_avx2)
        # %rdi: Input state matrix, s
        # %rsi: up to 8 data blocks output, o
        # %rdx: up to 8 data blocks input, i
        # %rcx: input/output length in bytes
        # %r8d: nrounds
        # This function encrypts eight consecutive ChaCha blocks by loading
        # the state matrix in AVX registers eight times. As we need some
        # scratch registers, we keep the first four state rows (x0..x3) on
        # the stack. The algorithm performs each operation on the
        # corresponding word of each state matrix, hence requires no word
        # shuffling. For the final XORing step we transpose the matrix by
        # interleaving 32-, 64- and then 128-bit words, which allows us to do
        # XOR in AVX registers. 8/16-bit word rotation is done with the
        # slightly better performing byte shuffling, while 7/12-bit word
        # rotation uses traditional shift+OR.
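        #
        # Here the state is held "vertically": ymm register n (or the nth
        # 32-byte stack slot for n = 0..3) contains state word n for all
        # eight blocks, one block per 32-bit lane, so a single AVX2
        # instruction advances the same word of all eight blocks at once.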

        vzeroupper
        # 4 * 32 byte stack, 32-byte aligned
        lea 8(%rsp),%r10
        and $~31, %rsp
        sub $0x80, %rsp
        mov %rcx,%rax

        # x0..15[0-7] = s[0..15]
        vpbroadcastd 0x00(%rdi),%ymm0
        vpbroadcastd 0x04(%rdi),%ymm1
        vpbroadcastd 0x08(%rdi),%ymm2
        vpbroadcastd 0x0c(%rdi),%ymm3
        vpbroadcastd 0x10(%rdi),%ymm4
        vpbroadcastd 0x14(%rdi),%ymm5
        vpbroadcastd 0x18(%rdi),%ymm6
        vpbroadcastd 0x1c(%rdi),%ymm7
        vpbroadcastd 0x20(%rdi),%ymm8
        vpbroadcastd 0x24(%rdi),%ymm9
        vpbroadcastd 0x28(%rdi),%ymm10
        vpbroadcastd 0x2c(%rdi),%ymm11
        vpbroadcastd 0x30(%rdi),%ymm12
        vpbroadcastd 0x34(%rdi),%ymm13
        vpbroadcastd 0x38(%rdi),%ymm14
        vpbroadcastd 0x3c(%rdi),%ymm15
        # x0..3 on stack
        vmovdqa %ymm0,0x00(%rsp)
        vmovdqa %ymm1,0x20(%rsp)
        vmovdqa %ymm2,0x40(%rsp)
        vmovdqa %ymm3,0x60(%rsp)

        vmovdqa CTRINC(%rip),%ymm1
        vmovdqa ROT8(%rip),%ymm2
        vmovdqa ROT16(%rip),%ymm3

        # x12 += counter values 0-7
        vpaddd %ymm1,%ymm12,%ymm12

.Ldoubleround8:
        # x0 += x4, x12 = rotl32(x12 ^ x0, 16)
        vpaddd 0x00(%rsp),%ymm4,%ymm0
        vmovdqa %ymm0,0x00(%rsp)
        vpxor %ymm0,%ymm12,%ymm12
        vpshufb %ymm3,%ymm12,%ymm12
        # x1 += x5, x13 = rotl32(x13 ^ x1, 16)
        vpaddd 0x20(%rsp),%ymm5,%ymm0
        vmovdqa %ymm0,0x20(%rsp)
        vpxor %ymm0,%ymm13,%ymm13
        vpshufb %ymm3,%ymm13,%ymm13
        # x2 += x6, x14 = rotl32(x14 ^ x2, 16)
        vpaddd 0x40(%rsp),%ymm6,%ymm0
        vmovdqa %ymm0,0x40(%rsp)
        vpxor %ymm0,%ymm14,%ymm14
        vpshufb %ymm3,%ymm14,%ymm14
        # x3 += x7, x15 = rotl32(x15 ^ x3, 16)
        vpaddd 0x60(%rsp),%ymm7,%ymm0
        vmovdqa %ymm0,0x60(%rsp)
        vpxor %ymm0,%ymm15,%ymm15
        vpshufb %ymm3,%ymm15,%ymm15

        # x8 += x12, x4 = rotl32(x4 ^ x8, 12)
        vpaddd %ymm12,%ymm8,%ymm8
        vpxor %ymm8,%ymm4,%ymm4
        vpslld $12,%ymm4,%ymm0
        vpsrld $20,%ymm4,%ymm4
        vpor %ymm0,%ymm4,%ymm4
        # x9 += x13, x5 = rotl32(x5 ^ x9, 12)
        vpaddd %ymm13,%ymm9,%ymm9
        vpxor %ymm9,%ymm5,%ymm5
        vpslld $12,%ymm5,%ymm0
        vpsrld $20,%ymm5,%ymm5
        vpor %ymm0,%ymm5,%ymm5
        # x10 += x14, x6 = rotl32(x6 ^ x10, 12)
        vpaddd %ymm14,%ymm10,%ymm10
        vpxor %ymm10,%ymm6,%ymm6
        vpslld $12,%ymm6,%ymm0
        vpsrld $20,%ymm6,%ymm6
        vpor %ymm0,%ymm6,%ymm6
        # x11 += x15, x7 = rotl32(x7 ^ x11, 12)
        vpaddd %ymm15,%ymm11,%ymm11
        vpxor %ymm11,%ymm7,%ymm7
        vpslld $12,%ymm7,%ymm0
        vpsrld $20,%ymm7,%ymm7
        vpor %ymm0,%ymm7,%ymm7

        # x0 += x4, x12 = rotl32(x12 ^ x0, 8)
        vpaddd 0x00(%rsp),%ymm4,%ymm0
        vmovdqa %ymm0,0x00(%rsp)
        vpxor %ymm0,%ymm12,%ymm12
        vpshufb %ymm2,%ymm12,%ymm12
        # x1 += x5, x13 = rotl32(x13 ^ x1, 8)
        vpaddd 0x20(%rsp),%ymm5,%ymm0
        vmovdqa %ymm0,0x20(%rsp)
        vpxor %ymm0,%ymm13,%ymm13
        vpshufb %ymm2,%ymm13,%ymm13
        # x2 += x6, x14 = rotl32(x14 ^ x2, 8)
        vpaddd 0x40(%rsp),%ymm6,%ymm0
        vmovdqa %ymm0,0x40(%rsp)
        vpxor %ymm0,%ymm14,%ymm14
        vpshufb %ymm2,%ymm14,%ymm14
        # x3 += x7, x15 = rotl32(x15 ^ x3, 8)
        vpaddd 0x60(%rsp),%ymm7,%ymm0
        vmovdqa %ymm0,0x60(%rsp)
        vpxor %ymm0,%ymm15,%ymm15
        vpshufb %ymm2,%ymm15,%ymm15

        # x8 += x12, x4 = rotl32(x4 ^ x8, 7)
        vpaddd %ymm12,%ymm8,%ymm8
        vpxor %ymm8,%ymm4,%ymm4
        vpslld $7,%ymm4,%ymm0
        vpsrld $25,%ymm4,%ymm4
        vpor %ymm0,%ymm4,%ymm4
        # x9 += x13, x5 = rotl32(x5 ^ x9, 7)
        vpaddd %ymm13,%ymm9,%ymm9
        vpxor %ymm9,%ymm5,%ymm5
        vpslld $7,%ymm5,%ymm0
        vpsrld $25,%ymm5,%ymm5
        vpor %ymm0,%ymm5,%ymm5
        # x10 += x14, x6 = rotl32(x6 ^ x10, 7)
        vpaddd %ymm14,%ymm10,%ymm10
        vpxor %ymm10,%ymm6,%ymm6
        vpslld $7,%ymm6,%ymm0
        vpsrld $25,%ymm6,%ymm6
        vpor %ymm0,%ymm6,%ymm6
        # x11 += x15, x7 = rotl32(x7 ^ x11, 7)
        vpaddd %ymm15,%ymm11,%ymm11
        vpxor %ymm11,%ymm7,%ymm7
        vpslld $7,%ymm7,%ymm0
        vpsrld $25,%ymm7,%ymm7
        vpor %ymm0,%ymm7,%ymm7

        # x0 += x5, x15 = rotl32(x15 ^ x0, 16)
        vpaddd 0x00(%rsp),%ymm5,%ymm0
        vmovdqa %ymm0,0x00(%rsp)
        vpxor %ymm0,%ymm15,%ymm15
        vpshufb %ymm3,%ymm15,%ymm15
        # x1 += x6, x12 = rotl32(x12 ^ x1, 16)
        vpaddd 0x20(%rsp),%ymm6,%ymm0
        vmovdqa %ymm0,0x20(%rsp)
        vpxor %ymm0,%ymm12,%ymm12
        vpshufb %ymm3,%ymm12,%ymm12
        # x2 += x7, x13 = rotl32(x13 ^ x2, 16)
        vpaddd 0x40(%rsp),%ymm7,%ymm0
        vmovdqa %ymm0,0x40(%rsp)
        vpxor %ymm0,%ymm13,%ymm13
        vpshufb %ymm3,%ymm13,%ymm13
        # x3 += x4, x14 = rotl32(x14 ^ x3, 16)
        vpaddd 0x60(%rsp),%ymm4,%ymm0
        vmovdqa %ymm0,0x60(%rsp)
        vpxor %ymm0,%ymm14,%ymm14
        vpshufb %ymm3,%ymm14,%ymm14

        # x10 += x15, x5 = rotl32(x5 ^ x10, 12)
        vpaddd %ymm15,%ymm10,%ymm10
        vpxor %ymm10,%ymm5,%ymm5
        vpslld $12,%ymm5,%ymm0
        vpsrld $20,%ymm5,%ymm5
        vpor %ymm0,%ymm5,%ymm5
        # x11 += x12, x6 = rotl32(x6 ^ x11, 12)
        vpaddd %ymm12,%ymm11,%ymm11
        vpxor %ymm11,%ymm6,%ymm6
        vpslld $12,%ymm6,%ymm0
        vpsrld $20,%ymm6,%ymm6
        vpor %ymm0,%ymm6,%ymm6
        # x8 += x13, x7 = rotl32(x7 ^ x8, 12)
        vpaddd %ymm13,%ymm8,%ymm8
        vpxor %ymm8,%ymm7,%ymm7
        vpslld $12,%ymm7,%ymm0
        vpsrld $20,%ymm7,%ymm7
        vpor %ymm0,%ymm7,%ymm7
        # x9 += x14, x4 = rotl32(x4 ^ x9, 12)
        vpaddd %ymm14,%ymm9,%ymm9
        vpxor %ymm9,%ymm4,%ymm4
        vpslld $12,%ymm4,%ymm0
        vpsrld $20,%ymm4,%ymm4
        vpor %ymm0,%ymm4,%ymm4

        # x0 += x5, x15 = rotl32(x15 ^ x0, 8)
        vpaddd 0x00(%rsp),%ymm5,%ymm0
        vmovdqa %ymm0,0x00(%rsp)
        vpxor %ymm0,%ymm15,%ymm15
        vpshufb %ymm2,%ymm15,%ymm15
        # x1 += x6, x12 = rotl32(x12 ^ x1, 8)
        vpaddd 0x20(%rsp),%ymm6,%ymm0
        vmovdqa %ymm0,0x20(%rsp)
        vpxor %ymm0,%ymm12,%ymm12
        vpshufb %ymm2,%ymm12,%ymm12
        # x2 += x7, x13 = rotl32(x13 ^ x2, 8)
        vpaddd 0x40(%rsp),%ymm7,%ymm0
        vmovdqa %ymm0,0x40(%rsp)
        vpxor %ymm0,%ymm13,%ymm13
        vpshufb %ymm2,%ymm13,%ymm13
        # x3 += x4, x14 = rotl32(x14 ^ x3, 8)
        vpaddd 0x60(%rsp),%ymm4,%ymm0
        vmovdqa %ymm0,0x60(%rsp)
        vpxor %ymm0,%ymm14,%ymm14
        vpshufb %ymm2,%ymm14,%ymm14

        # x10 += x15, x5 = rotl32(x5 ^ x10, 7)
        vpaddd %ymm15,%ymm10,%ymm10
        vpxor %ymm10,%ymm5,%ymm5
        vpslld $7,%ymm5,%ymm0
        vpsrld $25,%ymm5,%ymm5
        vpor %ymm0,%ymm5,%ymm5
        # x11 += x12, x6 = rotl32(x6 ^ x11, 7)
        vpaddd %ymm12,%ymm11,%ymm11
        vpxor %ymm11,%ymm6,%ymm6
        vpslld $7,%ymm6,%ymm0
        vpsrld $25,%ymm6,%ymm6
        vpor %ymm0,%ymm6,%ymm6
        # x8 += x13, x7 = rotl32(x7 ^ x8, 7)
        vpaddd %ymm13,%ymm8,%ymm8
        vpxor %ymm8,%ymm7,%ymm7
        vpslld $7,%ymm7,%ymm0
        vpsrld $25,%ymm7,%ymm7
        vpor %ymm0,%ymm7,%ymm7
        # x9 += x14, x4 = rotl32(x4 ^ x9, 7)
        vpaddd %ymm14,%ymm9,%ymm9
        vpxor %ymm9,%ymm4,%ymm4
        vpslld $7,%ymm4,%ymm0
        vpsrld $25,%ymm4,%ymm4
        vpor %ymm0,%ymm4,%ymm4

        sub $2,%r8d
        jnz .Ldoubleround8

        # x0..15[0-7] += s[0..15]
        vpbroadcastd 0x00(%rdi),%ymm0
        vpaddd 0x00(%rsp),%ymm0,%ymm0
        vmovdqa %ymm0,0x00(%rsp)
        vpbroadcastd 0x04(%rdi),%ymm0
        vpaddd 0x20(%rsp),%ymm0,%ymm0
        vmovdqa %ymm0,0x20(%rsp)
        vpbroadcastd 0x08(%rdi),%ymm0
        vpaddd 0x40(%rsp),%ymm0,%ymm0
        vmovdqa %ymm0,0x40(%rsp)
        vpbroadcastd 0x0c(%rdi),%ymm0
        vpaddd 0x60(%rsp),%ymm0,%ymm0
        vmovdqa %ymm0,0x60(%rsp)
        vpbroadcastd 0x10(%rdi),%ymm0
        vpaddd %ymm0,%ymm4,%ymm4
        vpbroadcastd 0x14(%rdi),%ymm0
        vpaddd %ymm0,%ymm5,%ymm5
        vpbroadcastd 0x18(%rdi),%ymm0
        vpaddd %ymm0,%ymm6,%ymm6
        vpbroadcastd 0x1c(%rdi),%ymm0
        vpaddd %ymm0,%ymm7,%ymm7
        vpbroadcastd 0x20(%rdi),%ymm0
        vpaddd %ymm0,%ymm8,%ymm8
        vpbroadcastd 0x24(%rdi),%ymm0
        vpaddd %ymm0,%ymm9,%ymm9
        vpbroadcastd 0x28(%rdi),%ymm0
        vpaddd %ymm0,%ymm10,%ymm10
        vpbroadcastd 0x2c(%rdi),%ymm0
        vpaddd %ymm0,%ymm11,%ymm11
        vpbroadcastd 0x30(%rdi),%ymm0
        vpaddd %ymm0,%ymm12,%ymm12
        vpbroadcastd 0x34(%rdi),%ymm0
        vpaddd %ymm0,%ymm13,%ymm13
        vpbroadcastd 0x38(%rdi),%ymm0
        vpaddd %ymm0,%ymm14,%ymm14
        vpbroadcastd 0x3c(%rdi),%ymm0
        vpaddd %ymm0,%ymm15,%ymm15

        # x12 += counter values 0-7
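        # (CTRINC is still in %ymm1; adding it again on top of the broadcast
        # s12 reproduces each block's initial counter word, so the feed-
        # forward matches the per-block initial state.)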
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 815) vpaddd %ymm1,%ymm12,%ymm12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 816)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 817) # interleave 32-bit words in state n, n+1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 818) vmovdqa 0x00(%rsp),%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 819) vmovdqa 0x20(%rsp),%ymm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 820) vpunpckldq %ymm1,%ymm0,%ymm2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 821) vpunpckhdq %ymm1,%ymm0,%ymm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 822) vmovdqa %ymm2,0x00(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 823) vmovdqa %ymm1,0x20(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 824) vmovdqa 0x40(%rsp),%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 825) vmovdqa 0x60(%rsp),%ymm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 826) vpunpckldq %ymm1,%ymm0,%ymm2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 827) vpunpckhdq %ymm1,%ymm0,%ymm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 828) vmovdqa %ymm2,0x40(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 829) vmovdqa %ymm1,0x60(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 830) vmovdqa %ymm4,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 831) vpunpckldq %ymm5,%ymm0,%ymm4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 832) vpunpckhdq %ymm5,%ymm0,%ymm5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 833) vmovdqa %ymm6,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 834) vpunpckldq %ymm7,%ymm0,%ymm6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 835) vpunpckhdq %ymm7,%ymm0,%ymm7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 836) vmovdqa %ymm8,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 837) vpunpckldq %ymm9,%ymm0,%ymm8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 838) vpunpckhdq %ymm9,%ymm0,%ymm9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 839) vmovdqa %ymm10,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 840) vpunpckldq %ymm11,%ymm0,%ymm10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 841) vpunpckhdq %ymm11,%ymm0,%ymm11
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 842) vmovdqa %ymm12,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 843) vpunpckldq %ymm13,%ymm0,%ymm12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 844) vpunpckhdq %ymm13,%ymm0,%ymm13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 845) vmovdqa %ymm14,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 846) vpunpckldq %ymm15,%ymm0,%ymm14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 847) vpunpckhdq %ymm15,%ymm0,%ymm15
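# First stage of transposing the column-wise state into consecutive
# output blocks: vpunpckldq/vpunpckhdq interleave the dwords of state
# words n and n+1 within each 128-bit lane, e.g. (AT&T operand order):
#
#	a = {a0 a1 a2 a3 | a4 a5 a6 a7},  b = {b0 b1 b2 b3 | b4 b5 b6 b7}
#	vpunpckldq b,a -> {a0 b0 a1 b1 | a4 b4 a5 b5}
#	vpunpckhdq b,a -> {a2 b2 a3 b3 | a6 b6 a7 b7}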
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 848)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 849) # interleave 64-bit words in state n, n+2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 850) vmovdqa 0x00(%rsp),%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 851) vmovdqa 0x40(%rsp),%ymm2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 852) vpunpcklqdq %ymm2,%ymm0,%ymm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 853) vpunpckhqdq %ymm2,%ymm0,%ymm2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 854) vmovdqa %ymm1,0x00(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 855) vmovdqa %ymm2,0x40(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 856) vmovdqa 0x20(%rsp),%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 857) vmovdqa 0x60(%rsp),%ymm2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 858) vpunpcklqdq %ymm2,%ymm0,%ymm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 859) vpunpckhqdq %ymm2,%ymm0,%ymm2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 860) vmovdqa %ymm1,0x20(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 861) vmovdqa %ymm2,0x60(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 862) vmovdqa %ymm4,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 863) vpunpcklqdq %ymm6,%ymm0,%ymm4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 864) vpunpckhqdq %ymm6,%ymm0,%ymm6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 865) vmovdqa %ymm5,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 866) vpunpcklqdq %ymm7,%ymm0,%ymm5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 867) vpunpckhqdq %ymm7,%ymm0,%ymm7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 868) vmovdqa %ymm8,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 869) vpunpcklqdq %ymm10,%ymm0,%ymm8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 870) vpunpckhqdq %ymm10,%ymm0,%ymm10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 871) vmovdqa %ymm9,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 872) vpunpcklqdq %ymm11,%ymm0,%ymm9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 873) vpunpckhqdq %ymm11,%ymm0,%ymm11
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 874) vmovdqa %ymm12,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 875) vpunpcklqdq %ymm14,%ymm0,%ymm12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 876) vpunpckhqdq %ymm14,%ymm0,%ymm14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 877) vmovdqa %ymm13,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 878) vpunpcklqdq %ymm15,%ymm0,%ymm13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 879) vpunpckhqdq %ymm15,%ymm0,%ymm15
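# After this 64-bit stage every 128-bit lane holds four consecutive state
# words (x[4k]..x[4k+3]) of a single block; the 128-bit stage below then
# gathers the matching lanes from the n and n+4 groups so that each ymm
# register becomes 32 contiguous bytes of one output block.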
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 880)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 881) # interleave 128-bit words in state n, n+4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 882) # xor/write first four blocks
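# %rax holds the requested byte count.  Each vperm2i128 assembles one
# 32-byte keystream chunk: $0x20 combines the two low 128-bit lanes,
# giving 32 bytes of one of blocks 0-3, while $0x31 combines the high
# lanes, giving the matching 32 bytes of the block four positions later
# (blocks 4-7), kept in registers for the second pass below.  Before a
# chunk is consumed, the code checks that at least that many bytes were
# requested and otherwise branches to .Lxorpart8 with the pending chunk
# already in %ymm0.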
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 883) vmovdqa 0x00(%rsp),%ymm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 884) vperm2i128 $0x20,%ymm4,%ymm1,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 885) cmp $0x0020,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 886) jl .Lxorpart8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 887) vpxor 0x0000(%rdx),%ymm0,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 888) vmovdqu %ymm0,0x0000(%rsi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 889) vperm2i128 $0x31,%ymm4,%ymm1,%ymm4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 890)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 891) vperm2i128 $0x20,%ymm12,%ymm8,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 892) cmp $0x0040,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 893) jl .Lxorpart8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 894) vpxor 0x0020(%rdx),%ymm0,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 895) vmovdqu %ymm0,0x0020(%rsi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 896) vperm2i128 $0x31,%ymm12,%ymm8,%ymm12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 897)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 898) vmovdqa 0x40(%rsp),%ymm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 899) vperm2i128 $0x20,%ymm6,%ymm1,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 900) cmp $0x0060,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 901) jl .Lxorpart8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 902) vpxor 0x0040(%rdx),%ymm0,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 903) vmovdqu %ymm0,0x0040(%rsi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 904) vperm2i128 $0x31,%ymm6,%ymm1,%ymm6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 905)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 906) vperm2i128 $0x20,%ymm14,%ymm10,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 907) cmp $0x0080,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 908) jl .Lxorpart8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 909) vpxor 0x0060(%rdx),%ymm0,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 910) vmovdqu %ymm0,0x0060(%rsi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 911) vperm2i128 $0x31,%ymm14,%ymm10,%ymm14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 912)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 913) vmovdqa 0x20(%rsp),%ymm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 914) vperm2i128 $0x20,%ymm5,%ymm1,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 915) cmp $0x00a0,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 916) jl .Lxorpart8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 917) vpxor 0x0080(%rdx),%ymm0,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 918) vmovdqu %ymm0,0x0080(%rsi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 919) vperm2i128 $0x31,%ymm5,%ymm1,%ymm5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 920)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 921) vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 922) cmp $0x00c0,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 923) jl .Lxorpart8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 924) vpxor 0x00a0(%rdx),%ymm0,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 925) vmovdqu %ymm0,0x00a0(%rsi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 926) vperm2i128 $0x31,%ymm13,%ymm9,%ymm13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 927)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 928) vmovdqa 0x60(%rsp),%ymm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 929) vperm2i128 $0x20,%ymm7,%ymm1,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 930) cmp $0x00e0,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 931) jl .Lxorpart8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 932) vpxor 0x00c0(%rdx),%ymm0,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 933) vmovdqu %ymm0,0x00c0(%rsi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 934) vperm2i128 $0x31,%ymm7,%ymm1,%ymm7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 935)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 936) vperm2i128 $0x20,%ymm15,%ymm11,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 937) cmp $0x0100,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 938) jl .Lxorpart8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 939) vpxor 0x00e0(%rdx),%ymm0,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 940) vmovdqu %ymm0,0x00e0(%rsi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 941) vperm2i128 $0x31,%ymm15,%ymm11,%ymm15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 942)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 943) # xor remaining blocks, write to output
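# These are blocks 4-7, assembled from the high 128-bit lanes saved by
# the $0x31 permutes above; the same per-chunk length check applies.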
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 944) vmovdqa %ymm4,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 945) cmp $0x0120,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 946) jl .Lxorpart8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 947) vpxor 0x0100(%rdx),%ymm0,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 948) vmovdqu %ymm0,0x0100(%rsi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 949)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 950) vmovdqa %ymm12,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 951) cmp $0x0140,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 952) jl .Lxorpart8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 953) vpxor 0x0120(%rdx),%ymm0,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 954) vmovdqu %ymm0,0x0120(%rsi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 955)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 956) vmovdqa %ymm6,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 957) cmp $0x0160,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 958) jl .Lxorpart8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 959) vpxor 0x0140(%rdx),%ymm0,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 960) vmovdqu %ymm0,0x0140(%rsi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 961)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 962) vmovdqa %ymm14,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 963) cmp $0x0180,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 964) jl .Lxorpart8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 965) vpxor 0x0160(%rdx),%ymm0,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 966) vmovdqu %ymm0,0x0160(%rsi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 967)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 968) vmovdqa %ymm5,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 969) cmp $0x01a0,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 970) jl .Lxorpart8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 971) vpxor 0x0180(%rdx),%ymm0,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 972) vmovdqu %ymm0,0x0180(%rsi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 973)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 974) vmovdqa %ymm13,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 975) cmp $0x01c0,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 976) jl .Lxorpart8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 977) vpxor 0x01a0(%rdx),%ymm0,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 978) vmovdqu %ymm0,0x01a0(%rsi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 979)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 980) vmovdqa %ymm7,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 981) cmp $0x01e0,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 982) jl .Lxorpart8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 983) vpxor 0x01c0(%rdx),%ymm0,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 984) vmovdqu %ymm0,0x01c0(%rsi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 985)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 986) vmovdqa %ymm15,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 987) cmp $0x0200,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 988) jl .Lxorpart8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 989) vpxor 0x01e0(%rdx),%ymm0,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 990) vmovdqu %ymm0,0x01e0(%rsi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 991)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 992) .Ldone8:
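# Tear down the aligned scratch frame: %r10 is assumed to carry the value
# derived from the caller's %rsp at function entry, so this restores the
# original stack pointer; vzeroupper drops the AVX upper state before
# returning.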
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 993) vzeroupper
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 994) lea -8(%r10),%rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 995) ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 996)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 997) .Lxorpart8:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 998) # xor remaining bytes from partial register into output
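# The keystream chunk that could not be consumed whole is still in %ymm0.
# %r9 receives len % 32 (the trailing byte count) and %rax is rounded
# down to that chunk's offset; the tail is then bounced through the
# 32-byte scratch slot at the top of the stack frame.  C-level sketch
# (illustrative only; names are not from this file):
#
#	rem = len & 31;                    /* trailing byte count  */
#	off = len & ~31;                   /* offset of the chunk  */
#	memcpy(scratch, in + off, rem);
#	xor_32bytes(scratch, keystream);   /* the vpxor on %ymm0   */
#	memcpy(out + off, scratch, rem);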
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 999) mov %rax,%r9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1000) and $0x1f,%r9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1001) jz .Ldone8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1002) and $~0x1f,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1003)
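# rep movsb clobbers %rsi/%rdi/%rcx, so stash the output pointer in %r11,
# then copy the trailing input bytes into the scratch slot.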
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1004) mov %rsi,%r11
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1005)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1006) lea (%rdx,%rax),%rsi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1007) mov %rsp,%rdi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1008) mov %r9,%rcx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1009) rep movsb
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1010)
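# XOR the bounced-in bytes with the pending keystream chunk and park the
# result back in the scratch slot.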
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1011) vpxor 0x00(%rsp),%ymm0,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1012) vmovdqa %ymm0,0x00(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1013)
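# Copy only the valid len % 32 bytes from the scratch slot to the output.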
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1014) mov %rsp,%rsi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1015) lea (%r11,%rax),%rdi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1016) mov %r9,%rcx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1017) rep movsb
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1018)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1019) jmp .Ldone8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1020)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1021) SYM_FUNC_END(chacha_8block_xor_avx2)