/* SPDX-License-Identifier: GPL-2.0+ */
/*
 * ChaCha 256-bit cipher algorithm, x64 AVX-512VL functions
 *
 * Copyright (C) 2018 Martin Willi
 */

#include <linux/linkage.h>

.section	.rodata.cst32.CTR2BL, "aM", @progbits, 32
.align 32
CTR2BL:	.octa 0x00000000000000000000000000000000
	.octa 0x00000000000000000000000000000001

.section	.rodata.cst32.CTR4BL, "aM", @progbits, 32
.align 32
CTR4BL:	.octa 0x00000000000000000000000000000002
	.octa 0x00000000000000000000000000000003

.section	.rodata.cst32.CTR8BL, "aM", @progbits, 32
.align 32
CTR8BL:	.octa 0x00000003000000020000000100000000
	.octa 0x00000007000000060000000500000004
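
/*
 * CTR2BL/CTR4BL hold the per-block counter increments that are added to the
 * copies of the state's fourth row (words 12..15): blocks 0/1 for the
 * 2-block code, blocks 2/3 for the second register set of the 4-block code.
 * The .octa values are little endian, so the 1, 2 and 3 land in the low
 * dword of each 128-bit lane, i.e. in word 12, the block counter.  CTR8BL
 * holds the eight dword counters 0..7 that the 8-block code adds to its
 * eight copies of word 12.  A scalar reference would simply do
 * (illustrative sketch only, not part of the build):
 *
 *	for (blk = 0; blk < nblocks; blk++) {
 *		u32 x[16];
 *
 *		memcpy(x, state, sizeof(x));
 *		x[12] += blk;			// what these constants achieve
 *		chacha_permute(x, nrounds);	// see the round loop below
 *		// add the state back in, xor against the input, advance 64 bytes
 *	}
 */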

.text

SYM_FUNC_START(chacha_2block_xor_avx512vl)
	# %rdi: Input state matrix, s
	# %rsi: up to 2 data blocks output, o
	# %rdx: up to 2 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds
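	#
	# Seen from C, the glue code calls this roughly as (illustrative
	# prototype; the authoritative declaration lives in the x86 ChaCha
	# glue code):
	#
	#	asmlinkage void chacha_2block_xor_avx512vl(u32 *state, u8 *dst,
	#						   const u8 *src,
	#						   unsigned int len,
	#						   int nrounds);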
	# This function encrypts two ChaCha blocks by loading the state
	# matrix twice across four AVX registers. It performs matrix operations
	# on four words in each matrix in parallel, but requires shuffling to
	# rearrange the words after each round.
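	#
	# For reference, one ChaCha double round over a single 4x4 matrix of
	# u32 words looks like this in scalar C (illustrative sketch; the
	# vpshufd rotations below realign the rows so that the same
	# column-wise code also performs the diagonal round).  nrounds is
	# assumed to be even (20/12/8), matching the "sub $2" loop control:
	#
	#	static void qr(u32 *a, u32 *b, u32 *c, u32 *d)
	#	{
	#		*a += *b; *d = rol32(*d ^ *a, 16);
	#		*c += *d; *b = rol32(*b ^ *c, 12);
	#		*a += *b; *d = rol32(*d ^ *a,  8);
	#		*c += *d; *b = rol32(*b ^ *c,  7);
	#	}
	#
	#	static void chacha_permute(u32 x[16], int nrounds)
	#	{
	#		for (int i = 0; i < nrounds; i += 2) {
	#			/* column round */
	#			qr(&x[0], &x[4], &x[ 8], &x[12]);
	#			qr(&x[1], &x[5], &x[ 9], &x[13]);
	#			qr(&x[2], &x[6], &x[10], &x[14]);
	#			qr(&x[3], &x[7], &x[11], &x[15]);
	#			/* diagonal round */
	#			qr(&x[0], &x[5], &x[10], &x[15]);
	#			qr(&x[1], &x[6], &x[11], &x[12]);
	#			qr(&x[2], &x[7], &x[ 8], &x[13]);
	#			qr(&x[3], &x[4], &x[ 9], &x[14]);
	#		}
	#	}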
	vzeroupper

	# x0..3[0-1] = s0..3
	vbroadcasti128	0x00(%rdi),%ymm0
	vbroadcasti128	0x10(%rdi),%ymm1
	vbroadcasti128	0x20(%rdi),%ymm2
	vbroadcasti128	0x30(%rdi),%ymm3

	vpaddd	CTR2BL(%rip),%ymm3,%ymm3

	vmovdqa	%ymm0,%ymm8
	vmovdqa	%ymm1,%ymm9
	vmovdqa	%ymm2,%ymm10
	vmovdqa	%ymm3,%ymm11

.Ldoubleround:

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd	%ymm1,%ymm0,%ymm0
	vpxord	%ymm0,%ymm3,%ymm3
	vprold	$16,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd	%ymm3,%ymm2,%ymm2
	vpxord	%ymm2,%ymm1,%ymm1
	vprold	$12,%ymm1,%ymm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd	%ymm1,%ymm0,%ymm0
	vpxord	%ymm0,%ymm3,%ymm3
	vprold	$8,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd	%ymm3,%ymm2,%ymm2
	vpxord	%ymm2,%ymm1,%ymm1
	vprold	$7,%ymm1,%ymm1

	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	vpshufd	$0x39,%ymm1,%ymm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd	$0x4e,%ymm2,%ymm2
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	vpshufd	$0x93,%ymm3,%ymm3

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd	%ymm1,%ymm0,%ymm0
	vpxord	%ymm0,%ymm3,%ymm3
	vprold	$16,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd	%ymm3,%ymm2,%ymm2
	vpxord	%ymm2,%ymm1,%ymm1
	vprold	$12,%ymm1,%ymm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd	%ymm1,%ymm0,%ymm0
	vpxord	%ymm0,%ymm3,%ymm3
	vprold	$8,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd	%ymm3,%ymm2,%ymm2
	vpxord	%ymm2,%ymm1,%ymm1
	vprold	$7,%ymm1,%ymm1

	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	vpshufd	$0x93,%ymm1,%ymm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd	$0x4e,%ymm2,%ymm2
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	vpshufd	$0x39,%ymm3,%ymm3

	sub	$2,%r8d
	jnz	.Ldoubleround

	# o0 = i0 ^ (x0 + s0)
	vpaddd	%ymm8,%ymm0,%ymm7
	cmp	$0x10,%rcx
	jl	.Lxorpart2
	vpxord	0x00(%rdx),%xmm7,%xmm6
	vmovdqu	%xmm6,0x00(%rsi)
	vextracti128	$1,%ymm7,%xmm0
	# o1 = i1 ^ (x1 + s1)
	vpaddd	%ymm9,%ymm1,%ymm7
	cmp	$0x20,%rcx
	jl	.Lxorpart2
	vpxord	0x10(%rdx),%xmm7,%xmm6
	vmovdqu	%xmm6,0x10(%rsi)
	vextracti128	$1,%ymm7,%xmm1
	# o2 = i2 ^ (x2 + s2)
	vpaddd	%ymm10,%ymm2,%ymm7
	cmp	$0x30,%rcx
	jl	.Lxorpart2
	vpxord	0x20(%rdx),%xmm7,%xmm6
	vmovdqu	%xmm6,0x20(%rsi)
	vextracti128	$1,%ymm7,%xmm2
	# o3 = i3 ^ (x3 + s3)
	vpaddd	%ymm11,%ymm3,%ymm7
	cmp	$0x40,%rcx
	jl	.Lxorpart2
	vpxord	0x30(%rdx),%xmm7,%xmm6
	vmovdqu	%xmm6,0x30(%rsi)
	vextracti128	$1,%ymm7,%xmm3

	# xor and write second block
	vmovdqa	%xmm0,%xmm7
	cmp	$0x50,%rcx
	jl	.Lxorpart2
	vpxord	0x40(%rdx),%xmm7,%xmm6
	vmovdqu	%xmm6,0x40(%rsi)

	vmovdqa	%xmm1,%xmm7
	cmp	$0x60,%rcx
	jl	.Lxorpart2
	vpxord	0x50(%rdx),%xmm7,%xmm6
	vmovdqu	%xmm6,0x50(%rsi)

	vmovdqa	%xmm2,%xmm7
	cmp	$0x70,%rcx
	jl	.Lxorpart2
	vpxord	0x60(%rdx),%xmm7,%xmm6
	vmovdqu	%xmm6,0x60(%rsi)

	vmovdqa	%xmm3,%xmm7
	cmp	$0x80,%rcx
	jl	.Lxorpart2
	vpxord	0x70(%rdx),%xmm7,%xmm6
	vmovdqu	%xmm6,0x70(%rsi)

.Ldone2:
	vzeroupper
	ret

.Lxorpart2:
	# xor remaining bytes from partial register into output
	mov	%rcx,%rax
	and	$0xf,%rcx
	jz	.Ldone2
	mov	%rax,%r9
	and	$~0xf,%r9

	mov	$1,%rax
	shld	%cl,%rax,%rax
	sub	$1,%rax
	kmovq	%rax,%k1

	vmovdqu8	(%rdx,%r9),%xmm1{%k1}{z}
	vpxord	%xmm7,%xmm1,%xmm1
	vmovdqu8	%xmm1,(%rsi,%r9){%k1}

	jmp	.Ldone2
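	#
	# The tail above handles a trailing partial 16-byte chunk with an
	# AVX-512 byte mask: %r9 is the length rounded down to a 16-byte
	# boundary (the offset of the partial chunk), and %k1 receives the
	# low (len & 15) bits via the (1 << n) - 1 trick, so the masked
	# load, xor and store only touch the bytes that are really there.
	# Roughly, in C (illustrative sketch; "stream" stands for the
	# keystream bytes sitting in %xmm7):
	#
	#	unsigned int rem = len & 15;
	#	unsigned long off = len & ~15UL;
	#	u64 mask = (1ULL << rem) - 1;		/* -> %k1 */
	#
	#	for (unsigned int i = 0; i < rem; i++)
	#		dst[off + i] = src[off + i] ^ stream[i];
	#
	# .Lxorpart4 below is identical; .Lxorpart8 does the same thing at
	# 32-byte granularity with a full ymm register.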
SYM_FUNC_END(chacha_2block_xor_avx512vl)

SYM_FUNC_START(chacha_4block_xor_avx512vl)
	# %rdi: Input state matrix, s
	# %rsi: up to 4 data blocks output, o
	# %rdx: up to 4 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds

	# This function encrypts four ChaCha blocks by loading the state
	# matrix four times across eight AVX registers. It performs matrix
	# operations on four words in two matrices in parallel, interleaved
	# with the same operations on the four words of the other two
	# matrices. Since the required word shuffling has rather high
	# latency, doing the arithmetic on two matrix pairs in parallel adds
	# little extra cost.
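	#
	# Concretely, blocks 0-1 live in ymm0..ymm3 and blocks 2-3 in
	# ymm4..ymm7, and each quarter-round step is emitted twice, once per
	# register set, so the arithmetic of one pair executes in the shadow
	# of the other pair's shuffle latency.  Logically this is just two
	# copies of the round loop sketched above, run in lock step
	# (illustrative):
	#
	#	for (i = 0; i < nrounds; i += 2) {
	#		doubleround(blocks_0_1);	/* ymm0..ymm3 */
	#		doubleround(blocks_2_3);	/* ymm4..ymm7 */
	#	}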
	vzeroupper

	# x0..3[0-3] = s0..3
	vbroadcasti128	0x00(%rdi),%ymm0
	vbroadcasti128	0x10(%rdi),%ymm1
	vbroadcasti128	0x20(%rdi),%ymm2
	vbroadcasti128	0x30(%rdi),%ymm3

	vmovdqa	%ymm0,%ymm4
	vmovdqa	%ymm1,%ymm5
	vmovdqa	%ymm2,%ymm6
	vmovdqa	%ymm3,%ymm7

	vpaddd	CTR2BL(%rip),%ymm3,%ymm3
	vpaddd	CTR4BL(%rip),%ymm7,%ymm7

	vmovdqa	%ymm0,%ymm11
	vmovdqa	%ymm1,%ymm12
	vmovdqa	%ymm2,%ymm13
	vmovdqa	%ymm3,%ymm14
	vmovdqa	%ymm7,%ymm15

.Ldoubleround4:

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd	%ymm1,%ymm0,%ymm0
	vpxord	%ymm0,%ymm3,%ymm3
	vprold	$16,%ymm3,%ymm3

	vpaddd	%ymm5,%ymm4,%ymm4
	vpxord	%ymm4,%ymm7,%ymm7
	vprold	$16,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd	%ymm3,%ymm2,%ymm2
	vpxord	%ymm2,%ymm1,%ymm1
	vprold	$12,%ymm1,%ymm1

	vpaddd	%ymm7,%ymm6,%ymm6
	vpxord	%ymm6,%ymm5,%ymm5
	vprold	$12,%ymm5,%ymm5

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd	%ymm1,%ymm0,%ymm0
	vpxord	%ymm0,%ymm3,%ymm3
	vprold	$8,%ymm3,%ymm3

	vpaddd	%ymm5,%ymm4,%ymm4
	vpxord	%ymm4,%ymm7,%ymm7
	vprold	$8,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd	%ymm3,%ymm2,%ymm2
	vpxord	%ymm2,%ymm1,%ymm1
	vprold	$7,%ymm1,%ymm1

	vpaddd	%ymm7,%ymm6,%ymm6
	vpxord	%ymm6,%ymm5,%ymm5
	vprold	$7,%ymm5,%ymm5

	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	vpshufd	$0x39,%ymm1,%ymm1
	vpshufd	$0x39,%ymm5,%ymm5
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd	$0x4e,%ymm2,%ymm2
	vpshufd	$0x4e,%ymm6,%ymm6
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	vpshufd	$0x93,%ymm3,%ymm3
	vpshufd	$0x93,%ymm7,%ymm7

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd	%ymm1,%ymm0,%ymm0
	vpxord	%ymm0,%ymm3,%ymm3
	vprold	$16,%ymm3,%ymm3

	vpaddd	%ymm5,%ymm4,%ymm4
	vpxord	%ymm4,%ymm7,%ymm7
	vprold	$16,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd	%ymm3,%ymm2,%ymm2
	vpxord	%ymm2,%ymm1,%ymm1
	vprold	$12,%ymm1,%ymm1

	vpaddd	%ymm7,%ymm6,%ymm6
	vpxord	%ymm6,%ymm5,%ymm5
	vprold	$12,%ymm5,%ymm5

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd	%ymm1,%ymm0,%ymm0
	vpxord	%ymm0,%ymm3,%ymm3
	vprold	$8,%ymm3,%ymm3

	vpaddd	%ymm5,%ymm4,%ymm4
	vpxord	%ymm4,%ymm7,%ymm7
	vprold	$8,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd	%ymm3,%ymm2,%ymm2
	vpxord	%ymm2,%ymm1,%ymm1
	vprold	$7,%ymm1,%ymm1

	vpaddd	%ymm7,%ymm6,%ymm6
	vpxord	%ymm6,%ymm5,%ymm5
	vprold	$7,%ymm5,%ymm5

	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	vpshufd	$0x93,%ymm1,%ymm1
	vpshufd	$0x93,%ymm5,%ymm5
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd	$0x4e,%ymm2,%ymm2
	vpshufd	$0x4e,%ymm6,%ymm6
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	vpshufd	$0x39,%ymm3,%ymm3
	vpshufd	$0x39,%ymm7,%ymm7

	sub	$2,%r8d
	jnz	.Ldoubleround4

	# o0 = i0 ^ (x0 + s0), first block
	vpaddd	%ymm11,%ymm0,%ymm10
	cmp	$0x10,%rcx
	jl	.Lxorpart4
	vpxord	0x00(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x00(%rsi)
	vextracti128	$1,%ymm10,%xmm0
	# o1 = i1 ^ (x1 + s1), first block
	vpaddd	%ymm12,%ymm1,%ymm10
	cmp	$0x20,%rcx
	jl	.Lxorpart4
	vpxord	0x10(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x10(%rsi)
	vextracti128	$1,%ymm10,%xmm1
	# o2 = i2 ^ (x2 + s2), first block
	vpaddd	%ymm13,%ymm2,%ymm10
	cmp	$0x30,%rcx
	jl	.Lxorpart4
	vpxord	0x20(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x20(%rsi)
	vextracti128	$1,%ymm10,%xmm2
	# o3 = i3 ^ (x3 + s3), first block
	vpaddd	%ymm14,%ymm3,%ymm10
	cmp	$0x40,%rcx
	jl	.Lxorpart4
	vpxord	0x30(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x30(%rsi)
	vextracti128	$1,%ymm10,%xmm3

	# xor and write second block
	vmovdqa	%xmm0,%xmm10
	cmp	$0x50,%rcx
	jl	.Lxorpart4
	vpxord	0x40(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x40(%rsi)

	vmovdqa	%xmm1,%xmm10
	cmp	$0x60,%rcx
	jl	.Lxorpart4
	vpxord	0x50(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x50(%rsi)

	vmovdqa	%xmm2,%xmm10
	cmp	$0x70,%rcx
	jl	.Lxorpart4
	vpxord	0x60(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x60(%rsi)

	vmovdqa	%xmm3,%xmm10
	cmp	$0x80,%rcx
	jl	.Lxorpart4
	vpxord	0x70(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x70(%rsi)

	# o0 = i0 ^ (x0 + s0), third block
	vpaddd	%ymm11,%ymm4,%ymm10
	cmp	$0x90,%rcx
	jl	.Lxorpart4
	vpxord	0x80(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x80(%rsi)
	vextracti128	$1,%ymm10,%xmm4
	# o1 = i1 ^ (x1 + s1), third block
	vpaddd	%ymm12,%ymm5,%ymm10
	cmp	$0xa0,%rcx
	jl	.Lxorpart4
	vpxord	0x90(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x90(%rsi)
	vextracti128	$1,%ymm10,%xmm5
	# o2 = i2 ^ (x2 + s2), third block
	vpaddd	%ymm13,%ymm6,%ymm10
	cmp	$0xb0,%rcx
	jl	.Lxorpart4
	vpxord	0xa0(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0xa0(%rsi)
	vextracti128	$1,%ymm10,%xmm6
	# o3 = i3 ^ (x3 + s3), third block
	vpaddd	%ymm15,%ymm7,%ymm10
	cmp	$0xc0,%rcx
	jl	.Lxorpart4
	vpxord	0xb0(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0xb0(%rsi)
	vextracti128	$1,%ymm10,%xmm7

	# xor and write fourth block
	vmovdqa	%xmm4,%xmm10
	cmp	$0xd0,%rcx
	jl	.Lxorpart4
	vpxord	0xc0(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0xc0(%rsi)

	vmovdqa	%xmm5,%xmm10
	cmp	$0xe0,%rcx
	jl	.Lxorpart4
	vpxord	0xd0(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0xd0(%rsi)

	vmovdqa	%xmm6,%xmm10
	cmp	$0xf0,%rcx
	jl	.Lxorpart4
	vpxord	0xe0(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0xe0(%rsi)

	vmovdqa	%xmm7,%xmm10
	cmp	$0x100,%rcx
	jl	.Lxorpart4
	vpxord	0xf0(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0xf0(%rsi)

.Ldone4:
	vzeroupper
	ret

.Lxorpart4:
	# xor remaining bytes from partial register into output
	mov	%rcx,%rax
	and	$0xf,%rcx
	jz	.Ldone4
	mov	%rax,%r9
	and	$~0xf,%r9

	mov	$1,%rax
	shld	%cl,%rax,%rax
	sub	$1,%rax
	kmovq	%rax,%k1

	vmovdqu8	(%rdx,%r9),%xmm1{%k1}{z}
	vpxord	%xmm10,%xmm1,%xmm1
	vmovdqu8	%xmm1,(%rsi,%r9){%k1}

	jmp	.Ldone4

SYM_FUNC_END(chacha_4block_xor_avx512vl)

SYM_FUNC_START(chacha_8block_xor_avx512vl)
	# %rdi: Input state matrix, s
	# %rsi: up to 8 data blocks output, o
	# %rdx: up to 8 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds

	# This function encrypts eight consecutive ChaCha blocks by loading
	# the state matrix in AVX registers eight times. Compared to AVX2, this
	# mostly benefits from the new rotate instructions in VL and the
	# additional registers.
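	#
	# The data layout here is transposed relative to the 2- and 4-block
	# variants: each of ymm0..ymm15 holds one state word for all eight
	# blocks (eight 32-bit lanes), so the quarter rounds need no word
	# shuffling at all.  Conceptually (illustrative C sketch):
	#
	#	u32 x[16][8];				/* x[word][block] */
	#
	#	for (w = 0; w < 16; w++)
	#		for (b = 0; b < 8; b++)
	#			x[w][b] = state[w];	/* vpbroadcastd   */
	#	for (b = 0; b < 8; b++)
	#		x[12][b] += b;			/* CTR8BL         */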
	vzeroupper

	# x0..15[0-7] = s[0..15]
	vpbroadcastd	0x00(%rdi),%ymm0
	vpbroadcastd	0x04(%rdi),%ymm1
	vpbroadcastd	0x08(%rdi),%ymm2
	vpbroadcastd	0x0c(%rdi),%ymm3
	vpbroadcastd	0x10(%rdi),%ymm4
	vpbroadcastd	0x14(%rdi),%ymm5
	vpbroadcastd	0x18(%rdi),%ymm6
	vpbroadcastd	0x1c(%rdi),%ymm7
	vpbroadcastd	0x20(%rdi),%ymm8
	vpbroadcastd	0x24(%rdi),%ymm9
	vpbroadcastd	0x28(%rdi),%ymm10
	vpbroadcastd	0x2c(%rdi),%ymm11
	vpbroadcastd	0x30(%rdi),%ymm12
	vpbroadcastd	0x34(%rdi),%ymm13
	vpbroadcastd	0x38(%rdi),%ymm14
	vpbroadcastd	0x3c(%rdi),%ymm15

	# x12 += counter values 0-7
	vpaddd	CTR8BL(%rip),%ymm12,%ymm12

	vmovdqa64	%ymm0,%ymm16
	vmovdqa64	%ymm1,%ymm17
	vmovdqa64	%ymm2,%ymm18
	vmovdqa64	%ymm3,%ymm19
	vmovdqa64	%ymm4,%ymm20
	vmovdqa64	%ymm5,%ymm21
	vmovdqa64	%ymm6,%ymm22
	vmovdqa64	%ymm7,%ymm23
	vmovdqa64	%ymm8,%ymm24
	vmovdqa64	%ymm9,%ymm25
	vmovdqa64	%ymm10,%ymm26
	vmovdqa64	%ymm11,%ymm27
	vmovdqa64	%ymm12,%ymm28
	vmovdqa64	%ymm13,%ymm29
	vmovdqa64	%ymm14,%ymm30
	vmovdqa64	%ymm15,%ymm31
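	# ymm16..ymm31 are reachable only with EVEX encodings (hence the
	# vmovdqa64 forms); parking the initial state there keeps the whole
	# working set in registers, so no stack spills are needed for the
	# final "+= s[0..15]" additions after the rounds.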
.Ldoubleround8:
	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	vpaddd	%ymm0,%ymm4,%ymm0
	vpxord	%ymm0,%ymm12,%ymm12
	vprold	$16,%ymm12,%ymm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	vpaddd	%ymm1,%ymm5,%ymm1
	vpxord	%ymm1,%ymm13,%ymm13
	vprold	$16,%ymm13,%ymm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	vpaddd	%ymm2,%ymm6,%ymm2
	vpxord	%ymm2,%ymm14,%ymm14
	vprold	$16,%ymm14,%ymm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	vpaddd	%ymm3,%ymm7,%ymm3
	vpxord	%ymm3,%ymm15,%ymm15
	vprold	$16,%ymm15,%ymm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	vpaddd	%ymm12,%ymm8,%ymm8
	vpxord	%ymm8,%ymm4,%ymm4
	vprold	$12,%ymm4,%ymm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	vpaddd	%ymm13,%ymm9,%ymm9
	vpxord	%ymm9,%ymm5,%ymm5
	vprold	$12,%ymm5,%ymm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	vpaddd	%ymm14,%ymm10,%ymm10
	vpxord	%ymm10,%ymm6,%ymm6
	vprold	$12,%ymm6,%ymm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	vpaddd	%ymm15,%ymm11,%ymm11
	vpxord	%ymm11,%ymm7,%ymm7
	vprold	$12,%ymm7,%ymm7

	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	vpaddd	%ymm0,%ymm4,%ymm0
	vpxord	%ymm0,%ymm12,%ymm12
	vprold	$8,%ymm12,%ymm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	vpaddd	%ymm1,%ymm5,%ymm1
	vpxord	%ymm1,%ymm13,%ymm13
	vprold	$8,%ymm13,%ymm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	vpaddd	%ymm2,%ymm6,%ymm2
	vpxord	%ymm2,%ymm14,%ymm14
	vprold	$8,%ymm14,%ymm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	vpaddd	%ymm3,%ymm7,%ymm3
	vpxord	%ymm3,%ymm15,%ymm15
	vprold	$8,%ymm15,%ymm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	vpaddd	%ymm12,%ymm8,%ymm8
	vpxord	%ymm8,%ymm4,%ymm4
	vprold	$7,%ymm4,%ymm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	vpaddd	%ymm13,%ymm9,%ymm9
	vpxord	%ymm9,%ymm5,%ymm5
	vprold	$7,%ymm5,%ymm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	vpaddd	%ymm14,%ymm10,%ymm10
	vpxord	%ymm10,%ymm6,%ymm6
	vprold	$7,%ymm6,%ymm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	vpaddd	%ymm15,%ymm11,%ymm11
	vpxord	%ymm11,%ymm7,%ymm7
	vprold	$7,%ymm7,%ymm7

	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	vpaddd	%ymm0,%ymm5,%ymm0
	vpxord	%ymm0,%ymm15,%ymm15
	vprold	$16,%ymm15,%ymm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	vpaddd	%ymm1,%ymm6,%ymm1
	vpxord	%ymm1,%ymm12,%ymm12
	vprold	$16,%ymm12,%ymm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	vpaddd	%ymm2,%ymm7,%ymm2
	vpxord	%ymm2,%ymm13,%ymm13
	vprold	$16,%ymm13,%ymm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	vpaddd	%ymm3,%ymm4,%ymm3
	vpxord	%ymm3,%ymm14,%ymm14
	vprold	$16,%ymm14,%ymm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	vpaddd	%ymm15,%ymm10,%ymm10
	vpxord	%ymm10,%ymm5,%ymm5
	vprold	$12,%ymm5,%ymm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	vpaddd	%ymm12,%ymm11,%ymm11
	vpxord	%ymm11,%ymm6,%ymm6
	vprold	$12,%ymm6,%ymm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	vpaddd	%ymm13,%ymm8,%ymm8
	vpxord	%ymm8,%ymm7,%ymm7
	vprold	$12,%ymm7,%ymm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	vpaddd	%ymm14,%ymm9,%ymm9
	vpxord	%ymm9,%ymm4,%ymm4
	vprold	$12,%ymm4,%ymm4

	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	vpaddd	%ymm0,%ymm5,%ymm0
	vpxord	%ymm0,%ymm15,%ymm15
	vprold	$8,%ymm15,%ymm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	vpaddd	%ymm1,%ymm6,%ymm1
	vpxord	%ymm1,%ymm12,%ymm12
	vprold	$8,%ymm12,%ymm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	vpaddd	%ymm2,%ymm7,%ymm2
	vpxord	%ymm2,%ymm13,%ymm13
	vprold	$8,%ymm13,%ymm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	vpaddd	%ymm3,%ymm4,%ymm3
	vpxord	%ymm3,%ymm14,%ymm14
	vprold	$8,%ymm14,%ymm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	vpaddd	%ymm15,%ymm10,%ymm10
	vpxord	%ymm10,%ymm5,%ymm5
	vprold	$7,%ymm5,%ymm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	vpaddd	%ymm12,%ymm11,%ymm11
	vpxord	%ymm11,%ymm6,%ymm6
	vprold	$7,%ymm6,%ymm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	vpaddd	%ymm13,%ymm8,%ymm8
	vpxord	%ymm8,%ymm7,%ymm7
	vprold	$7,%ymm7,%ymm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	vpaddd	%ymm14,%ymm9,%ymm9
	vpxord	%ymm9,%ymm4,%ymm4
	vprold	$7,%ymm4,%ymm4

	sub	$2,%r8d
	jnz	.Ldoubleround8

	# x0..15[0-7] += s[0..15]
	vpaddd	%ymm16,%ymm0,%ymm0
	vpaddd	%ymm17,%ymm1,%ymm1
	vpaddd	%ymm18,%ymm2,%ymm2
	vpaddd	%ymm19,%ymm3,%ymm3
	vpaddd	%ymm20,%ymm4,%ymm4
	vpaddd	%ymm21,%ymm5,%ymm5
	vpaddd	%ymm22,%ymm6,%ymm6
	vpaddd	%ymm23,%ymm7,%ymm7
	vpaddd	%ymm24,%ymm8,%ymm8
	vpaddd	%ymm25,%ymm9,%ymm9
	vpaddd	%ymm26,%ymm10,%ymm10
	vpaddd	%ymm27,%ymm11,%ymm11
	vpaddd	%ymm28,%ymm12,%ymm12
	vpaddd	%ymm29,%ymm13,%ymm13
	vpaddd	%ymm30,%ymm14,%ymm14
	vpaddd	%ymm31,%ymm15,%ymm15

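	# At this point each of ymm0..ymm15 still holds one state word across
	# all eight blocks.  The three interleave stages below (32-bit,
	# 64-bit, then 128-bit lanes) transpose that layout so that each
	# register written out contains 32 contiguous keystream bytes of a
	# single block.  The net effect, in C terms (illustrative sketch):
	#
	#	u32 words[16][8];	/* words[word][block], as computed */
	#	u32 blocks[8][16];	/* blocks[block][word], as written */
	#
	#	for (w = 0; w < 16; w++)
	#		for (b = 0; b < 8; b++)
	#			blocks[b][w] = words[w][b];
	#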
	# interleave 32-bit words in state n, n+1
	vpunpckldq	%ymm1,%ymm0,%ymm16
	vpunpckhdq	%ymm1,%ymm0,%ymm17
	vpunpckldq	%ymm3,%ymm2,%ymm18
	vpunpckhdq	%ymm3,%ymm2,%ymm19
	vpunpckldq	%ymm5,%ymm4,%ymm20
	vpunpckhdq	%ymm5,%ymm4,%ymm21
	vpunpckldq	%ymm7,%ymm6,%ymm22
	vpunpckhdq	%ymm7,%ymm6,%ymm23
	vpunpckldq	%ymm9,%ymm8,%ymm24
	vpunpckhdq	%ymm9,%ymm8,%ymm25
	vpunpckldq	%ymm11,%ymm10,%ymm26
	vpunpckhdq	%ymm11,%ymm10,%ymm27
	vpunpckldq	%ymm13,%ymm12,%ymm28
	vpunpckhdq	%ymm13,%ymm12,%ymm29
	vpunpckldq	%ymm15,%ymm14,%ymm30
	vpunpckhdq	%ymm15,%ymm14,%ymm31

	# interleave 64-bit words in state n, n+2
	vpunpcklqdq	%ymm18,%ymm16,%ymm0
	vpunpcklqdq	%ymm19,%ymm17,%ymm1
	vpunpckhqdq	%ymm18,%ymm16,%ymm2
	vpunpckhqdq	%ymm19,%ymm17,%ymm3
	vpunpcklqdq	%ymm22,%ymm20,%ymm4
	vpunpcklqdq	%ymm23,%ymm21,%ymm5
	vpunpckhqdq	%ymm22,%ymm20,%ymm6
	vpunpckhqdq	%ymm23,%ymm21,%ymm7
	vpunpcklqdq	%ymm26,%ymm24,%ymm8
	vpunpcklqdq	%ymm27,%ymm25,%ymm9
	vpunpckhqdq	%ymm26,%ymm24,%ymm10
	vpunpckhqdq	%ymm27,%ymm25,%ymm11
	vpunpcklqdq	%ymm30,%ymm28,%ymm12
	vpunpcklqdq	%ymm31,%ymm29,%ymm13
	vpunpckhqdq	%ymm30,%ymm28,%ymm14
	vpunpckhqdq	%ymm31,%ymm29,%ymm15

	# interleave 128-bit words in state n, n+4
	# xor/write first four blocks
	vmovdqa64	%ymm0,%ymm16
	vperm2i128	$0x20,%ymm4,%ymm0,%ymm0
	cmp	$0x0020,%rcx
	jl	.Lxorpart8
	vpxord	0x0000(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0000(%rsi)
	vmovdqa64	%ymm16,%ymm0
	vperm2i128	$0x31,%ymm4,%ymm0,%ymm4

	vperm2i128	$0x20,%ymm12,%ymm8,%ymm0
	cmp	$0x0040,%rcx
	jl	.Lxorpart8
	vpxord	0x0020(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0020(%rsi)
	vperm2i128	$0x31,%ymm12,%ymm8,%ymm12

	vperm2i128	$0x20,%ymm6,%ymm2,%ymm0
	cmp	$0x0060,%rcx
	jl	.Lxorpart8
	vpxord	0x0040(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0040(%rsi)
	vperm2i128	$0x31,%ymm6,%ymm2,%ymm6

	vperm2i128	$0x20,%ymm14,%ymm10,%ymm0
	cmp	$0x0080,%rcx
	jl	.Lxorpart8
	vpxord	0x0060(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0060(%rsi)
	vperm2i128	$0x31,%ymm14,%ymm10,%ymm14

	vperm2i128	$0x20,%ymm5,%ymm1,%ymm0
	cmp	$0x00a0,%rcx
	jl	.Lxorpart8
	vpxord	0x0080(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0080(%rsi)
	vperm2i128	$0x31,%ymm5,%ymm1,%ymm5

	vperm2i128	$0x20,%ymm13,%ymm9,%ymm0
	cmp	$0x00c0,%rcx
	jl	.Lxorpart8
	vpxord	0x00a0(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x00a0(%rsi)
	vperm2i128	$0x31,%ymm13,%ymm9,%ymm13

	vperm2i128	$0x20,%ymm7,%ymm3,%ymm0
	cmp	$0x00e0,%rcx
	jl	.Lxorpart8
	vpxord	0x00c0(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x00c0(%rsi)
	vperm2i128	$0x31,%ymm7,%ymm3,%ymm7

	vperm2i128	$0x20,%ymm15,%ymm11,%ymm0
	cmp	$0x0100,%rcx
	jl	.Lxorpart8
	vpxord	0x00e0(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x00e0(%rsi)
	vperm2i128	$0x31,%ymm15,%ymm11,%ymm15

	# xor remaining blocks, write to output
	vmovdqa64	%ymm4,%ymm0
	cmp	$0x0120,%rcx
	jl	.Lxorpart8
	vpxord	0x0100(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0100(%rsi)

	vmovdqa64	%ymm12,%ymm0
	cmp	$0x0140,%rcx
	jl	.Lxorpart8
	vpxord	0x0120(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0120(%rsi)

	vmovdqa64	%ymm6,%ymm0
	cmp	$0x0160,%rcx
	jl	.Lxorpart8
	vpxord	0x0140(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0140(%rsi)

	vmovdqa64	%ymm14,%ymm0
	cmp	$0x0180,%rcx
	jl	.Lxorpart8
	vpxord	0x0160(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0160(%rsi)

	vmovdqa64	%ymm5,%ymm0
	cmp	$0x01a0,%rcx
	jl	.Lxorpart8
	vpxord	0x0180(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0180(%rsi)

	vmovdqa64	%ymm13,%ymm0
	cmp	$0x01c0,%rcx
	jl	.Lxorpart8
	vpxord	0x01a0(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x01a0(%rsi)

	vmovdqa64	%ymm7,%ymm0
	cmp	$0x01e0,%rcx
	jl	.Lxorpart8
	vpxord	0x01c0(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x01c0(%rsi)

	vmovdqa64	%ymm15,%ymm0
	cmp	$0x0200,%rcx
	jl	.Lxorpart8
	vpxord	0x01e0(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x01e0(%rsi)

.Ldone8:
	vzeroupper
	ret

.Lxorpart8:
	# xor remaining bytes from partial register into output
	mov	%rcx,%rax
	and	$0x1f,%rcx
	jz	.Ldone8
	mov	%rax,%r9
	and	$~0x1f,%r9

	mov	$1,%rax
	shld	%cl,%rax,%rax
	sub	$1,%rax
	kmovq	%rax,%k1

	vmovdqu8	(%rdx,%r9),%ymm1{%k1}{z}
	vpxord	%ymm0,%ymm1,%ymm1
	vmovdqu8	%ymm1,(%rsi,%r9){%k1}

	jmp	.Ldone8

SYM_FUNC_END(chacha_8block_xor_avx512vl)