Orange Pi5 kernel

Deprecated Linux kernel 5.10.110 for OrangePi 5/5B/5+ boards

3 Commits   0 Branches   0 Tags
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300    1) /* SPDX-License-Identifier: GPL-2.0-or-later */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300    2) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300    3)  * ChaCha 256-bit cipher algorithm, x64 AVX2 functions
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300    4)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300    5)  * Copyright (C) 2015 Martin Willi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300    6)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300    7) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300    8) #include <linux/linkage.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300    9) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   10) .section	.rodata.cst32.ROT8, "aM", @progbits, 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   11) .align 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   12) ROT8:	.octa 0x0e0d0c0f0a09080b0605040702010003
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   13) 	.octa 0x0e0d0c0f0a09080b0605040702010003
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   14) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   15) .section	.rodata.cst32.ROT16, "aM", @progbits, 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   16) .align 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   17) ROT16:	.octa 0x0d0c0f0e09080b0a0504070601000302
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   18) 	.octa 0x0d0c0f0e09080b0a0504070601000302
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   19) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   20) .section	.rodata.cst32.CTRINC, "aM", @progbits, 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   21) .align 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   22) CTRINC:	.octa 0x00000003000000020000000100000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   23) 	.octa 0x00000007000000060000000500000004
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   24) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   25) .section	.rodata.cst32.CTR2BL, "aM", @progbits, 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   26) .align 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   27) CTR2BL:	.octa 0x00000000000000000000000000000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   28) 	.octa 0x00000000000000000000000000000001
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   29) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   30) .section	.rodata.cst32.CTR4BL, "aM", @progbits, 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   31) .align 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   32) CTR4BL:	.octa 0x00000000000000000000000000000002
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   33) 	.octa 0x00000000000000000000000000000003
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   34) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   35) .text
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   36) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   37) SYM_FUNC_START(chacha_2block_xor_avx2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   38) 	# %rdi: Input state matrix, s
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   39) 	# %rsi: up to 2 data blocks output, o
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   40) 	# %rdx: up to 2 data blocks input, i
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   41) 	# %rcx: input/output length in bytes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   42) 	# %r8d: nrounds
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   43) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   44) 	# This function encrypts two ChaCha blocks by loading the state
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   45) 	# matrix twice across four AVX registers. It performs matrix operations
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   46) 	# on four words in each matrix in parallel, but requires shuffling to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   47) 	# rearrange the words after each round.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   48) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   49) 	vzeroupper
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   50) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   51) 	# x0..3[0-2] = s0..3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   52) 	vbroadcasti128	0x00(%rdi),%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   53) 	vbroadcasti128	0x10(%rdi),%ymm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   54) 	vbroadcasti128	0x20(%rdi),%ymm2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   55) 	vbroadcasti128	0x30(%rdi),%ymm3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   56) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   57) 	vpaddd		CTR2BL(%rip),%ymm3,%ymm3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   58) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   59) 	vmovdqa		%ymm0,%ymm8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   60) 	vmovdqa		%ymm1,%ymm9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   61) 	vmovdqa		%ymm2,%ymm10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   62) 	vmovdqa		%ymm3,%ymm11
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   63) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   64) 	vmovdqa		ROT8(%rip),%ymm4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   65) 	vmovdqa		ROT16(%rip),%ymm5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   66) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   67) 	mov		%rcx,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   68) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   69) .Ldoubleround:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   70) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   71) 	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   72) 	vpaddd		%ymm1,%ymm0,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   73) 	vpxor		%ymm0,%ymm3,%ymm3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   74) 	vpshufb		%ymm5,%ymm3,%ymm3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   75) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   76) 	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   77) 	vpaddd		%ymm3,%ymm2,%ymm2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   78) 	vpxor		%ymm2,%ymm1,%ymm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   79) 	vmovdqa		%ymm1,%ymm6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   80) 	vpslld		$12,%ymm6,%ymm6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   81) 	vpsrld		$20,%ymm1,%ymm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   82) 	vpor		%ymm6,%ymm1,%ymm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   83) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   84) 	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   85) 	vpaddd		%ymm1,%ymm0,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   86) 	vpxor		%ymm0,%ymm3,%ymm3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   87) 	vpshufb		%ymm4,%ymm3,%ymm3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   88) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   89) 	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   90) 	vpaddd		%ymm3,%ymm2,%ymm2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   91) 	vpxor		%ymm2,%ymm1,%ymm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   92) 	vmovdqa		%ymm1,%ymm7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   93) 	vpslld		$7,%ymm7,%ymm7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   94) 	vpsrld		$25,%ymm1,%ymm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   95) 	vpor		%ymm7,%ymm1,%ymm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   96) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   97) 	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   98) 	vpshufd		$0x39,%ymm1,%ymm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   99) 	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  100) 	vpshufd		$0x4e,%ymm2,%ymm2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  101) 	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  102) 	vpshufd		$0x93,%ymm3,%ymm3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  103) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  104) 	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  105) 	vpaddd		%ymm1,%ymm0,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  106) 	vpxor		%ymm0,%ymm3,%ymm3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  107) 	vpshufb		%ymm5,%ymm3,%ymm3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  108) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  109) 	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  110) 	vpaddd		%ymm3,%ymm2,%ymm2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  111) 	vpxor		%ymm2,%ymm1,%ymm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  112) 	vmovdqa		%ymm1,%ymm6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  113) 	vpslld		$12,%ymm6,%ymm6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  114) 	vpsrld		$20,%ymm1,%ymm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  115) 	vpor		%ymm6,%ymm1,%ymm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  116) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  117) 	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  118) 	vpaddd		%ymm1,%ymm0,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  119) 	vpxor		%ymm0,%ymm3,%ymm3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  120) 	vpshufb		%ymm4,%ymm3,%ymm3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  121) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  122) 	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  123) 	vpaddd		%ymm3,%ymm2,%ymm2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  124) 	vpxor		%ymm2,%ymm1,%ymm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  125) 	vmovdqa		%ymm1,%ymm7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  126) 	vpslld		$7,%ymm7,%ymm7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  127) 	vpsrld		$25,%ymm1,%ymm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  128) 	vpor		%ymm7,%ymm1,%ymm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  129) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  130) 	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  131) 	vpshufd		$0x93,%ymm1,%ymm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  132) 	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  133) 	vpshufd		$0x4e,%ymm2,%ymm2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  134) 	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  135) 	vpshufd		$0x39,%ymm3,%ymm3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  136) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  137) 	sub		$2,%r8d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  138) 	jnz		.Ldoubleround
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  139) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  140) 	# o0 = i0 ^ (x0 + s0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  141) 	vpaddd		%ymm8,%ymm0,%ymm7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  142) 	cmp		$0x10,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  143) 	jl		.Lxorpart2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  144) 	vpxor		0x00(%rdx),%xmm7,%xmm6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  145) 	vmovdqu		%xmm6,0x00(%rsi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  146) 	vextracti128	$1,%ymm7,%xmm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  147) 	# o1 = i1 ^ (x1 + s1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  148) 	vpaddd		%ymm9,%ymm1,%ymm7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  149) 	cmp		$0x20,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  150) 	jl		.Lxorpart2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  151) 	vpxor		0x10(%rdx),%xmm7,%xmm6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  152) 	vmovdqu		%xmm6,0x10(%rsi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  153) 	vextracti128	$1,%ymm7,%xmm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  154) 	# o2 = i2 ^ (x2 + s2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  155) 	vpaddd		%ymm10,%ymm2,%ymm7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  156) 	cmp		$0x30,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  157) 	jl		.Lxorpart2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  158) 	vpxor		0x20(%rdx),%xmm7,%xmm6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  159) 	vmovdqu		%xmm6,0x20(%rsi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  160) 	vextracti128	$1,%ymm7,%xmm2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  161) 	# o3 = i3 ^ (x3 + s3)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  162) 	vpaddd		%ymm11,%ymm3,%ymm7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  163) 	cmp		$0x40,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  164) 	jl		.Lxorpart2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  165) 	vpxor		0x30(%rdx),%xmm7,%xmm6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  166) 	vmovdqu		%xmm6,0x30(%rsi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  167) 	vextracti128	$1,%ymm7,%xmm3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  168) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  169) 	# xor and write second block
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  170) 	vmovdqa		%xmm0,%xmm7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  171) 	cmp		$0x50,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  172) 	jl		.Lxorpart2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  173) 	vpxor		0x40(%rdx),%xmm7,%xmm6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  174) 	vmovdqu		%xmm6,0x40(%rsi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  175) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  176) 	vmovdqa		%xmm1,%xmm7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  177) 	cmp		$0x60,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  178) 	jl		.Lxorpart2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  179) 	vpxor		0x50(%rdx),%xmm7,%xmm6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  180) 	vmovdqu		%xmm6,0x50(%rsi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  181) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  182) 	vmovdqa		%xmm2,%xmm7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  183) 	cmp		$0x70,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  184) 	jl		.Lxorpart2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  185) 	vpxor		0x60(%rdx),%xmm7,%xmm6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  186) 	vmovdqu		%xmm6,0x60(%rsi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  187) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  188) 	vmovdqa		%xmm3,%xmm7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  189) 	cmp		$0x80,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  190) 	jl		.Lxorpart2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  191) 	vpxor		0x70(%rdx),%xmm7,%xmm6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  192) 	vmovdqu		%xmm6,0x70(%rsi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  193) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  194) .Ldone2:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  195) 	vzeroupper
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  196) 	ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  197) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  198) .Lxorpart2:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  199) 	# xor remaining bytes from partial register into output
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  200) 	mov		%rax,%r9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  201) 	and		$0x0f,%r9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  202) 	jz		.Ldone2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  203) 	and		$~0x0f,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  204) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  205) 	mov		%rsi,%r11
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  206) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  207) 	lea		8(%rsp),%r10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  208) 	sub		$0x10,%rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  209) 	and		$~31,%rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  210) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  211) 	lea		(%rdx,%rax),%rsi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  212) 	mov		%rsp,%rdi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  213) 	mov		%r9,%rcx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  214) 	rep movsb
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  215) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  216) 	vpxor		0x00(%rsp),%xmm7,%xmm7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  217) 	vmovdqa		%xmm7,0x00(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  218) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  219) 	mov		%rsp,%rsi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  220) 	lea		(%r11,%rax),%rdi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  221) 	mov		%r9,%rcx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  222) 	rep movsb
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  223) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  224) 	lea		-8(%r10),%rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  225) 	jmp		.Ldone2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  226) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  227) SYM_FUNC_END(chacha_2block_xor_avx2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  228) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  229) SYM_FUNC_START(chacha_4block_xor_avx2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  230) 	# %rdi: Input state matrix, s
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  231) 	# %rsi: up to 4 data blocks output, o
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  232) 	# %rdx: up to 4 data blocks input, i
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  233) 	# %rcx: input/output length in bytes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  234) 	# %r8d: nrounds
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  235) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  236) 	# This function encrypts four ChaCha blocks by loading the state
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  237) 	# matrix four times across eight AVX registers. It performs matrix
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  238) 	# operations on four words in two matrices in parallel, sequentially
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  239) 	# to the operations on the four words of the other two matrices. The
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  240) 	# required word shuffling has a rather high latency, we can do the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  241) 	# arithmetic on two matrix-pairs without much slowdown.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  242) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  243) 	vzeroupper
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  244) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  245) 	# x0..3[0-4] = s0..3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  246) 	vbroadcasti128	0x00(%rdi),%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  247) 	vbroadcasti128	0x10(%rdi),%ymm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  248) 	vbroadcasti128	0x20(%rdi),%ymm2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  249) 	vbroadcasti128	0x30(%rdi),%ymm3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  250) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  251) 	vmovdqa		%ymm0,%ymm4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  252) 	vmovdqa		%ymm1,%ymm5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  253) 	vmovdqa		%ymm2,%ymm6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  254) 	vmovdqa		%ymm3,%ymm7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  255) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  256) 	vpaddd		CTR2BL(%rip),%ymm3,%ymm3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  257) 	vpaddd		CTR4BL(%rip),%ymm7,%ymm7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  258) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  259) 	vmovdqa		%ymm0,%ymm11
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  260) 	vmovdqa		%ymm1,%ymm12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  261) 	vmovdqa		%ymm2,%ymm13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  262) 	vmovdqa		%ymm3,%ymm14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  263) 	vmovdqa		%ymm7,%ymm15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  264) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  265) 	vmovdqa		ROT8(%rip),%ymm8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  266) 	vmovdqa		ROT16(%rip),%ymm9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  267) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  268) 	mov		%rcx,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  269) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  270) .Ldoubleround4:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  271) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  272) 	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  273) 	vpaddd		%ymm1,%ymm0,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  274) 	vpxor		%ymm0,%ymm3,%ymm3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  275) 	vpshufb		%ymm9,%ymm3,%ymm3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  276) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  277) 	vpaddd		%ymm5,%ymm4,%ymm4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  278) 	vpxor		%ymm4,%ymm7,%ymm7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  279) 	vpshufb		%ymm9,%ymm7,%ymm7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  280) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  281) 	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  282) 	vpaddd		%ymm3,%ymm2,%ymm2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  283) 	vpxor		%ymm2,%ymm1,%ymm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  284) 	vmovdqa		%ymm1,%ymm10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  285) 	vpslld		$12,%ymm10,%ymm10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  286) 	vpsrld		$20,%ymm1,%ymm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  287) 	vpor		%ymm10,%ymm1,%ymm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  288) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  289) 	vpaddd		%ymm7,%ymm6,%ymm6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  290) 	vpxor		%ymm6,%ymm5,%ymm5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  291) 	vmovdqa		%ymm5,%ymm10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  292) 	vpslld		$12,%ymm10,%ymm10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  293) 	vpsrld		$20,%ymm5,%ymm5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  294) 	vpor		%ymm10,%ymm5,%ymm5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  295) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  296) 	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  297) 	vpaddd		%ymm1,%ymm0,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  298) 	vpxor		%ymm0,%ymm3,%ymm3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  299) 	vpshufb		%ymm8,%ymm3,%ymm3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  300) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  301) 	vpaddd		%ymm5,%ymm4,%ymm4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  302) 	vpxor		%ymm4,%ymm7,%ymm7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  303) 	vpshufb		%ymm8,%ymm7,%ymm7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  304) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  305) 	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  306) 	vpaddd		%ymm3,%ymm2,%ymm2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  307) 	vpxor		%ymm2,%ymm1,%ymm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  308) 	vmovdqa		%ymm1,%ymm10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  309) 	vpslld		$7,%ymm10,%ymm10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  310) 	vpsrld		$25,%ymm1,%ymm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  311) 	vpor		%ymm10,%ymm1,%ymm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  312) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  313) 	vpaddd		%ymm7,%ymm6,%ymm6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  314) 	vpxor		%ymm6,%ymm5,%ymm5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  315) 	vmovdqa		%ymm5,%ymm10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  316) 	vpslld		$7,%ymm10,%ymm10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  317) 	vpsrld		$25,%ymm5,%ymm5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  318) 	vpor		%ymm10,%ymm5,%ymm5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  319) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  320) 	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  321) 	vpshufd		$0x39,%ymm1,%ymm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  322) 	vpshufd		$0x39,%ymm5,%ymm5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  323) 	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  324) 	vpshufd		$0x4e,%ymm2,%ymm2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  325) 	vpshufd		$0x4e,%ymm6,%ymm6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  326) 	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  327) 	vpshufd		$0x93,%ymm3,%ymm3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  328) 	vpshufd		$0x93,%ymm7,%ymm7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  329) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  330) 	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  331) 	vpaddd		%ymm1,%ymm0,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  332) 	vpxor		%ymm0,%ymm3,%ymm3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  333) 	vpshufb		%ymm9,%ymm3,%ymm3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  334) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  335) 	vpaddd		%ymm5,%ymm4,%ymm4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  336) 	vpxor		%ymm4,%ymm7,%ymm7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  337) 	vpshufb		%ymm9,%ymm7,%ymm7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  338) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  339) 	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  340) 	vpaddd		%ymm3,%ymm2,%ymm2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  341) 	vpxor		%ymm2,%ymm1,%ymm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  342) 	vmovdqa		%ymm1,%ymm10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  343) 	vpslld		$12,%ymm10,%ymm10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  344) 	vpsrld		$20,%ymm1,%ymm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  345) 	vpor		%ymm10,%ymm1,%ymm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  346) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  347) 	vpaddd		%ymm7,%ymm6,%ymm6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  348) 	vpxor		%ymm6,%ymm5,%ymm5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  349) 	vmovdqa		%ymm5,%ymm10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  350) 	vpslld		$12,%ymm10,%ymm10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  351) 	vpsrld		$20,%ymm5,%ymm5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  352) 	vpor		%ymm10,%ymm5,%ymm5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  353) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  354) 	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  355) 	vpaddd		%ymm1,%ymm0,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  356) 	vpxor		%ymm0,%ymm3,%ymm3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  357) 	vpshufb		%ymm8,%ymm3,%ymm3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  358) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  359) 	vpaddd		%ymm5,%ymm4,%ymm4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  360) 	vpxor		%ymm4,%ymm7,%ymm7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  361) 	vpshufb		%ymm8,%ymm7,%ymm7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  362) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  363) 	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  364) 	vpaddd		%ymm3,%ymm2,%ymm2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  365) 	vpxor		%ymm2,%ymm1,%ymm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  366) 	vmovdqa		%ymm1,%ymm10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  367) 	vpslld		$7,%ymm10,%ymm10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  368) 	vpsrld		$25,%ymm1,%ymm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  369) 	vpor		%ymm10,%ymm1,%ymm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  370) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  371) 	vpaddd		%ymm7,%ymm6,%ymm6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  372) 	vpxor		%ymm6,%ymm5,%ymm5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  373) 	vmovdqa		%ymm5,%ymm10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  374) 	vpslld		$7,%ymm10,%ymm10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  375) 	vpsrld		$25,%ymm5,%ymm5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  376) 	vpor		%ymm10,%ymm5,%ymm5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  377) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  378) 	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  379) 	vpshufd		$0x93,%ymm1,%ymm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  380) 	vpshufd		$0x93,%ymm5,%ymm5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  381) 	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  382) 	vpshufd		$0x4e,%ymm2,%ymm2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  383) 	vpshufd		$0x4e,%ymm6,%ymm6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  384) 	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  385) 	vpshufd		$0x39,%ymm3,%ymm3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  386) 	vpshufd		$0x39,%ymm7,%ymm7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  387) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  388) 	sub		$2,%r8d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  389) 	jnz		.Ldoubleround4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  390) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  391) 	# o0 = i0 ^ (x0 + s0), first block
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  392) 	vpaddd		%ymm11,%ymm0,%ymm10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  393) 	cmp		$0x10,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  394) 	jl		.Lxorpart4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  395) 	vpxor		0x00(%rdx),%xmm10,%xmm9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  396) 	vmovdqu		%xmm9,0x00(%rsi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  397) 	vextracti128	$1,%ymm10,%xmm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  398) 	# o1 = i1 ^ (x1 + s1), first block
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  399) 	vpaddd		%ymm12,%ymm1,%ymm10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  400) 	cmp		$0x20,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  401) 	jl		.Lxorpart4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  402) 	vpxor		0x10(%rdx),%xmm10,%xmm9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  403) 	vmovdqu		%xmm9,0x10(%rsi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  404) 	vextracti128	$1,%ymm10,%xmm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  405) 	# o2 = i2 ^ (x2 + s2), first block
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  406) 	vpaddd		%ymm13,%ymm2,%ymm10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  407) 	cmp		$0x30,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  408) 	jl		.Lxorpart4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  409) 	vpxor		0x20(%rdx),%xmm10,%xmm9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  410) 	vmovdqu		%xmm9,0x20(%rsi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  411) 	vextracti128	$1,%ymm10,%xmm2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  412) 	# o3 = i3 ^ (x3 + s3), first block
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  413) 	vpaddd		%ymm14,%ymm3,%ymm10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  414) 	cmp		$0x40,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  415) 	jl		.Lxorpart4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  416) 	vpxor		0x30(%rdx),%xmm10,%xmm9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  417) 	vmovdqu		%xmm9,0x30(%rsi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  418) 	vextracti128	$1,%ymm10,%xmm3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  419) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  420) 	# xor and write second block
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  421) 	vmovdqa		%xmm0,%xmm10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  422) 	cmp		$0x50,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  423) 	jl		.Lxorpart4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  424) 	vpxor		0x40(%rdx),%xmm10,%xmm9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  425) 	vmovdqu		%xmm9,0x40(%rsi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  426) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  427) 	vmovdqa		%xmm1,%xmm10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  428) 	cmp		$0x60,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  429) 	jl		.Lxorpart4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  430) 	vpxor		0x50(%rdx),%xmm10,%xmm9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  431) 	vmovdqu		%xmm9,0x50(%rsi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  432) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  433) 	vmovdqa		%xmm2,%xmm10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  434) 	cmp		$0x70,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  435) 	jl		.Lxorpart4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  436) 	vpxor		0x60(%rdx),%xmm10,%xmm9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  437) 	vmovdqu		%xmm9,0x60(%rsi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  438) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  439) 	vmovdqa		%xmm3,%xmm10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  440) 	cmp		$0x80,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  441) 	jl		.Lxorpart4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  442) 	vpxor		0x70(%rdx),%xmm10,%xmm9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  443) 	vmovdqu		%xmm9,0x70(%rsi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  444) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  445) 	# o0 = i0 ^ (x0 + s0), third block
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  446) 	vpaddd		%ymm11,%ymm4,%ymm10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  447) 	cmp		$0x90,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  448) 	jl		.Lxorpart4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  449) 	vpxor		0x80(%rdx),%xmm10,%xmm9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  450) 	vmovdqu		%xmm9,0x80(%rsi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  451) 	vextracti128	$1,%ymm10,%xmm4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  452) 	# o1 = i1 ^ (x1 + s1), third block
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  453) 	vpaddd		%ymm12,%ymm5,%ymm10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  454) 	cmp		$0xa0,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  455) 	jl		.Lxorpart4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  456) 	vpxor		0x90(%rdx),%xmm10,%xmm9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  457) 	vmovdqu		%xmm9,0x90(%rsi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  458) 	vextracti128	$1,%ymm10,%xmm5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  459) 	# o2 = i2 ^ (x2 + s2), third block
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  460) 	vpaddd		%ymm13,%ymm6,%ymm10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  461) 	cmp		$0xb0,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  462) 	jl		.Lxorpart4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  463) 	vpxor		0xa0(%rdx),%xmm10,%xmm9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  464) 	vmovdqu		%xmm9,0xa0(%rsi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  465) 	vextracti128	$1,%ymm10,%xmm6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  466) 	# o3 = i3 ^ (x3 + s3), third block
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  467) 	vpaddd		%ymm15,%ymm7,%ymm10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  468) 	cmp		$0xc0,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  469) 	jl		.Lxorpart4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  470) 	vpxor		0xb0(%rdx),%xmm10,%xmm9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  471) 	vmovdqu		%xmm9,0xb0(%rsi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  472) 	vextracti128	$1,%ymm10,%xmm7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  473) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  474) 	# xor and write fourth block
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  475) 	vmovdqa		%xmm4,%xmm10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  476) 	cmp		$0xd0,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  477) 	jl		.Lxorpart4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  478) 	vpxor		0xc0(%rdx),%xmm10,%xmm9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  479) 	vmovdqu		%xmm9,0xc0(%rsi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  480) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  481) 	vmovdqa		%xmm5,%xmm10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  482) 	cmp		$0xe0,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  483) 	jl		.Lxorpart4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  484) 	vpxor		0xd0(%rdx),%xmm10,%xmm9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  485) 	vmovdqu		%xmm9,0xd0(%rsi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  486) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  487) 	vmovdqa		%xmm6,%xmm10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  488) 	cmp		$0xf0,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  489) 	jl		.Lxorpart4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  490) 	vpxor		0xe0(%rdx),%xmm10,%xmm9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  491) 	vmovdqu		%xmm9,0xe0(%rsi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  492) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  493) 	vmovdqa		%xmm7,%xmm10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  494) 	cmp		$0x100,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  495) 	jl		.Lxorpart4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  496) 	vpxor		0xf0(%rdx),%xmm10,%xmm9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  497) 	vmovdqu		%xmm9,0xf0(%rsi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  498) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  499) .Ldone4:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  500) 	vzeroupper
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  501) 	ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  502) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  503) .Lxorpart4:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  504) 	# xor remaining bytes from partial register into output
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  505) 	mov		%rax,%r9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  506) 	and		$0x0f,%r9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  507) 	jz		.Ldone4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  508) 	and		$~0x0f,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  509) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  510) 	mov		%rsi,%r11
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  511) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  512) 	lea		8(%rsp),%r10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  513) 	sub		$0x10,%rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  514) 	and		$~31,%rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  515) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  516) 	lea		(%rdx,%rax),%rsi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  517) 	mov		%rsp,%rdi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  518) 	mov		%r9,%rcx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  519) 	rep movsb
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  520) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  521) 	vpxor		0x00(%rsp),%xmm10,%xmm10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  522) 	vmovdqa		%xmm10,0x00(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  523) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  524) 	mov		%rsp,%rsi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  525) 	lea		(%r11,%rax),%rdi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  526) 	mov		%r9,%rcx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  527) 	rep movsb
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  528) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  529) 	lea		-8(%r10),%rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  530) 	jmp		.Ldone4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  531) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  532) SYM_FUNC_END(chacha_4block_xor_avx2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  533) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  534) SYM_FUNC_START(chacha_8block_xor_avx2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  535) 	# %rdi: Input state matrix, s
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  536) 	# %rsi: up to 8 data blocks output, o
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  537) 	# %rdx: up to 8 data blocks input, i
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  538) 	# %rcx: input/output length in bytes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  539) 	# %r8d: nrounds
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  540) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  541) 	# This function encrypts eight consecutive ChaCha blocks by loading
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  542) 	# the state matrix in AVX registers eight times. As we need some
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  543) 	# scratch registers, we save the first four registers on the stack. The
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  544) 	# algorithm performs each operation on the corresponding word of each
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  545) 	# state matrix, hence requires no word shuffling. For final XORing step
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  546) 	# we transpose the matrix by interleaving 32-, 64- and then 128-bit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  547) 	# words, which allows us to do XOR in AVX registers. 8/16-bit word
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  548) 	# rotation is done with the slightly better performing byte shuffling,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  549) 	# 7/12-bit word rotation uses traditional shift+OR.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  550) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  551) 	vzeroupper
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  552) 	# 4 * 32 byte stack, 32-byte aligned
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  553) 	lea		8(%rsp),%r10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  554) 	and		$~31, %rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  555) 	sub		$0x80, %rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  556) 	mov		%rcx,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  557) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  558) 	# x0..15[0-7] = s[0..15]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  559) 	vpbroadcastd	0x00(%rdi),%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  560) 	vpbroadcastd	0x04(%rdi),%ymm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  561) 	vpbroadcastd	0x08(%rdi),%ymm2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  562) 	vpbroadcastd	0x0c(%rdi),%ymm3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  563) 	vpbroadcastd	0x10(%rdi),%ymm4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  564) 	vpbroadcastd	0x14(%rdi),%ymm5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  565) 	vpbroadcastd	0x18(%rdi),%ymm6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  566) 	vpbroadcastd	0x1c(%rdi),%ymm7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  567) 	vpbroadcastd	0x20(%rdi),%ymm8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  568) 	vpbroadcastd	0x24(%rdi),%ymm9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  569) 	vpbroadcastd	0x28(%rdi),%ymm10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  570) 	vpbroadcastd	0x2c(%rdi),%ymm11
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  571) 	vpbroadcastd	0x30(%rdi),%ymm12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  572) 	vpbroadcastd	0x34(%rdi),%ymm13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  573) 	vpbroadcastd	0x38(%rdi),%ymm14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  574) 	vpbroadcastd	0x3c(%rdi),%ymm15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  575) 	# x0..3 on stack
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  576) 	vmovdqa		%ymm0,0x00(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  577) 	vmovdqa		%ymm1,0x20(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  578) 	vmovdqa		%ymm2,0x40(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  579) 	vmovdqa		%ymm3,0x60(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  580) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  581) 	vmovdqa		CTRINC(%rip),%ymm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  582) 	vmovdqa		ROT8(%rip),%ymm2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  583) 	vmovdqa		ROT16(%rip),%ymm3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  584) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  585) 	# x12 += counter values 0-3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  586) 	vpaddd		%ymm1,%ymm12,%ymm12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  587) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  588) .Ldoubleround8:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  589) 	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  590) 	vpaddd		0x00(%rsp),%ymm4,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  591) 	vmovdqa		%ymm0,0x00(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  592) 	vpxor		%ymm0,%ymm12,%ymm12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  593) 	vpshufb		%ymm3,%ymm12,%ymm12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  594) 	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  595) 	vpaddd		0x20(%rsp),%ymm5,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  596) 	vmovdqa		%ymm0,0x20(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  597) 	vpxor		%ymm0,%ymm13,%ymm13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  598) 	vpshufb		%ymm3,%ymm13,%ymm13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  599) 	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  600) 	vpaddd		0x40(%rsp),%ymm6,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  601) 	vmovdqa		%ymm0,0x40(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  602) 	vpxor		%ymm0,%ymm14,%ymm14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  603) 	vpshufb		%ymm3,%ymm14,%ymm14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  604) 	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  605) 	vpaddd		0x60(%rsp),%ymm7,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  606) 	vmovdqa		%ymm0,0x60(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  607) 	vpxor		%ymm0,%ymm15,%ymm15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  608) 	vpshufb		%ymm3,%ymm15,%ymm15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  609) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  610) 	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  611) 	vpaddd		%ymm12,%ymm8,%ymm8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  612) 	vpxor		%ymm8,%ymm4,%ymm4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  613) 	vpslld		$12,%ymm4,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  614) 	vpsrld		$20,%ymm4,%ymm4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  615) 	vpor		%ymm0,%ymm4,%ymm4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  616) 	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  617) 	vpaddd		%ymm13,%ymm9,%ymm9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  618) 	vpxor		%ymm9,%ymm5,%ymm5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  619) 	vpslld		$12,%ymm5,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  620) 	vpsrld		$20,%ymm5,%ymm5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  621) 	vpor		%ymm0,%ymm5,%ymm5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  622) 	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  623) 	vpaddd		%ymm14,%ymm10,%ymm10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  624) 	vpxor		%ymm10,%ymm6,%ymm6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  625) 	vpslld		$12,%ymm6,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  626) 	vpsrld		$20,%ymm6,%ymm6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  627) 	vpor		%ymm0,%ymm6,%ymm6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  628) 	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  629) 	vpaddd		%ymm15,%ymm11,%ymm11
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  630) 	vpxor		%ymm11,%ymm7,%ymm7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  631) 	vpslld		$12,%ymm7,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  632) 	vpsrld		$20,%ymm7,%ymm7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  633) 	vpor		%ymm0,%ymm7,%ymm7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  634) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  635) 	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  636) 	vpaddd		0x00(%rsp),%ymm4,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  637) 	vmovdqa		%ymm0,0x00(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  638) 	vpxor		%ymm0,%ymm12,%ymm12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  639) 	vpshufb		%ymm2,%ymm12,%ymm12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  640) 	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  641) 	vpaddd		0x20(%rsp),%ymm5,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  642) 	vmovdqa		%ymm0,0x20(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  643) 	vpxor		%ymm0,%ymm13,%ymm13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  644) 	vpshufb		%ymm2,%ymm13,%ymm13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  645) 	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  646) 	vpaddd		0x40(%rsp),%ymm6,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  647) 	vmovdqa		%ymm0,0x40(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  648) 	vpxor		%ymm0,%ymm14,%ymm14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  649) 	vpshufb		%ymm2,%ymm14,%ymm14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  650) 	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  651) 	vpaddd		0x60(%rsp),%ymm7,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  652) 	vmovdqa		%ymm0,0x60(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  653) 	vpxor		%ymm0,%ymm15,%ymm15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  654) 	vpshufb		%ymm2,%ymm15,%ymm15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  655) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  656) 	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  657) 	vpaddd		%ymm12,%ymm8,%ymm8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  658) 	vpxor		%ymm8,%ymm4,%ymm4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  659) 	vpslld		$7,%ymm4,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  660) 	vpsrld		$25,%ymm4,%ymm4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  661) 	vpor		%ymm0,%ymm4,%ymm4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  662) 	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  663) 	vpaddd		%ymm13,%ymm9,%ymm9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  664) 	vpxor		%ymm9,%ymm5,%ymm5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  665) 	vpslld		$7,%ymm5,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  666) 	vpsrld		$25,%ymm5,%ymm5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  667) 	vpor		%ymm0,%ymm5,%ymm5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  668) 	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  669) 	vpaddd		%ymm14,%ymm10,%ymm10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  670) 	vpxor		%ymm10,%ymm6,%ymm6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  671) 	vpslld		$7,%ymm6,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  672) 	vpsrld		$25,%ymm6,%ymm6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  673) 	vpor		%ymm0,%ymm6,%ymm6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  674) 	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  675) 	vpaddd		%ymm15,%ymm11,%ymm11
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  676) 	vpxor		%ymm11,%ymm7,%ymm7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  677) 	vpslld		$7,%ymm7,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  678) 	vpsrld		$25,%ymm7,%ymm7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  679) 	vpor		%ymm0,%ymm7,%ymm7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  680) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  681) 	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  682) 	vpaddd		0x00(%rsp),%ymm5,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  683) 	vmovdqa		%ymm0,0x00(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  684) 	vpxor		%ymm0,%ymm15,%ymm15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  685) 	vpshufb		%ymm3,%ymm15,%ymm15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  686) 	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  687) 	vpaddd		0x20(%rsp),%ymm6,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  688) 	vmovdqa		%ymm0,0x20(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  689) 	vpxor		%ymm0,%ymm12,%ymm12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  690) 	vpshufb		%ymm3,%ymm12,%ymm12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  691) 	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  692) 	vpaddd		0x40(%rsp),%ymm7,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  693) 	vmovdqa		%ymm0,0x40(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  694) 	vpxor		%ymm0,%ymm13,%ymm13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  695) 	vpshufb		%ymm3,%ymm13,%ymm13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  696) 	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  697) 	vpaddd		0x60(%rsp),%ymm4,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  698) 	vmovdqa		%ymm0,0x60(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  699) 	vpxor		%ymm0,%ymm14,%ymm14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  700) 	vpshufb		%ymm3,%ymm14,%ymm14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  701) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  702) 	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  703) 	vpaddd		%ymm15,%ymm10,%ymm10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  704) 	vpxor		%ymm10,%ymm5,%ymm5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  705) 	vpslld		$12,%ymm5,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  706) 	vpsrld		$20,%ymm5,%ymm5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  707) 	vpor		%ymm0,%ymm5,%ymm5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  708) 	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  709) 	vpaddd		%ymm12,%ymm11,%ymm11
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  710) 	vpxor		%ymm11,%ymm6,%ymm6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  711) 	vpslld		$12,%ymm6,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  712) 	vpsrld		$20,%ymm6,%ymm6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  713) 	vpor		%ymm0,%ymm6,%ymm6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  714) 	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  715) 	vpaddd		%ymm13,%ymm8,%ymm8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  716) 	vpxor		%ymm8,%ymm7,%ymm7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  717) 	vpslld		$12,%ymm7,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  718) 	vpsrld		$20,%ymm7,%ymm7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  719) 	vpor		%ymm0,%ymm7,%ymm7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  720) 	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  721) 	vpaddd		%ymm14,%ymm9,%ymm9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  722) 	vpxor		%ymm9,%ymm4,%ymm4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  723) 	vpslld		$12,%ymm4,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  724) 	vpsrld		$20,%ymm4,%ymm4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  725) 	vpor		%ymm0,%ymm4,%ymm4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  726) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  727) 	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  728) 	vpaddd		0x00(%rsp),%ymm5,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  729) 	vmovdqa		%ymm0,0x00(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  730) 	vpxor		%ymm0,%ymm15,%ymm15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  731) 	vpshufb		%ymm2,%ymm15,%ymm15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  732) 	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  733) 	vpaddd		0x20(%rsp),%ymm6,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  734) 	vmovdqa		%ymm0,0x20(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  735) 	vpxor		%ymm0,%ymm12,%ymm12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  736) 	vpshufb		%ymm2,%ymm12,%ymm12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  737) 	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  738) 	vpaddd		0x40(%rsp),%ymm7,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  739) 	vmovdqa		%ymm0,0x40(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  740) 	vpxor		%ymm0,%ymm13,%ymm13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  741) 	vpshufb		%ymm2,%ymm13,%ymm13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  742) 	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  743) 	vpaddd		0x60(%rsp),%ymm4,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  744) 	vmovdqa		%ymm0,0x60(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  745) 	vpxor		%ymm0,%ymm14,%ymm14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  746) 	vpshufb		%ymm2,%ymm14,%ymm14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  747) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  748) 	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  749) 	vpaddd		%ymm15,%ymm10,%ymm10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  750) 	vpxor		%ymm10,%ymm5,%ymm5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  751) 	vpslld		$7,%ymm5,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  752) 	vpsrld		$25,%ymm5,%ymm5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  753) 	vpor		%ymm0,%ymm5,%ymm5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  754) 	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  755) 	vpaddd		%ymm12,%ymm11,%ymm11
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  756) 	vpxor		%ymm11,%ymm6,%ymm6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  757) 	vpslld		$7,%ymm6,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  758) 	vpsrld		$25,%ymm6,%ymm6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  759) 	vpor		%ymm0,%ymm6,%ymm6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  760) 	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  761) 	vpaddd		%ymm13,%ymm8,%ymm8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  762) 	vpxor		%ymm8,%ymm7,%ymm7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  763) 	vpslld		$7,%ymm7,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  764) 	vpsrld		$25,%ymm7,%ymm7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  765) 	vpor		%ymm0,%ymm7,%ymm7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  766) 	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  767) 	vpaddd		%ymm14,%ymm9,%ymm9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  768) 	vpxor		%ymm9,%ymm4,%ymm4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  769) 	vpslld		$7,%ymm4,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  770) 	vpsrld		$25,%ymm4,%ymm4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  771) 	vpor		%ymm0,%ymm4,%ymm4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  772) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  773) 	sub		$2,%r8d
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  774) 	jnz		.Ldoubleround8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  775) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  776) 	# x0..15[0-3] += s[0..15]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  777) 	vpbroadcastd	0x00(%rdi),%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  778) 	vpaddd		0x00(%rsp),%ymm0,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  779) 	vmovdqa		%ymm0,0x00(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  780) 	vpbroadcastd	0x04(%rdi),%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  781) 	vpaddd		0x20(%rsp),%ymm0,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  782) 	vmovdqa		%ymm0,0x20(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  783) 	vpbroadcastd	0x08(%rdi),%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  784) 	vpaddd		0x40(%rsp),%ymm0,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  785) 	vmovdqa		%ymm0,0x40(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  786) 	vpbroadcastd	0x0c(%rdi),%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  787) 	vpaddd		0x60(%rsp),%ymm0,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  788) 	vmovdqa		%ymm0,0x60(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  789) 	vpbroadcastd	0x10(%rdi),%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  790) 	vpaddd		%ymm0,%ymm4,%ymm4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  791) 	vpbroadcastd	0x14(%rdi),%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  792) 	vpaddd		%ymm0,%ymm5,%ymm5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  793) 	vpbroadcastd	0x18(%rdi),%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  794) 	vpaddd		%ymm0,%ymm6,%ymm6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  795) 	vpbroadcastd	0x1c(%rdi),%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  796) 	vpaddd		%ymm0,%ymm7,%ymm7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  797) 	vpbroadcastd	0x20(%rdi),%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  798) 	vpaddd		%ymm0,%ymm8,%ymm8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  799) 	vpbroadcastd	0x24(%rdi),%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  800) 	vpaddd		%ymm0,%ymm9,%ymm9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  801) 	vpbroadcastd	0x28(%rdi),%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  802) 	vpaddd		%ymm0,%ymm10,%ymm10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  803) 	vpbroadcastd	0x2c(%rdi),%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  804) 	vpaddd		%ymm0,%ymm11,%ymm11
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  805) 	vpbroadcastd	0x30(%rdi),%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  806) 	vpaddd		%ymm0,%ymm12,%ymm12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  807) 	vpbroadcastd	0x34(%rdi),%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  808) 	vpaddd		%ymm0,%ymm13,%ymm13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  809) 	vpbroadcastd	0x38(%rdi),%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  810) 	vpaddd		%ymm0,%ymm14,%ymm14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  811) 	vpbroadcastd	0x3c(%rdi),%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  812) 	vpaddd		%ymm0,%ymm15,%ymm15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  813) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  814) 	# x12 += counter values 0-3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  815) 	vpaddd		%ymm1,%ymm12,%ymm12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  816) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  817) 	# interleave 32-bit words in state n, n+1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  818) 	vmovdqa		0x00(%rsp),%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  819) 	vmovdqa		0x20(%rsp),%ymm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  820) 	vpunpckldq	%ymm1,%ymm0,%ymm2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  821) 	vpunpckhdq	%ymm1,%ymm0,%ymm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  822) 	vmovdqa		%ymm2,0x00(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  823) 	vmovdqa		%ymm1,0x20(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  824) 	vmovdqa		0x40(%rsp),%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  825) 	vmovdqa		0x60(%rsp),%ymm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  826) 	vpunpckldq	%ymm1,%ymm0,%ymm2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  827) 	vpunpckhdq	%ymm1,%ymm0,%ymm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  828) 	vmovdqa		%ymm2,0x40(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  829) 	vmovdqa		%ymm1,0x60(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  830) 	vmovdqa		%ymm4,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  831) 	vpunpckldq	%ymm5,%ymm0,%ymm4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  832) 	vpunpckhdq	%ymm5,%ymm0,%ymm5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  833) 	vmovdqa		%ymm6,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  834) 	vpunpckldq	%ymm7,%ymm0,%ymm6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  835) 	vpunpckhdq	%ymm7,%ymm0,%ymm7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  836) 	vmovdqa		%ymm8,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  837) 	vpunpckldq	%ymm9,%ymm0,%ymm8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  838) 	vpunpckhdq	%ymm9,%ymm0,%ymm9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  839) 	vmovdqa		%ymm10,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  840) 	vpunpckldq	%ymm11,%ymm0,%ymm10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  841) 	vpunpckhdq	%ymm11,%ymm0,%ymm11
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  842) 	vmovdqa		%ymm12,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  843) 	vpunpckldq	%ymm13,%ymm0,%ymm12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  844) 	vpunpckhdq	%ymm13,%ymm0,%ymm13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  845) 	vmovdqa		%ymm14,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  846) 	vpunpckldq	%ymm15,%ymm0,%ymm14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  847) 	vpunpckhdq	%ymm15,%ymm0,%ymm15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  848) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  849) 	# interleave 64-bit words in state n, n+2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  850) 	vmovdqa		0x00(%rsp),%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  851) 	vmovdqa		0x40(%rsp),%ymm2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  852) 	vpunpcklqdq	%ymm2,%ymm0,%ymm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  853) 	vpunpckhqdq	%ymm2,%ymm0,%ymm2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  854) 	vmovdqa		%ymm1,0x00(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  855) 	vmovdqa		%ymm2,0x40(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  856) 	vmovdqa		0x20(%rsp),%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  857) 	vmovdqa		0x60(%rsp),%ymm2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  858) 	vpunpcklqdq	%ymm2,%ymm0,%ymm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  859) 	vpunpckhqdq	%ymm2,%ymm0,%ymm2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  860) 	vmovdqa		%ymm1,0x20(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  861) 	vmovdqa		%ymm2,0x60(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  862) 	vmovdqa		%ymm4,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  863) 	vpunpcklqdq	%ymm6,%ymm0,%ymm4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  864) 	vpunpckhqdq	%ymm6,%ymm0,%ymm6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  865) 	vmovdqa		%ymm5,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  866) 	vpunpcklqdq	%ymm7,%ymm0,%ymm5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  867) 	vpunpckhqdq	%ymm7,%ymm0,%ymm7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  868) 	vmovdqa		%ymm8,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  869) 	vpunpcklqdq	%ymm10,%ymm0,%ymm8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  870) 	vpunpckhqdq	%ymm10,%ymm0,%ymm10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  871) 	vmovdqa		%ymm9,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  872) 	vpunpcklqdq	%ymm11,%ymm0,%ymm9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  873) 	vpunpckhqdq	%ymm11,%ymm0,%ymm11
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  874) 	vmovdqa		%ymm12,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  875) 	vpunpcklqdq	%ymm14,%ymm0,%ymm12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  876) 	vpunpckhqdq	%ymm14,%ymm0,%ymm14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  877) 	vmovdqa		%ymm13,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  878) 	vpunpcklqdq	%ymm15,%ymm0,%ymm13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  879) 	vpunpckhqdq	%ymm15,%ymm0,%ymm15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  880) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  881) 	# interleave 128-bit words in state n, n+4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  882) 	# xor/write first four blocks
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  883) 	vmovdqa		0x00(%rsp),%ymm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  884) 	vperm2i128	$0x20,%ymm4,%ymm1,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  885) 	cmp		$0x0020,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  886) 	jl		.Lxorpart8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  887) 	vpxor		0x0000(%rdx),%ymm0,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  888) 	vmovdqu		%ymm0,0x0000(%rsi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  889) 	vperm2i128	$0x31,%ymm4,%ymm1,%ymm4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  890) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  891) 	vperm2i128	$0x20,%ymm12,%ymm8,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  892) 	cmp		$0x0040,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  893) 	jl		.Lxorpart8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  894) 	vpxor		0x0020(%rdx),%ymm0,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  895) 	vmovdqu		%ymm0,0x0020(%rsi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  896) 	vperm2i128	$0x31,%ymm12,%ymm8,%ymm12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  897) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  898) 	vmovdqa		0x40(%rsp),%ymm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  899) 	vperm2i128	$0x20,%ymm6,%ymm1,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  900) 	cmp		$0x0060,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  901) 	jl		.Lxorpart8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  902) 	vpxor		0x0040(%rdx),%ymm0,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  903) 	vmovdqu		%ymm0,0x0040(%rsi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  904) 	vperm2i128	$0x31,%ymm6,%ymm1,%ymm6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  905) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  906) 	vperm2i128	$0x20,%ymm14,%ymm10,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  907) 	cmp		$0x0080,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  908) 	jl		.Lxorpart8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  909) 	vpxor		0x0060(%rdx),%ymm0,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  910) 	vmovdqu		%ymm0,0x0060(%rsi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  911) 	vperm2i128	$0x31,%ymm14,%ymm10,%ymm14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  912) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  913) 	vmovdqa		0x20(%rsp),%ymm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  914) 	vperm2i128	$0x20,%ymm5,%ymm1,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  915) 	cmp		$0x00a0,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  916) 	jl		.Lxorpart8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  917) 	vpxor		0x0080(%rdx),%ymm0,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  918) 	vmovdqu		%ymm0,0x0080(%rsi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  919) 	vperm2i128	$0x31,%ymm5,%ymm1,%ymm5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  920) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  921) 	vperm2i128	$0x20,%ymm13,%ymm9,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  922) 	cmp		$0x00c0,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  923) 	jl		.Lxorpart8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  924) 	vpxor		0x00a0(%rdx),%ymm0,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  925) 	vmovdqu		%ymm0,0x00a0(%rsi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  926) 	vperm2i128	$0x31,%ymm13,%ymm9,%ymm13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  927) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  928) 	vmovdqa		0x60(%rsp),%ymm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  929) 	vperm2i128	$0x20,%ymm7,%ymm1,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  930) 	cmp		$0x00e0,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  931) 	jl		.Lxorpart8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  932) 	vpxor		0x00c0(%rdx),%ymm0,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  933) 	vmovdqu		%ymm0,0x00c0(%rsi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  934) 	vperm2i128	$0x31,%ymm7,%ymm1,%ymm7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  935) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  936) 	vperm2i128	$0x20,%ymm15,%ymm11,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  937) 	cmp		$0x0100,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  938) 	jl		.Lxorpart8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  939) 	vpxor		0x00e0(%rdx),%ymm0,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  940) 	vmovdqu		%ymm0,0x00e0(%rsi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  941) 	vperm2i128	$0x31,%ymm15,%ymm11,%ymm15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  942) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  943) 	# xor remaining blocks, write to output
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  944) 	vmovdqa		%ymm4,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  945) 	cmp		$0x0120,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  946) 	jl		.Lxorpart8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  947) 	vpxor		0x0100(%rdx),%ymm0,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  948) 	vmovdqu		%ymm0,0x0100(%rsi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  949) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  950) 	vmovdqa		%ymm12,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  951) 	cmp		$0x0140,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  952) 	jl		.Lxorpart8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  953) 	vpxor		0x0120(%rdx),%ymm0,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  954) 	vmovdqu		%ymm0,0x0120(%rsi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  955) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  956) 	vmovdqa		%ymm6,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  957) 	cmp		$0x0160,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  958) 	jl		.Lxorpart8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  959) 	vpxor		0x0140(%rdx),%ymm0,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  960) 	vmovdqu		%ymm0,0x0140(%rsi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  961) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  962) 	vmovdqa		%ymm14,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  963) 	cmp		$0x0180,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  964) 	jl		.Lxorpart8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  965) 	vpxor		0x0160(%rdx),%ymm0,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  966) 	vmovdqu		%ymm0,0x0160(%rsi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  967) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  968) 	vmovdqa		%ymm5,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  969) 	cmp		$0x01a0,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  970) 	jl		.Lxorpart8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  971) 	vpxor		0x0180(%rdx),%ymm0,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  972) 	vmovdqu		%ymm0,0x0180(%rsi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  973) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  974) 	vmovdqa		%ymm13,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  975) 	cmp		$0x01c0,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  976) 	jl		.Lxorpart8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  977) 	vpxor		0x01a0(%rdx),%ymm0,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  978) 	vmovdqu		%ymm0,0x01a0(%rsi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  979) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  980) 	vmovdqa		%ymm7,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  981) 	cmp		$0x01e0,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  982) 	jl		.Lxorpart8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  983) 	vpxor		0x01c0(%rdx),%ymm0,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  984) 	vmovdqu		%ymm0,0x01c0(%rsi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  985) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  986) 	vmovdqa		%ymm15,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  987) 	cmp		$0x0200,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  988) 	jl		.Lxorpart8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  989) 	vpxor		0x01e0(%rdx),%ymm0,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  990) 	vmovdqu		%ymm0,0x01e0(%rsi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  991) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  992) .Ldone8:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  993) 	vzeroupper
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  994) 	lea		-8(%r10),%rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  995) 	ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  996) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  997) .Lxorpart8:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  998) 	# xor remaining bytes from partial register into output
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  999) 	mov		%rax,%r9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1000) 	and		$0x1f,%r9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1001) 	jz		.Ldone8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1002) 	and		$~0x1f,%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1003) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1004) 	mov		%rsi,%r11
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1005) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1006) 	lea		(%rdx,%rax),%rsi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1007) 	mov		%rsp,%rdi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1008) 	mov		%r9,%rcx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1009) 	rep movsb
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1010) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1011) 	vpxor		0x00(%rsp),%ymm0,%ymm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1012) 	vmovdqa		%ymm0,0x00(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1013) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1014) 	mov		%rsp,%rsi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1015) 	lea		(%r11,%rax),%rdi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1016) 	mov		%r9,%rcx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1017) 	rep movsb
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1018) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1019) 	jmp		.Ldone8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1020) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1021) SYM_FUNC_END(chacha_8block_xor_avx2)