Orange Pi5 kernel

Deprecated Linux kernel 5.10.110 for OrangePi 5/5B/5+ boards

/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * x86_64/AVX2/AES-NI assembler implementation of Camellia
 *
 * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include <asm/nospec-branch.h>

#define CAMELLIA_TABLE_BYTE_LEN 272

/* struct camellia_ctx: */
#define key_table 0
#define key_length CAMELLIA_TABLE_BYTE_LEN

/* register macros */
#define CTX %rdi
#define RIO %r8

/**********************************************************************
  helper macros
 **********************************************************************/
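/*
 * filter_8bit applies a per-byte transform to x as two 16-entry vpshufb
 * table lookups: the low nibble of each byte indexes lo_t, the high
 * nibble (moved down by vpsrld) indexes hi_t, and the two results are
 * XORed.  mask4bit must hold 0x0f in every byte; tmp0 is clobbered.
 */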
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
	vpand x, mask4bit, tmp0; \
	vpandn x, mask4bit, x; \
	vpsrld $4, x, x; \
	\
	vpshufb tmp0, lo_t, tmp0; \
	vpshufb x, hi_t, x; \
	vpxor tmp0, x, x;

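/*
 * The ymmN_x aliases name the low 128-bit (xmm) half of each ymm
 * register; roundsm32 pastes "_x" onto its ymm arguments (t0##_x etc.)
 * to form the xmm operands required by the 128-bit AES-NI instructions.
 */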
#define ymm0_x xmm0
#define ymm1_x xmm1
#define ymm2_x xmm2
#define ymm3_x xmm3
#define ymm4_x xmm4
#define ymm5_x xmm5
#define ymm6_x xmm6
#define ymm7_x xmm7
#define ymm8_x xmm8
#define ymm9_x xmm9
#define ymm10_x xmm10
#define ymm11_x xmm11
#define ymm12_x xmm12
#define ymm13_x xmm13
#define ymm14_x xmm14
#define ymm15_x xmm15

/**********************************************************************
  32-way camellia
 **********************************************************************/

/*
 * IN:
 *   x0..x7: byte-sliced AB state
 *   mem_cd: register pointer storing CD state
 *   key: index for key material
 * OUT:
 *   x0..x7: new byte-sliced CD state
 */
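/*
 * The Camellia s-boxes are evaluated with AES-NI: vaesenclast with an
 * all-zero round key performs just ShiftRows + SubBytes, and the
 * .Linv_shift_row shuffle applied beforehand cancels the unwanted
 * ShiftRows, leaving pure AES SubBytes.  The pre-/postfilter tables map
 * between the Camellia and AES s-box domains (the field isomorphism
 * described in the table comments further below).
 */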
#define roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
		  t7, mem_cd, key) \
	/* \
	 * S-function with AES subbytes \
	 */ \
	vbroadcasti128 .Linv_shift_row, t4; \
	vpbroadcastd .L0f0f0f0f, t7; \
	vbroadcasti128 .Lpre_tf_lo_s1, t5; \
	vbroadcasti128 .Lpre_tf_hi_s1, t6; \
	vbroadcasti128 .Lpre_tf_lo_s4, t2; \
	vbroadcasti128 .Lpre_tf_hi_s4, t3; \
	\
	/* AES inverse shift rows */ \
	vpshufb t4, x0, x0; \
	vpshufb t4, x7, x7; \
	vpshufb t4, x3, x3; \
	vpshufb t4, x6, x6; \
	vpshufb t4, x2, x2; \
	vpshufb t4, x5, x5; \
	vpshufb t4, x1, x1; \
	vpshufb t4, x4, x4; \
	\
	/* prefilter sboxes 1, 2 and 3 */ \
	/* prefilter sbox 4 */ \
	filter_8bit(x0, t5, t6, t7, t4); \
	filter_8bit(x7, t5, t6, t7, t4); \
	vextracti128 $1, x0, t0##_x; \
	vextracti128 $1, x7, t1##_x; \
	filter_8bit(x3, t2, t3, t7, t4); \
	filter_8bit(x6, t2, t3, t7, t4); \
	vextracti128 $1, x3, t3##_x; \
	vextracti128 $1, x6, t2##_x; \
	filter_8bit(x2, t5, t6, t7, t4); \
	filter_8bit(x5, t5, t6, t7, t4); \
	filter_8bit(x1, t5, t6, t7, t4); \
	filter_8bit(x4, t5, t6, t7, t4); \
	\
	vpxor t4##_x, t4##_x, t4##_x; \
	\
	/* AES subbytes + AES shift rows */ \
	vextracti128 $1, x2, t6##_x; \
	vextracti128 $1, x5, t5##_x; \
	vaesenclast t4##_x, x0##_x, x0##_x; \
	vaesenclast t4##_x, t0##_x, t0##_x; \
	vinserti128 $1, t0##_x, x0, x0; \
	vaesenclast t4##_x, x7##_x, x7##_x; \
	vaesenclast t4##_x, t1##_x, t1##_x; \
	vinserti128 $1, t1##_x, x7, x7; \
	vaesenclast t4##_x, x3##_x, x3##_x; \
	vaesenclast t4##_x, t3##_x, t3##_x; \
	vinserti128 $1, t3##_x, x3, x3; \
	vaesenclast t4##_x, x6##_x, x6##_x; \
	vaesenclast t4##_x, t2##_x, t2##_x; \
	vinserti128 $1, t2##_x, x6, x6; \
	vextracti128 $1, x1, t3##_x; \
	vextracti128 $1, x4, t2##_x; \
	vbroadcasti128 .Lpost_tf_lo_s1, t0; \
	vbroadcasti128 .Lpost_tf_hi_s1, t1; \
	vaesenclast t4##_x, x2##_x, x2##_x; \
	vaesenclast t4##_x, t6##_x, t6##_x; \
	vinserti128 $1, t6##_x, x2, x2; \
	vaesenclast t4##_x, x5##_x, x5##_x; \
	vaesenclast t4##_x, t5##_x, t5##_x; \
	vinserti128 $1, t5##_x, x5, x5; \
	vaesenclast t4##_x, x1##_x, x1##_x; \
	vaesenclast t4##_x, t3##_x, t3##_x; \
	vinserti128 $1, t3##_x, x1, x1; \
	vaesenclast t4##_x, x4##_x, x4##_x; \
	vaesenclast t4##_x, t2##_x, t2##_x; \
	vinserti128 $1, t2##_x, x4, x4; \
	\
	/* postfilter sboxes 1 and 4 */ \
	vbroadcasti128 .Lpost_tf_lo_s3, t2; \
	vbroadcasti128 .Lpost_tf_hi_s3, t3; \
	filter_8bit(x0, t0, t1, t7, t6); \
	filter_8bit(x7, t0, t1, t7, t6); \
	filter_8bit(x3, t0, t1, t7, t6); \
	filter_8bit(x6, t0, t1, t7, t6); \
	\
	/* postfilter sbox 3 */ \
	vbroadcasti128 .Lpost_tf_lo_s2, t4; \
	vbroadcasti128 .Lpost_tf_hi_s2, t5; \
	filter_8bit(x2, t2, t3, t7, t6); \
	filter_8bit(x5, t2, t3, t7, t6); \
	\
	vpbroadcastq key, t0; /* higher 64-bit duplicate ignored */ \
	\
	/* postfilter sbox 2 */ \
	filter_8bit(x1, t4, t5, t7, t2); \
	filter_8bit(x4, t4, t5, t7, t2); \
	vpxor t7, t7, t7; \
	\
	vpsrldq $1, t0, t1; \
	vpsrldq $2, t0, t2; \
	vpshufb t7, t1, t1; \
	vpsrldq $3, t0, t3; \
	\
	/* P-function */ \
	vpxor x5, x0, x0; \
	vpxor x6, x1, x1; \
	vpxor x7, x2, x2; \
	vpxor x4, x3, x3; \
	\
	vpshufb t7, t2, t2; \
	vpsrldq $4, t0, t4; \
	vpshufb t7, t3, t3; \
	vpsrldq $5, t0, t5; \
	vpshufb t7, t4, t4; \
	\
	vpxor x2, x4, x4; \
	vpxor x3, x5, x5; \
	vpxor x0, x6, x6; \
	vpxor x1, x7, x7; \
	\
	vpsrldq $6, t0, t6; \
	vpshufb t7, t5, t5; \
	vpshufb t7, t6, t6; \
	\
	vpxor x7, x0, x0; \
	vpxor x4, x1, x1; \
	vpxor x5, x2, x2; \
	vpxor x6, x3, x3; \
	\
	vpxor x3, x4, x4; \
	vpxor x0, x5, x5; \
	vpxor x1, x6, x6; \
	vpxor x2, x7, x7; /* note: high and low parts swapped */ \
	\
	/* Add key material and result to CD (x becomes new CD) */ \
	\
	vpxor t6, x1, x1; \
	vpxor 5 * 32(mem_cd), x1, x1; \
	\
	vpsrldq $7, t0, t6; \
	vpshufb t7, t0, t0; \
	vpshufb t7, t6, t7; \
	\
	vpxor t7, x0, x0; \
	vpxor 4 * 32(mem_cd), x0, x0; \
	\
	vpxor t5, x2, x2; \
	vpxor 6 * 32(mem_cd), x2, x2; \
	\
	vpxor t4, x3, x3; \
	vpxor 7 * 32(mem_cd), x3, x3; \
	\
	vpxor t3, x4, x4; \
	vpxor 0 * 32(mem_cd), x4, x4; \
	\
	vpxor t2, x5, x5; \
	vpxor 1 * 32(mem_cd), x5, x5; \
	\
	vpxor t1, x6, x6; \
	vpxor 2 * 32(mem_cd), x6, x6; \
	\
	vpxor t0, x7, x7; \
	vpxor 3 * 32(mem_cd), x7, x7;

/*
 * Size optimization... with inlined roundsm32 the binary would be over
 * 5 times larger and only marginally faster.
 */
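/*
 * Both shared round bodies expect the subkey pointer in %r9 (set up by
 * two_roundsm32 below); the _cd variant XORs against the CD buffer at
 * %rcx, the _ab variant against the AB buffer at %rax.
 */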
.align 8
SYM_FUNC_START_LOCAL(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)
	roundsm32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		  %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15,
		  %rcx, (%r9));
	ret;
SYM_FUNC_END(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)

.align 8
SYM_FUNC_START_LOCAL(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
	roundsm32(%ymm4, %ymm5, %ymm6, %ymm7, %ymm0, %ymm1, %ymm2, %ymm3,
		  %ymm12, %ymm13, %ymm14, %ymm15, %ymm8, %ymm9, %ymm10, %ymm11,
		  %rax, (%r9));
	ret;
SYM_FUNC_END(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)

/*
 * IN/OUT:
 *  x0..x7: byte-sliced AB state preloaded
 *  mem_ab: byte-sliced AB state in memory
 *  mem_cd: byte-sliced CD state in memory
 */
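/*
 * two_roundsm32 runs one Feistel round pair: subkey index i for the
 * AB->CD round and i + dir for the CD->AB round (dir is +1 when
 * encrypting, -1 when decrypting).  store_ab selects whether the new AB
 * state is written back (store_ab_state) or discarded (dummy_store).
 */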
#define two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
	leaq (key_table + (i) * 8)(CTX), %r9; \
	call roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \
	\
	vmovdqu x0, 4 * 32(mem_cd); \
	vmovdqu x1, 5 * 32(mem_cd); \
	vmovdqu x2, 6 * 32(mem_cd); \
	vmovdqu x3, 7 * 32(mem_cd); \
	vmovdqu x4, 0 * 32(mem_cd); \
	vmovdqu x5, 1 * 32(mem_cd); \
	vmovdqu x6, 2 * 32(mem_cd); \
	vmovdqu x7, 3 * 32(mem_cd); \
	\
	leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \
	call roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \
	\
	store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);

#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */

#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
	/* Store new AB state */ \
	vmovdqu x4, 4 * 32(mem_ab); \
	vmovdqu x5, 5 * 32(mem_ab); \
	vmovdqu x6, 6 * 32(mem_ab); \
	vmovdqu x7, 7 * 32(mem_ab); \
	vmovdqu x0, 0 * 32(mem_ab); \
	vmovdqu x1, 1 * 32(mem_ab); \
	vmovdqu x2, 2 * 32(mem_ab); \
	vmovdqu x3, 3 * 32(mem_ab);

#define enc_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, i) \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);

#define dec_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, i) \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);

/*
 * IN:
 *  v0..3: byte-sliced 32-bit integers
 * OUT:
 *  v0..3: (IN <<< 1)
 */
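/*
 * Each 32-bit value is spread one byte per register, so "<<< 1" is done
 * per byte slice: vpcmpgtb against zero extracts each byte's MSB as
 * 0xff, vpaddb doubles the byte (shift left by one), vpabsb turns the
 * 0xff into a 0x01 carry, and vpor feeds that carry into the adjacent
 * slice, with the last slice's carry wrapping around to the first.
 */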
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  303) #define rol32_1_32(v0, v1, v2, v3, t0, t1, t2, zero) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  304) 	vpcmpgtb v0, zero, t0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  305) 	vpaddb v0, v0, v0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  306) 	vpabsb t0, t0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  307) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  308) 	vpcmpgtb v1, zero, t1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  309) 	vpaddb v1, v1, v1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  310) 	vpabsb t1, t1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  311) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  312) 	vpcmpgtb v2, zero, t2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  313) 	vpaddb v2, v2, v2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  314) 	vpabsb t2, t2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  315) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  316) 	vpor t0, v1, v1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  317) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  318) 	vpcmpgtb v3, zero, t0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  319) 	vpaddb v3, v3, v3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  320) 	vpabsb t0, t0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  321) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  322) 	vpor t1, v2, v2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  323) 	vpor t2, v3, v3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  324) 	vpor t0, v0, v0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  325) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  326) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  327)  * IN:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  328)  *   r: byte-sliced AB state in memory
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  329)  *   l: byte-sliced CD state in memory
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  330)  * OUT:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  331)  *   x0..x7: new byte-sliced CD state
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  332)  */
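/*
 * fls32 applies the Camellia FL/FL^-1 layer that separates every six
 * rounds.  The C-style fragments in the body document each step; each
 * 32-bit subkey word (kll/klr/krl/krr) is expanded one byte per slice
 * with vpbroadcastd plus the vpsrldq/vpshufb broadcast sequence.
 */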
#define fls32(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
	      tt1, tt2, tt3, kll, klr, krl, krr) \
	/* \
	 * t0 = kll; \
	 * t0 &= ll; \
	 * lr ^= rol32(t0, 1); \
	 */ \
	vpbroadcastd kll, t0; /* only lowest 32-bit used */ \
	vpxor tt0, tt0, tt0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpand l0, t0, t0; \
	vpand l1, t1, t1; \
	vpand l2, t2, t2; \
	vpand l3, t3, t3; \
	\
	rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
	\
	vpxor l4, t0, l4; \
	vpbroadcastd krr, t0; /* only lowest 32-bit used */ \
	vmovdqu l4, 4 * 32(l); \
	vpxor l5, t1, l5; \
	vmovdqu l5, 5 * 32(l); \
	vpxor l6, t2, l6; \
	vmovdqu l6, 6 * 32(l); \
	vpxor l7, t3, l7; \
	vmovdqu l7, 7 * 32(l); \
	\
	/* \
	 * t2 = krr; \
	 * t2 |= rr; \
	 * rl ^= t2; \
	 */ \
	\
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpor 4 * 32(r), t0, t0; \
	vpor 5 * 32(r), t1, t1; \
	vpor 6 * 32(r), t2, t2; \
	vpor 7 * 32(r), t3, t3; \
	\
	vpxor 0 * 32(r), t0, t0; \
	vpxor 1 * 32(r), t1, t1; \
	vpxor 2 * 32(r), t2, t2; \
	vpxor 3 * 32(r), t3, t3; \
	vmovdqu t0, 0 * 32(r); \
	vpbroadcastd krl, t0; /* only lowest 32-bit used */ \
	vmovdqu t1, 1 * 32(r); \
	vmovdqu t2, 2 * 32(r); \
	vmovdqu t3, 3 * 32(r); \
	\
	/* \
	 * t2 = krl; \
	 * t2 &= rl; \
	 * rr ^= rol32(t2, 1); \
	 */ \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpand 0 * 32(r), t0, t0; \
	vpand 1 * 32(r), t1, t1; \
	vpand 2 * 32(r), t2, t2; \
	vpand 3 * 32(r), t3, t3; \
	\
	rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
	\
	vpxor 4 * 32(r), t0, t0; \
	vpxor 5 * 32(r), t1, t1; \
	vpxor 6 * 32(r), t2, t2; \
	vpxor 7 * 32(r), t3, t3; \
	vmovdqu t0, 4 * 32(r); \
	vpbroadcastd klr, t0; /* only lowest 32-bit used */ \
	vmovdqu t1, 5 * 32(r); \
	vmovdqu t2, 6 * 32(r); \
	vmovdqu t3, 7 * 32(r); \
	\
	/* \
	 * t0 = klr; \
	 * t0 |= lr; \
	 * ll ^= t0; \
	 */ \
	\
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpor l4, t0, t0; \
	vpor l5, t1, t1; \
	vpor l6, t2, t2; \
	vpor l7, t3, t3; \
	\
	vpxor l0, t0, l0; \
	vmovdqu l0, 0 * 32(l); \
	vpxor l1, t1, l1; \
	vmovdqu l1, 1 * 32(l); \
	vpxor l2, t2, l2; \
	vmovdqu l2, 2 * 32(l); \
	vpxor l3, t3, l3; \
	vmovdqu l3, 3 * 32(l);

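/*
 * transpose_4x4 transposes a 4x4 matrix of 32-bit lanes with the
 * unpack low/high dword and qword instructions.  byteslice_16x16b_fast
 * builds on it (plus the .Lshufb_16x16b in-lane byte shuffle) to
 * convert 16 registers between natural byte order and the byte-sliced
 * representation, spilling two registers to st0/st1 because all 16 ymm
 * registers are live.
 */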
#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x1, x0, x0; \
	\
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x2; \
	\
	vpunpckhqdq t1, x0, x1; \
	vpunpcklqdq t1, x0, x0; \
	\
	vpunpckhqdq x2, t2, x3; \
	vpunpcklqdq x2, t2, x2;

#define byteslice_16x16b_fast(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, \
			      a3, b3, c3, d3, st0, st1) \
	vmovdqu d2, st0; \
	vmovdqu d3, st1; \
	transpose_4x4(a0, a1, a2, a3, d2, d3); \
	transpose_4x4(b0, b1, b2, b3, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu a0, st0; \
	vmovdqu a1, st1; \
	transpose_4x4(c0, c1, c2, c3, a0, a1); \
	transpose_4x4(d0, d1, d2, d3, a0, a1); \
	\
	vbroadcasti128 .Lshufb_16x16b, a0; \
	vmovdqu st1, a1; \
	vpshufb a0, a2, a2; \
	vpshufb a0, a3, a3; \
	vpshufb a0, b0, b0; \
	vpshufb a0, b1, b1; \
	vpshufb a0, b2, b2; \
	vpshufb a0, b3, b3; \
	vpshufb a0, a1, a1; \
	vpshufb a0, c0, c0; \
	vpshufb a0, c1, c1; \
	vpshufb a0, c2, c2; \
	vpshufb a0, c3, c3; \
	vpshufb a0, d0, d0; \
	vpshufb a0, d1, d1; \
	vpshufb a0, d2, d2; \
	vpshufb a0, d3, d3; \
	vmovdqu d3, st1; \
	vmovdqu st0, d3; \
	vpshufb a0, d3, a0; \
	vmovdqu d2, st0; \
	\
	transpose_4x4(a0, b0, c0, d0, d2, d3); \
	transpose_4x4(a1, b1, c1, d1, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu b0, st0; \
	vmovdqu b1, st1; \
	transpose_4x4(a2, b2, c2, d2, b0, b1); \
	transpose_4x4(a3, b3, c3, d3, b0, b1); \
	vmovdqu st0, b0; \
	vmovdqu st1, b1; \
	/* does not adjust output bytes inside vectors */

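/*
 * inpack32_pre broadcasts the 64-bit whitening key, rearranges it with
 * .Lpack_bswap, and XORs it into all 32 input blocks; note the
 * registers are filled in reverse order (the first 32 bytes land in
 * y7), matching the register order expected by the round function.
 */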
/* load blocks to registers and apply pre-whitening */
#define inpack32_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, rio, key) \
	vpbroadcastq key, x0; \
	vpshufb .Lpack_bswap, x0, x0; \
	\
	vpxor 0 * 32(rio), x0, y7; \
	vpxor 1 * 32(rio), x0, y6; \
	vpxor 2 * 32(rio), x0, y5; \
	vpxor 3 * 32(rio), x0, y4; \
	vpxor 4 * 32(rio), x0, y3; \
	vpxor 5 * 32(rio), x0, y2; \
	vpxor 6 * 32(rio), x0, y1; \
	vpxor 7 * 32(rio), x0, y0; \
	vpxor 8 * 32(rio), x0, x7; \
	vpxor 9 * 32(rio), x0, x6; \
	vpxor 10 * 32(rio), x0, x5; \
	vpxor 11 * 32(rio), x0, x4; \
	vpxor 12 * 32(rio), x0, x3; \
	vpxor 13 * 32(rio), x0, x2; \
	vpxor 14 * 32(rio), x0, x1; \
	vpxor 15 * 32(rio), x0, x0;

/* byteslice pre-whitened blocks and store to temporary memory */
#define inpack32_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd) \
	byteslice_16x16b_fast(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, \
			      y4, y5, y6, y7, (mem_ab), (mem_cd)); \
	\
	vmovdqu x0, 0 * 32(mem_ab); \
	vmovdqu x1, 1 * 32(mem_ab); \
	vmovdqu x2, 2 * 32(mem_ab); \
	vmovdqu x3, 3 * 32(mem_ab); \
	vmovdqu x4, 4 * 32(mem_ab); \
	vmovdqu x5, 5 * 32(mem_ab); \
	vmovdqu x6, 6 * 32(mem_ab); \
	vmovdqu x7, 7 * 32(mem_ab); \
	vmovdqu y0, 0 * 32(mem_cd); \
	vmovdqu y1, 1 * 32(mem_cd); \
	vmovdqu y2, 2 * 32(mem_cd); \
	vmovdqu y3, 3 * 32(mem_cd); \
	vmovdqu y4, 4 * 32(mem_cd); \
	vmovdqu y5, 5 * 32(mem_cd); \
	vmovdqu y6, 6 * 32(mem_cd); \
	vmovdqu y7, 7 * 32(mem_cd);

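/*
 * outunpack32 temporarily spills x0 to stack_tmp0 so the post-whitening
 * key can be broadcast into x0, XORs the key into the other fifteen
 * registers, then folds the saved copy back in with the final vpxor.
 */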
/* de-byteslice, apply post-whitening and store blocks */
#define outunpack32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
		    y5, y6, y7, key, stack_tmp0, stack_tmp1) \
	byteslice_16x16b_fast(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, \
			      y3, y7, x3, x7, stack_tmp0, stack_tmp1); \
	\
	vmovdqu x0, stack_tmp0; \
	\
	vpbroadcastq key, x0; \
	vpshufb .Lpack_bswap, x0, x0; \
	\
	vpxor x0, y7, y7; \
	vpxor x0, y6, y6; \
	vpxor x0, y5, y5; \
	vpxor x0, y4, y4; \
	vpxor x0, y3, y3; \
	vpxor x0, y2, y2; \
	vpxor x0, y1, y1; \
	vpxor x0, y0, y0; \
	vpxor x0, x7, x7; \
	vpxor x0, x6, x6; \
	vpxor x0, x5, x5; \
	vpxor x0, x4, x4; \
	vpxor x0, x3, x3; \
	vpxor x0, x2, x2; \
	vpxor x0, x1, x1; \
	vpxor stack_tmp0, x0, x0;

#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, rio) \
	vmovdqu x0, 0 * 32(rio); \
	vmovdqu x1, 1 * 32(rio); \
	vmovdqu x2, 2 * 32(rio); \
	vmovdqu x3, 3 * 32(rio); \
	vmovdqu x4, 4 * 32(rio); \
	vmovdqu x5, 5 * 32(rio); \
	vmovdqu x6, 6 * 32(rio); \
	vmovdqu x7, 7 * 32(rio); \
	vmovdqu y0, 8 * 32(rio); \
	vmovdqu y1, 9 * 32(rio); \
	vmovdqu y2, 10 * 32(rio); \
	vmovdqu y3, 11 * 32(rio); \
	vmovdqu y4, 12 * 32(rio); \
	vmovdqu y5, 13 * 32(rio); \
	vmovdqu y6, 14 * 32(rio); \
	vmovdqu y7, 15 * 32(rio);


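/*
 * .Lshufb_16x16b groups byte idx of all four 32-bit words together
 * (selectors 0,4,8,12 / 1,5,9,13 / ...): the in-lane byte shuffle step
 * of the byte-slicing transpose.
 */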
.section	.rodata.cst32.shufb_16x16b, "aM", @progbits, 32
.align 32
#define SHUFB_BYTES(idx) \
	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
.Lshufb_16x16b:
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)

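/*
 * vpshufb constant for laying out the broadcast whitening key: it
 * byte-swaps each 32-bit half of the low quadword and zeroes the high
 * quadword of each 128-bit lane (0x80 selector bytes produce zero).
 */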
.section	.rodata.cst32.pack_bswap, "aM", @progbits, 32
.align 32
.Lpack_bswap:
	.long 0x00010203, 0x04050607, 0x80808080, 0x80808080
	.long 0x00010203, 0x04050607, 0x80808080, 0x80808080

/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
.section	.rodata.cst16, "aM", @progbits, 16
.align 16

/* For CTR-mode IV byteswap */
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

/* For XTS mode */
.Lxts_gf128mul_and_shl1_mask_0:
	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
.Lxts_gf128mul_and_shl1_mask_1:
	.byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0

/*
 * pre-SubByte transform
 *
 * pre-lookup for sbox1, sbox2, sbox3:
 *   swap_bitendianness(
 *       isom_map_camellia_to_aes(
 *           camellia_f(
 *               swap_bitendianness(in)
 *           )
 *       )
 *   )
 *
 * (note: '⊕ 0xc5' inside camellia_f())
 */
.Lpre_tf_lo_s1:
	.byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
	.byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
.Lpre_tf_hi_s1:
	.byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
	.byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23

/*
 * pre-SubByte transform
 *
 * pre-lookup for sbox4:
 *   swap_bitendianness(
 *       isom_map_camellia_to_aes(
 *           camellia_f(
 *               swap_bitendianness(in <<< 1)
 *           )
 *       )
 *   )
 *
 * (note: '⊕ 0xc5' inside camellia_f())
 */
.Lpre_tf_lo_s4:
	.byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
	.byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
.Lpre_tf_hi_s4:
	.byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
	.byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf

/*
 * post-SubByte transform
 *
 * post-lookup for sbox1, sbox4:
 *  swap_bitendianness(
 *      camellia_h(
 *          isom_map_aes_to_camellia(
 *              swap_bitendianness(
 *                  aes_inverse_affine_transform(in)
 *              )
 *          )
 *      )
 *  )
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s1:
	.byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
	.byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
.Lpost_tf_hi_s1:
	.byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
	.byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c

/*
 * post-SubByte transform
 *
 * post-lookup for sbox2:
 *  swap_bitendianness(
 *      camellia_h(
 *          isom_map_aes_to_camellia(
 *              swap_bitendianness(
 *                  aes_inverse_affine_transform(in)
 *              )
 *          )
 *      )
 *  ) <<< 1
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s2:
	.byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
	.byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
.Lpost_tf_hi_s2:
	.byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
	.byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18

/*
 * post-SubByte transform
 *
 * post-lookup for sbox3:
 *  swap_bitendianness(
 *      camellia_h(
 *          isom_map_aes_to_camellia(
 *              swap_bitendianness(
 *                  aes_inverse_affine_transform(in)
 *              )
 *          )
 *      )
 *  ) >>> 1
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s3:
	.byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
	.byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
.Lpost_tf_hi_s3:
	.byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
	.byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06

/* For isolating SubBytes from AESENCLAST, inverse shift row */
.Linv_shift_row:
	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03

.section	.rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
.align 4
/* 4-bit mask */
.L0f0f0f0f:
	.long 0x0f0f0f0f

.text

.align 8
SYM_FUNC_START_LOCAL(__camellia_enc_blk32)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rax: temporary storage, 512 bytes
	 *	%ymm0..%ymm15: 32 plaintext blocks
	 * output:
	 *	%ymm0..%ymm15: 32 encrypted blocks, order swapped:
	 *       7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  772) 	FRAME_BEGIN
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  773) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  774) 	leaq 8 * 32(%rax), %rcx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  775) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  776) 	inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  777) 		      %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  778) 		      %ymm15, %rax, %rcx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  779) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  780) 	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  781) 		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  782) 		     %ymm15, %rax, %rcx, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  783) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  784) 	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  785) 	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  786) 	      %ymm15,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  787) 	      ((key_table + (8) * 8) + 0)(CTX),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  788) 	      ((key_table + (8) * 8) + 4)(CTX),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  789) 	      ((key_table + (8) * 8) + 8)(CTX),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  790) 	      ((key_table + (8) * 8) + 12)(CTX));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  791) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  792) 	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  793) 		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  794) 		     %ymm15, %rax, %rcx, 8);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  795) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  796) 	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  797) 	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  798) 	      %ymm15,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  799) 	      ((key_table + (16) * 8) + 0)(CTX),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  800) 	      ((key_table + (16) * 8) + 4)(CTX),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  801) 	      ((key_table + (16) * 8) + 8)(CTX),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  802) 	      ((key_table + (16) * 8) + 12)(CTX));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  803) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  804) 	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  805) 		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  806) 		     %ymm15, %rax, %rcx, 16);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  807) 
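	/*
	 * Key-length dispatch, sketched in C (editor's hedged note on
	 * the %r8d logic below, not a kernel helper): a 128-bit key runs
	 * 3 x 6 = 18 rounds with its output whitening key at subkey
	 * index 24; 192/256-bit keys run a fourth block of rounds and
	 * take it at index 32.
	 *
	 *	int whitening_index(const struct camellia_ctx *ctx)
	 *	{
	 *		return ctx->key_length == 16 ? 24 : 32;
	 *	}
	 */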
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  808) 	movl $24, %r8d;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  809) 	cmpl $16, key_length(CTX);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  810) 	jne .Lenc_max32;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  811) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  812) .Lenc_done:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  813) 	/* load CD for output */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  814) 	vmovdqu 0 * 32(%rcx), %ymm8;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  815) 	vmovdqu 1 * 32(%rcx), %ymm9;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  816) 	vmovdqu 2 * 32(%rcx), %ymm10;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  817) 	vmovdqu 3 * 32(%rcx), %ymm11;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  818) 	vmovdqu 4 * 32(%rcx), %ymm12;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  819) 	vmovdqu 5 * 32(%rcx), %ymm13;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  820) 	vmovdqu 6 * 32(%rcx), %ymm14;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  821) 	vmovdqu 7 * 32(%rcx), %ymm15;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  822) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  823) 	outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  824) 		    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  825) 		    %ymm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 32(%rax));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  826) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  827) 	FRAME_END
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  828) 	ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  829) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  830) .align 8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  831) .Lenc_max32:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  832) 	movl $32, %r8d;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  833) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  834) 	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  835) 	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  836) 	      %ymm15,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  837) 	      ((key_table + (24) * 8) + 0)(CTX),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  838) 	      ((key_table + (24) * 8) + 4)(CTX),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  839) 	      ((key_table + (24) * 8) + 8)(CTX),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  840) 	      ((key_table + (24) * 8) + 12)(CTX));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  841) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  842) 	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  843) 		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  844) 		     %ymm15, %rax, %rcx, 24);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  845) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  846) 	jmp .Lenc_done;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  847) SYM_FUNC_END(__camellia_enc_blk32)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  848) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  849) .align 8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  850) SYM_FUNC_START_LOCAL(__camellia_dec_blk32)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  851) 	/* input:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  852) 	 *	%rdi: ctx, CTX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  853) 	 *	%rax: temporary storage, 512 bytes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  854) 	 *	%r8d: 24 for 16-byte key, 32 for larger
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  855) 	 *	%ymm0..%ymm15: 32 encrypted blocks
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  856) 	 * output:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  857) 	 *	%ymm0..%ymm15: 32 plaintext blocks, order swapped:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  858) 	 *       7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  859) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  860) 	FRAME_BEGIN
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  861) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  862) 	leaq 8 * 32(%rax), %rcx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  863) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  864) 	inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  865) 		      %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  866) 		      %ymm15, %rax, %rcx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  867) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  868) 	cmpl $32, %r8d;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  869) 	je .Ldec_max32;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  870) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  871) .Ldec_max24:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  872) 	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  873) 		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  874) 		     %ymm15, %rax, %rcx, 16);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  875) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  876) 	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  877) 	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  878) 	      %ymm15,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  879) 	      ((key_table + (16) * 8) + 8)(CTX),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  880) 	      ((key_table + (16) * 8) + 12)(CTX),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  881) 	      ((key_table + (16) * 8) + 0)(CTX),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  882) 	      ((key_table + (16) * 8) + 4)(CTX));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  883) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  884) 	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  885) 		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  886) 		     %ymm15, %rax, %rcx, 8);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  887) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  888) 	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  889) 	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  890) 	      %ymm15,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  891) 	      ((key_table + (8) * 8) + 8)(CTX),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  892) 	      ((key_table + (8) * 8) + 12)(CTX),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  893) 	      ((key_table + (8) * 8) + 0)(CTX),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  894) 	      ((key_table + (8) * 8) + 4)(CTX));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  895) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  896) 	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  897) 		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  898) 		     %ymm15, %rax, %rcx, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  899) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  900) 	/* load CD for output */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  901) 	vmovdqu 0 * 32(%rcx), %ymm8;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  902) 	vmovdqu 1 * 32(%rcx), %ymm9;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  903) 	vmovdqu 2 * 32(%rcx), %ymm10;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  904) 	vmovdqu 3 * 32(%rcx), %ymm11;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  905) 	vmovdqu 4 * 32(%rcx), %ymm12;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  906) 	vmovdqu 5 * 32(%rcx), %ymm13;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  907) 	vmovdqu 6 * 32(%rcx), %ymm14;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  908) 	vmovdqu 7 * 32(%rcx), %ymm15;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  909) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  910) 	outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  911) 		    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  912) 		    %ymm15, (key_table)(CTX), (%rax), 1 * 32(%rax));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  913) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  914) 	FRAME_END
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  915) 	ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  916) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  917) .align 8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  918) .Ldec_max32:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  919) 	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  920) 		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  921) 		     %ymm15, %rax, %rcx, 24);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  922) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  923) 	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  924) 	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  925) 	      %ymm15,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  926) 	      ((key_table + (24) * 8) + 8)(CTX),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  927) 	      ((key_table + (24) * 8) + 12)(CTX),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  928) 	      ((key_table + (24) * 8) + 0)(CTX),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  929) 	      ((key_table + (24) * 8) + 4)(CTX));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  930) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  931) 	jmp .Ldec_max24;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  932) SYM_FUNC_END(__camellia_dec_blk32)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  933) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  934) SYM_FUNC_START(camellia_ecb_enc_32way)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  935) 	/* input:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  936) 	 *	%rdi: ctx, CTX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  937) 	 *	%rsi: dst (32 blocks)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  938) 	 *	%rdx: src (32 blocks)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  939) 	 */
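	/*
	 * C-level view of this entry point (editor's hedged sketch that
	 * mirrors the register comment above; the authoritative
	 * prototype lives in the AVX2 glue code):
	 *
	 *	asmlinkage void camellia_ecb_enc_32way(struct camellia_ctx *ctx,
	 *					       u8 *dst, const u8 *src);
	 */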
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  940) 	FRAME_BEGIN
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  941) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  942) 	vzeroupper;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  943) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  944) 	inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  945) 		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  946) 		     %ymm15, %rdx, (key_table)(CTX));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  947) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  948) 	/* now dst can be used as a temporary buffer (even in the src == dst case) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  949) 	movq	%rsi, %rax;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  950) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  951) 	call __camellia_enc_blk32;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  952) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  953) 	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  954) 		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  955) 		     %ymm8, %rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  956) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  957) 	vzeroupper;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  958) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  959) 	FRAME_END
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  960) 	ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  961) SYM_FUNC_END(camellia_ecb_enc_32way)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  962) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  963) SYM_FUNC_START(camellia_ecb_dec_32way)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  964) 	/* input:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  965) 	 *	%rdi: ctx, CTX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  966) 	 *	%rsi: dst (32 blocks)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  967) 	 *	%rdx: src (32 blocks)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  968) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  969) 	FRAME_BEGIN
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  970) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  971) 	vzeroupper;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  972) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  973) 	cmpl $16, key_length(CTX);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  974) 	movl $32, %r8d;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  975) 	movl $24, %eax;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  976) 	cmovel %eax, %r8d; /* max */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  977) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  978) 	inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  979) 		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  980) 		     %ymm15, %rdx, (key_table)(CTX, %r8, 8));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  981) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  982) 	/* now dst can be used as a temporary buffer (even in the src == dst case) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  983) 	movq	%rsi, %rax;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  984) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  985) 	call __camellia_dec_blk32;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  986) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  987) 	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  988) 		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  989) 		     %ymm8, %rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  990) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  991) 	vzeroupper;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  992) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  993) 	FRAME_END
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  994) 	ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  995) SYM_FUNC_END(camellia_ecb_dec_32way)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  996) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  997) SYM_FUNC_START(camellia_cbc_dec_32way)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  998) 	/* input:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  999) 	 *	%rdi: ctx, CTX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1000) 	 *	%rsi: dst (32 blocks)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1001) 	 *	%rdx: src (32 blocks)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1002) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1003) 	FRAME_BEGIN
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1004) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1005) 	vzeroupper;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1006) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1007) 	cmpl $16, key_length(CTX);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1008) 	movl $32, %r8d;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1009) 	movl $24, %eax;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1010) 	cmovel %eax, %r8d; /* max */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1011) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1012) 	inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1013) 		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1014) 		     %ymm15, %rdx, (key_table)(CTX, %r8, 8));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1015) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1016) 	movq %rsp, %r10;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1017) 	cmpq %rsi, %rdx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1018) 	je .Lcbc_dec_use_stack;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1019) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1020) 	/* dst can be used as temporary storage; src is not overwritten. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1021) 	movq %rsi, %rax;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1022) 	jmp .Lcbc_dec_continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1023) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1024) .Lcbc_dec_use_stack:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1025) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1026) 	 * dst is still in use (because dst == src), so use the stack for
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1027) 	 * temporary storage.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1028) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1029) 	subq $(16 * 32), %rsp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1030) 	movq %rsp, %rax;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1031) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1032) .Lcbc_dec_continue:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1033) 	call __camellia_dec_blk32;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1034) 
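	/*
	 * CBC decryption chaining, sketched in C (editor's hedged
	 * illustration, not the glue-code API): plaintext block i is
	 * D(C[i]) ^ C[i-1], so each decrypted block below is XORed with
	 * the ciphertext block 16 bytes earlier in src.  The very first
	 * block has no in-buffer predecessor, which is why only the
	 * high lane of %ymm7 gets XORed; the glue code adds the IV to
	 * block 0.
	 *
	 *	for (i = nblocks - 1; i >= 1; i--)
	 *		dst[i] = decrypt_block(ctx, src[i]) ^ src[i - 1];
	 *	dst[0] = decrypt_block(ctx, src[0]);
	 */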
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1035) 	vmovdqu %ymm7, (%rax);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1036) 	vpxor %ymm7, %ymm7, %ymm7;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1037) 	vinserti128 $1, (%rdx), %ymm7, %ymm7;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1038) 	vpxor (%rax), %ymm7, %ymm7;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1039) 	movq %r10, %rsp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1040) 	vpxor (0 * 32 + 16)(%rdx), %ymm6, %ymm6;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1041) 	vpxor (1 * 32 + 16)(%rdx), %ymm5, %ymm5;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1042) 	vpxor (2 * 32 + 16)(%rdx), %ymm4, %ymm4;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1043) 	vpxor (3 * 32 + 16)(%rdx), %ymm3, %ymm3;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1044) 	vpxor (4 * 32 + 16)(%rdx), %ymm2, %ymm2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1045) 	vpxor (5 * 32 + 16)(%rdx), %ymm1, %ymm1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1046) 	vpxor (6 * 32 + 16)(%rdx), %ymm0, %ymm0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1047) 	vpxor (7 * 32 + 16)(%rdx), %ymm15, %ymm15;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1048) 	vpxor (8 * 32 + 16)(%rdx), %ymm14, %ymm14;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1049) 	vpxor (9 * 32 + 16)(%rdx), %ymm13, %ymm13;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1050) 	vpxor (10 * 32 + 16)(%rdx), %ymm12, %ymm12;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1051) 	vpxor (11 * 32 + 16)(%rdx), %ymm11, %ymm11;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1052) 	vpxor (12 * 32 + 16)(%rdx), %ymm10, %ymm10;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1053) 	vpxor (13 * 32 + 16)(%rdx), %ymm9, %ymm9;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1054) 	vpxor (14 * 32 + 16)(%rdx), %ymm8, %ymm8;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1055) 	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1056) 		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1057) 		     %ymm8, %rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1058) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1059) 	vzeroupper;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1060) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1061) 	FRAME_END
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1062) 	ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1063) SYM_FUNC_END(camellia_cbc_dec_32way)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1064) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1065) #define inc_le128(x, minus_one, tmp) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1066) 	vpcmpeqq minus_one, x, tmp; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1067) 	vpsubq minus_one, x, x; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1068) 	vpslldq $8, tmp, tmp; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1069) 	vpsubq tmp, x, x;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1070) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1071) #define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1072) 	vpcmpeqq minus_one, x, tmp1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1073) 	vpcmpeqq minus_two, x, tmp2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1074) 	vpsubq minus_two, x, x; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1075) 	vpor tmp2, tmp1, tmp1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1076) 	vpslldq $8, tmp1, tmp1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1077) 	vpsubq tmp1, x, x;
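/*
 * The two helpers above, restated in scalar C (editor's hedged sketch;
 * struct u128 is illustrative, not a kernel type).  vpcmpeqq builds an
 * all-ones mask in the low qword exactly when that qword is about to
 * wrap, vpsubq of minus_one/minus_two performs the addition, and the
 * mask, shifted into the high qword by vpslldq, is subtracted to
 * propagate the carry:
 *
 *	struct u128 { uint64_t lo, hi; };
 *
 *	void inc_le128(struct u128 *x)
 *	{
 *		x->lo += 1;
 *		x->hi += (x->lo == 0);
 *	}
 *
 *	void add2_le128(struct u128 *x)
 *	{
 *		x->hi += (x->lo >= (uint64_t)-2);
 *		x->lo += 2;
 *	}
 */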
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1078) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1079) SYM_FUNC_START(camellia_ctr_32way)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1080) 	/* input:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1081) 	 *	%rdi: ctx, CTX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1082) 	 *	%rsi: dst (32 blocks)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1083) 	 *	%rdx: src (32 blocks)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1084) 	 *	%rcx: iv (little-endian, 128-bit)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1085) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1086) 	FRAME_BEGIN
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1087) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1088) 	vzeroupper;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1089) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1090) 	movq %rsp, %r10;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1091) 	cmpq %rsi, %rdx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1092) 	je .Lctr_use_stack;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1093) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1094) 	/* dst can be used as temporary storage, src is not overwritten. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1095) 	movq %rsi, %rax;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1096) 	jmp .Lctr_continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1097) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1098) .Lctr_use_stack:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1099) 	subq $(16 * 32), %rsp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1100) 	movq %rsp, %rax;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1101) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1102) .Lctr_continue:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1103) 	vpcmpeqd %ymm15, %ymm15, %ymm15;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1104) 	vpsrldq $8, %ymm15, %ymm15; /* ab: -1:0 ; cd: -1:0 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1105) 	vpaddq %ymm15, %ymm15, %ymm12; /* ab: -2:0 ; cd: -2:0 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1106) 
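	/*
	 * Counter handling, sketched in C (editor's hedged outline;
	 * load_le128/bswap128/store_le128 are illustrative helpers and
	 * GCC's unsigned __int128 stands in for the two-qword vector
	 * arithmetic).  The counter stays little-endian so the adds
	 * above are cheap, each value is byte-swapped to CTR's
	 * big-endian block layout before encryption, and the encrypted
	 * counters are finally XORed with src:
	 *
	 *	unsigned __int128 ctr = load_le128(iv);
	 *	for (i = 0; i < 32; i++)
	 *		ks[i] = bswap128(ctr + i);
	 *	store_le128(iv, ctr + 32);
	 *	encrypt32(ks);
	 *	for (i = 0; i < 32; i++)
	 *		dst[i] = ks[i] ^ src[i];
	 */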
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1107) 	/* load IV and byteswap */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1108) 	vmovdqu (%rcx), %xmm0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1109) 	vmovdqa %xmm0, %xmm1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1110) 	inc_le128(%xmm0, %xmm15, %xmm14);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1111) 	vbroadcasti128 .Lbswap128_mask, %ymm14;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1112) 	vinserti128 $1, %xmm0, %ymm1, %ymm0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1113) 	vpshufb %ymm14, %ymm0, %ymm13;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1114) 	vmovdqu %ymm13, 15 * 32(%rax);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1115) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1116) 	/* construct IVs */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1117) 	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); /* ab:le2 ; cd:le3 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1118) 	vpshufb %ymm14, %ymm0, %ymm13;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1119) 	vmovdqu %ymm13, 14 * 32(%rax);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1120) 	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1121) 	vpshufb %ymm14, %ymm0, %ymm13;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1122) 	vmovdqu %ymm13, 13 * 32(%rax);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1123) 	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1124) 	vpshufb %ymm14, %ymm0, %ymm13;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1125) 	vmovdqu %ymm13, 12 * 32(%rax);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1126) 	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1127) 	vpshufb %ymm14, %ymm0, %ymm13;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1128) 	vmovdqu %ymm13, 11 * 32(%rax);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1129) 	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1130) 	vpshufb %ymm14, %ymm0, %ymm10;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1131) 	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1132) 	vpshufb %ymm14, %ymm0, %ymm9;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1133) 	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1134) 	vpshufb %ymm14, %ymm0, %ymm8;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1135) 	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1136) 	vpshufb %ymm14, %ymm0, %ymm7;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1137) 	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1138) 	vpshufb %ymm14, %ymm0, %ymm6;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1139) 	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1140) 	vpshufb %ymm14, %ymm0, %ymm5;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1141) 	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1142) 	vpshufb %ymm14, %ymm0, %ymm4;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1143) 	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1144) 	vpshufb %ymm14, %ymm0, %ymm3;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1145) 	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1146) 	vpshufb %ymm14, %ymm0, %ymm2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1147) 	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1148) 	vpshufb %ymm14, %ymm0, %ymm1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1149) 	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1150) 	vextracti128 $1, %ymm0, %xmm13;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1151) 	vpshufb %ymm14, %ymm0, %ymm0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1152) 	inc_le128(%xmm13, %xmm15, %xmm14);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1153) 	vmovdqu %xmm13, (%rcx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1154) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1155) 	/* inpack32_pre: */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1156) 	vpbroadcastq (key_table)(CTX), %ymm15;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1157) 	vpshufb .Lpack_bswap, %ymm15, %ymm15;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1158) 	vpxor %ymm0, %ymm15, %ymm0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1159) 	vpxor %ymm1, %ymm15, %ymm1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1160) 	vpxor %ymm2, %ymm15, %ymm2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1161) 	vpxor %ymm3, %ymm15, %ymm3;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1162) 	vpxor %ymm4, %ymm15, %ymm4;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1163) 	vpxor %ymm5, %ymm15, %ymm5;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1164) 	vpxor %ymm6, %ymm15, %ymm6;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1165) 	vpxor %ymm7, %ymm15, %ymm7;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1166) 	vpxor %ymm8, %ymm15, %ymm8;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1167) 	vpxor %ymm9, %ymm15, %ymm9;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1168) 	vpxor %ymm10, %ymm15, %ymm10;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1169) 	vpxor 11 * 32(%rax), %ymm15, %ymm11;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1170) 	vpxor 12 * 32(%rax), %ymm15, %ymm12;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1171) 	vpxor 13 * 32(%rax), %ymm15, %ymm13;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1172) 	vpxor 14 * 32(%rax), %ymm15, %ymm14;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1173) 	vpxor 15 * 32(%rax), %ymm15, %ymm15;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1174) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1175) 	call __camellia_enc_blk32;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1176) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1177) 	movq %r10, %rsp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1178) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1179) 	vpxor 0 * 32(%rdx), %ymm7, %ymm7;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1180) 	vpxor 1 * 32(%rdx), %ymm6, %ymm6;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1181) 	vpxor 2 * 32(%rdx), %ymm5, %ymm5;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1182) 	vpxor 3 * 32(%rdx), %ymm4, %ymm4;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1183) 	vpxor 4 * 32(%rdx), %ymm3, %ymm3;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1184) 	vpxor 5 * 32(%rdx), %ymm2, %ymm2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1185) 	vpxor 6 * 32(%rdx), %ymm1, %ymm1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1186) 	vpxor 7 * 32(%rdx), %ymm0, %ymm0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1187) 	vpxor 8 * 32(%rdx), %ymm15, %ymm15;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1188) 	vpxor 9 * 32(%rdx), %ymm14, %ymm14;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1189) 	vpxor 10 * 32(%rdx), %ymm13, %ymm13;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1190) 	vpxor 11 * 32(%rdx), %ymm12, %ymm12;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1191) 	vpxor 12 * 32(%rdx), %ymm11, %ymm11;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1192) 	vpxor 13 * 32(%rdx), %ymm10, %ymm10;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1193) 	vpxor 14 * 32(%rdx), %ymm9, %ymm9;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1194) 	vpxor 15 * 32(%rdx), %ymm8, %ymm8;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1195) 	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1196) 		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1197) 		     %ymm8, %rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1198) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1199) 	vzeroupper;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1200) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1201) 	FRAME_END
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1202) 	ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1203) SYM_FUNC_END(camellia_ctr_32way)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1204) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1205) #define gf128mul_x_ble(iv, mask, tmp) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1206) 	vpsrad $31, iv, tmp; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1207) 	vpaddq iv, iv, iv; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1208) 	vpshufd $0x13, tmp, tmp; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1209) 	vpand mask, tmp, tmp; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1210) 	vpxor tmp, iv, iv;
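/*
 * Editor's scalar C sketch of the doubling above (hedged; struct gf128
 * is illustrative).  Multiplying an XTS tweak by x in GF(2^128), in the
 * little-endian "ble" convention, is a 128-bit left shift that folds
 * the reduction constant 0x87 into the low byte on carry out of
 * bit 127; the vpsrad/vpshufd $0x13/vpand sequence computes the same
 * thing without any cross-qword shift instruction:
 *
 *	struct gf128 { uint64_t lo, hi; };
 *
 *	void gf128mul_x_ble(struct gf128 *t)
 *	{
 *		uint64_t carry = t->hi >> 63;
 *
 *		t->hi = (t->hi << 1) | (t->lo >> 63);
 *		t->lo = (t->lo << 1) ^ (0x87 * carry);
 *	}
 */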
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1211) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1212) #define gf128mul_x2_ble(iv, mask1, mask2, tmp0, tmp1) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1213) 	vpsrad $31, iv, tmp0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1214) 	vpaddq iv, iv, tmp1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1215) 	vpsllq $2, iv, iv; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1216) 	vpshufd $0x13, tmp0, tmp0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1217) 	vpsrad $31, tmp1, tmp1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1218) 	vpand mask2, tmp0, tmp0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1219) 	vpshufd $0x13, tmp1, tmp1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1220) 	vpxor tmp0, iv, iv; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1221) 	vpand mask1, tmp1, tmp1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1222) 	vpxor tmp1, iv, iv;
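/*
 * The x2 variant fuses two doublings (multiply by x^2): vpsllq shifts
 * by two at once and the two masked terms fold both potential carries.
 * Functionally it is just (editor's hedged sketch, reusing the helper
 * above):
 *
 *	void gf128mul_x2_ble(struct gf128 *t)
 *	{
 *		gf128mul_x_ble(t);
 *		gf128mul_x_ble(t);
 *	}
 */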
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1223) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1224) .align 8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1225) SYM_FUNC_START_LOCAL(camellia_xts_crypt_32way)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1226) 	/* input:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1227) 	 *	%rdi: ctx, CTX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1228) 	 *	%rsi: dst (32 blocks)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1229) 	 *	%rdx: src (32 blocks)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1230) 	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1231) 	 *	%r8: index for input whitening key
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1232) 	 *	%r9: pointer to __camellia_enc_blk32 or __camellia_dec_blk32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1233) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1234) 	FRAME_BEGIN
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1235) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1236) 	vzeroupper;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1237) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1238) 	subq $(16 * 32), %rsp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1239) 	movq %rsp, %rax;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1240) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1241) 	vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_0, %ymm12;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1242) 
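	/*
	 * Shape of this routine, sketched in C (editor's hedged
	 * outline; buf is the stack area at %rax and crypt32 is
	 * whichever of __camellia_enc_blk32/__camellia_dec_blk32 %r9
	 * points at).  Block i uses tweak T[i] = T[0] * x^i; each tweak
	 * is parked in dst while src[i] ^ T[i] is queued for the
	 * cipher, so the final pass can form E(x ^ T) ^ T in place:
	 *
	 *	for (i = 0; i < 32; i++) {
	 *		buf[i] = src[i] ^ t;
	 *		dst[i] = t;
	 *		gf128mul_x_ble(&t);
	 *	}
	 *	crypt32(buf);
	 *	for (i = 0; i < 32; i++)
	 *		dst[i] = buf[i] ^ dst[i];
	 */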
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1243) 	/* load IV and construct second IV */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1244) 	vmovdqu (%rcx), %xmm0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1245) 	vmovdqa %xmm0, %xmm15;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1246) 	gf128mul_x_ble(%xmm0, %xmm12, %xmm13);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1247) 	vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_1, %ymm13;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1248) 	vinserti128 $1, %xmm0, %ymm15, %ymm0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1249) 	vpxor 0 * 32(%rdx), %ymm0, %ymm15;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1250) 	vmovdqu %ymm15, 15 * 32(%rax);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1251) 	vmovdqu %ymm0, 0 * 32(%rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1252) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1253) 	/* construct IVs */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1254) 	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1255) 	vpxor 1 * 32(%rdx), %ymm0, %ymm15;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1256) 	vmovdqu %ymm15, 14 * 32(%rax);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1257) 	vmovdqu %ymm0, 1 * 32(%rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1258) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1259) 	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1260) 	vpxor 2 * 32(%rdx), %ymm0, %ymm15;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1261) 	vmovdqu %ymm15, 13 * 32(%rax);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1262) 	vmovdqu %ymm0, 2 * 32(%rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1263) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1264) 	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1265) 	vpxor 3 * 32(%rdx), %ymm0, %ymm15;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1266) 	vmovdqu %ymm15, 12 * 32(%rax);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1267) 	vmovdqu %ymm0, 3 * 32(%rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1268) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1269) 	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1270) 	vpxor 4 * 32(%rdx), %ymm0, %ymm11;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1271) 	vmovdqu %ymm0, 4 * 32(%rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1272) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1273) 	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1274) 	vpxor 5 * 32(%rdx), %ymm0, %ymm10;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1275) 	vmovdqu %ymm0, 5 * 32(%rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1276) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1277) 	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1278) 	vpxor 6 * 32(%rdx), %ymm0, %ymm9;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1279) 	vmovdqu %ymm0, 6 * 32(%rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1280) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1281) 	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1282) 	vpxor 7 * 32(%rdx), %ymm0, %ymm8;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1283) 	vmovdqu %ymm0, 7 * 32(%rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1284) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1285) 	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1286) 	vpxor 8 * 32(%rdx), %ymm0, %ymm7;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1287) 	vmovdqu %ymm0, 8 * 32(%rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1288) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1289) 	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1290) 	vpxor 9 * 32(%rdx), %ymm0, %ymm6;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1291) 	vmovdqu %ymm0, 9 * 32(%rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1292) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1293) 	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1294) 	vpxor 10 * 32(%rdx), %ymm0, %ymm5;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1295) 	vmovdqu %ymm0, 10 * 32(%rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1296) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1297) 	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1298) 	vpxor 11 * 32(%rdx), %ymm0, %ymm4;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1299) 	vmovdqu %ymm0, 11 * 32(%rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1300) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1301) 	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1302) 	vpxor 12 * 32(%rdx), %ymm0, %ymm3;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1303) 	vmovdqu %ymm0, 12 * 32(%rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1304) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1305) 	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1306) 	vpxor 13 * 32(%rdx), %ymm0, %ymm2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1307) 	vmovdqu %ymm0, 13 * 32(%rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1308) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1309) 	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1310) 	vpxor 14 * 32(%rdx), %ymm0, %ymm1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1311) 	vmovdqu %ymm0, 14 * 32(%rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1312) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1313) 	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1314) 	vpxor 15 * 32(%rdx), %ymm0, %ymm15;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1315) 	vmovdqu %ymm15, 0 * 32(%rax);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1316) 	vmovdqu %ymm0, 15 * 32(%rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1317) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1318) 	vextracti128 $1, %ymm0, %xmm0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1319) 	gf128mul_x_ble(%xmm0, %xmm12, %xmm15);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1320) 	vmovdqu %xmm0, (%rcx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1321) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1322) 	/* inpack32_pre: */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1323) 	vpbroadcastq (key_table)(CTX, %r8, 8), %ymm15;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1324) 	vpshufb .Lpack_bswap, %ymm15, %ymm15;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1325) 	vpxor 0 * 32(%rax), %ymm15, %ymm0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1326) 	vpxor %ymm1, %ymm15, %ymm1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1327) 	vpxor %ymm2, %ymm15, %ymm2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1328) 	vpxor %ymm3, %ymm15, %ymm3;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1329) 	vpxor %ymm4, %ymm15, %ymm4;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1330) 	vpxor %ymm5, %ymm15, %ymm5;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1331) 	vpxor %ymm6, %ymm15, %ymm6;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1332) 	vpxor %ymm7, %ymm15, %ymm7;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1333) 	vpxor %ymm8, %ymm15, %ymm8;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1334) 	vpxor %ymm9, %ymm15, %ymm9;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1335) 	vpxor %ymm10, %ymm15, %ymm10;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1336) 	vpxor %ymm11, %ymm15, %ymm11;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1337) 	vpxor 12 * 32(%rax), %ymm15, %ymm12;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1338) 	vpxor 13 * 32(%rax), %ymm15, %ymm13;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1339) 	vpxor 14 * 32(%rax), %ymm15, %ymm14;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1340) 	vpxor 15 * 32(%rax), %ymm15, %ymm15;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1341) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1342) 	CALL_NOSPEC r9;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1343) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1344) 	addq $(16 * 32), %rsp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1345) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1346) 	vpxor 0 * 32(%rsi), %ymm7, %ymm7;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1347) 	vpxor 1 * 32(%rsi), %ymm6, %ymm6;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1348) 	vpxor 2 * 32(%rsi), %ymm5, %ymm5;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1349) 	vpxor 3 * 32(%rsi), %ymm4, %ymm4;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1350) 	vpxor 4 * 32(%rsi), %ymm3, %ymm3;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1351) 	vpxor 5 * 32(%rsi), %ymm2, %ymm2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1352) 	vpxor 6 * 32(%rsi), %ymm1, %ymm1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1353) 	vpxor 7 * 32(%rsi), %ymm0, %ymm0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1354) 	vpxor 8 * 32(%rsi), %ymm15, %ymm15;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1355) 	vpxor 9 * 32(%rsi), %ymm14, %ymm14;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1356) 	vpxor 10 * 32(%rsi), %ymm13, %ymm13;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1357) 	vpxor 11 * 32(%rsi), %ymm12, %ymm12;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1358) 	vpxor 12 * 32(%rsi), %ymm11, %ymm11;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1359) 	vpxor 13 * 32(%rsi), %ymm10, %ymm10;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1360) 	vpxor 14 * 32(%rsi), %ymm9, %ymm9;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1361) 	vpxor 15 * 32(%rsi), %ymm8, %ymm8;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1362) 	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1363) 		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1364) 		     %ymm8, %rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1365) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1366) 	vzeroupper;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1367) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1368) 	FRAME_END
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1369) 	ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1370) SYM_FUNC_END(camellia_xts_crypt_32way)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1371) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1372) SYM_FUNC_START(camellia_xts_enc_32way)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1373) 	/* input:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1374) 	 *	%rdi: ctx, CTX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1375) 	 *	%rsi: dst (32 blocks)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1376) 	 *	%rdx: src (32 blocks)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1377) 	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1378) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1379) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1380) 	xorl %r8d, %r8d; /* input whitening key, 0 for enc */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1381) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1382) 	leaq __camellia_enc_blk32, %r9;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1383) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1384) 	jmp camellia_xts_crypt_32way;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1385) SYM_FUNC_END(camellia_xts_enc_32way)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1386) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1387) SYM_FUNC_START(camellia_xts_dec_32way)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1388) 	/* input:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1389) 	 *	%rdi: ctx, CTX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1390) 	 *	%rsi: dst (32 blocks)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1391) 	 *	%rdx: src (32 blocks)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1392) 	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1393) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1394) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1395) 	cmpl $16, key_length(CTX);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1396) 	movl $32, %r8d;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1397) 	movl $24, %eax;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1398) 	cmovel %eax, %r8d;  /* input whitening key, last for dec */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1399) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1400) 	leaq __camellia_dec_blk32, %r9;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1401) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1402) 	jmp camellia_xts_crypt_32way;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1403) SYM_FUNC_END(camellia_xts_dec_32way)