Orange Pi5 kernel

Deprecated Linux kernel 5.10.110 for OrangePi 5/5B/5+ boards

3 Commits   0 Branches   0 Tags
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300    1) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300    2)  * x86_64/AVX/AES-NI assembler implementation of Camellia
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300    3)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300    4)  * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300    5)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300    6)  * This program is free software; you can redistribute it and/or modify
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300    7)  * it under the terms of the GNU General Public License as published by
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300    8)  * the Free Software Foundation; either version 2 of the License, or
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300    9)  * (at your option) any later version.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   10)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   11)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   12) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   13) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   14)  * Version licensed under 2-clause BSD License is available at:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   15)  *	http://koti.mbnet.fi/axh/crypto/camellia-BSD-1.2.0-aesni1.tar.xz
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   16)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   17) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   18) #include <linux/linkage.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   19) #include <asm/frame.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   20) #include <asm/nospec-branch.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   21) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   22) #define CAMELLIA_TABLE_BYTE_LEN 272
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   23) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   24) /* struct camellia_ctx: */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   25) #define key_table 0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   26) #define key_length CAMELLIA_TABLE_BYTE_LEN
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   27) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   28) /* register macros */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   29) #define CTX %rdi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   30) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   31) /**********************************************************************
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   32)   16-way camellia
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   33)  **********************************************************************/
/*
 * filter_8bit: byte-wise dual 4-bit table lookup on x.
 * tmp0 = low nibbles of x (vpand with mask4bit == 0x0f per byte, see the
 * .L0f0f0f0f broadcast at the callers); x = high nibbles shifted down.
 * Each nibble indexes a 16-entry table (lo_t / hi_t) via vpshufb and the
 * two lookups are xored back into x.  AT&T operand order: op src2, src1, dst.
 * Clobbers: tmp0.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   34) #define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   35) 	vpand x, mask4bit, tmp0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   36) 	vpandn x, mask4bit, x; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   37) 	vpsrld $4, x, x; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   38) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   39) 	vpshufb tmp0, lo_t, tmp0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   40) 	vpshufb x, hi_t, x; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   41) 	vpxor tmp0, x, x;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   42) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   43) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   44)  * IN:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   45)  *   x0..x7: byte-sliced AB state
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   46)  *   mem_cd: register pointer storing CD state
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   47)  *   key: index for key material
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   48)  * OUT:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   49)  *   x0..x7: new byte-sliced CD state
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   50)  */
/*
 * roundsm16: one Camellia round applied to 16 blocks in parallel
 * (byte-sliced across eight xmm registers).
 *
 * Register roles:
 *   x0..x7  - byte-sliced AB state in; new byte-sliced CD state out
 *   t0..t7  - scratch; t7 first holds the 0x0f nibble mask
 *             (.L0f0f0f0f broadcast), later a replicated subkey byte
 *   mem_cd  - GP register pointing at the 8 x 16-byte CD state in memory
 *   key     - memory operand holding the 64-bit round subkey (vmovq load)
 *
 * Flow: AES inverse ShiftRows + per-sbox prefilter tables map the bytes
 * into the AES S-box domain; vaesenclast with an all-zero round key (t4
 * zeroed via vpxor) performs AES SubBytes + ShiftRows; postfilter tables
 * map back to the four Camellia s-boxes.  The P-function is then computed
 * as a sequence of cross-slice vpxors, and finally each output slice is
 * xored with its subkey byte (splatted with vpsrldq + vpshufb against the
 * zeroed t6) and with the CD state loaded from mem_cd.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   51) #define roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   52) 		  t7, mem_cd, key) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   53) 	/* \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   54) 	 * S-function with AES subbytes \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   55) 	 */ \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   56) 	vmovdqa .Linv_shift_row, t4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   57) 	vbroadcastss .L0f0f0f0f, t7; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   58) 	vmovdqa .Lpre_tf_lo_s1, t0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   59) 	vmovdqa .Lpre_tf_hi_s1, t1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   60) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   61) 	/* AES inverse shift rows */ \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   62) 	vpshufb t4, x0, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   63) 	vpshufb t4, x7, x7; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   64) 	vpshufb t4, x1, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   65) 	vpshufb t4, x4, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   66) 	vpshufb t4, x2, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   67) 	vpshufb t4, x5, x5; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   68) 	vpshufb t4, x3, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   69) 	vpshufb t4, x6, x6; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   70) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   71) 	/* prefilter sboxes 1, 2 and 3 */ \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   72) 	vmovdqa .Lpre_tf_lo_s4, t2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   73) 	vmovdqa .Lpre_tf_hi_s4, t3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   74) 	filter_8bit(x0, t0, t1, t7, t6); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   75) 	filter_8bit(x7, t0, t1, t7, t6); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   76) 	filter_8bit(x1, t0, t1, t7, t6); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   77) 	filter_8bit(x4, t0, t1, t7, t6); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   78) 	filter_8bit(x2, t0, t1, t7, t6); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   79) 	filter_8bit(x5, t0, t1, t7, t6); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   80) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   81) 	/* prefilter sbox 4 */ \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   82) 	vpxor t4, t4, t4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   83) 	filter_8bit(x3, t2, t3, t7, t6); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   84) 	filter_8bit(x6, t2, t3, t7, t6); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   85) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   86) 	/* AES subbytes + AES shift rows */ \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   87) 	vmovdqa .Lpost_tf_lo_s1, t0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   88) 	vmovdqa .Lpost_tf_hi_s1, t1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   89) 	vaesenclast t4, x0, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   90) 	vaesenclast t4, x7, x7; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   91) 	vaesenclast t4, x1, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   92) 	vaesenclast t4, x4, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   93) 	vaesenclast t4, x2, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   94) 	vaesenclast t4, x5, x5; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   95) 	vaesenclast t4, x3, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   96) 	vaesenclast t4, x6, x6; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   97) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   98) 	/* postfilter sboxes 1 and 4 */ \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   99) 	vmovdqa .Lpost_tf_lo_s3, t2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  100) 	vmovdqa .Lpost_tf_hi_s3, t3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  101) 	filter_8bit(x0, t0, t1, t7, t6); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  102) 	filter_8bit(x7, t0, t1, t7, t6); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  103) 	filter_8bit(x3, t0, t1, t7, t6); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  104) 	filter_8bit(x6, t0, t1, t7, t6); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  105) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  106) 	/* postfilter sbox 3 */ \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  107) 	vmovdqa .Lpost_tf_lo_s2, t4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  108) 	vmovdqa .Lpost_tf_hi_s2, t5; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  109) 	filter_8bit(x2, t2, t3, t7, t6); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  110) 	filter_8bit(x5, t2, t3, t7, t6); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  111) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  112) 	vpxor t6, t6, t6; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  113) 	vmovq key, t0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  114) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  115) 	/* postfilter sbox 2 */ \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  116) 	filter_8bit(x1, t4, t5, t7, t2); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  117) 	filter_8bit(x4, t4, t5, t7, t2); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  118) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  119) 	vpsrldq $5, t0, t5; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  120) 	vpsrldq $1, t0, t1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  121) 	vpsrldq $2, t0, t2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  122) 	vpsrldq $3, t0, t3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  123) 	vpsrldq $4, t0, t4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  124) 	vpshufb t6, t0, t0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  125) 	vpshufb t6, t1, t1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  126) 	vpshufb t6, t2, t2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  127) 	vpshufb t6, t3, t3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  128) 	vpshufb t6, t4, t4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  129) 	vpsrldq $2, t5, t7; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  130) 	vpshufb t6, t7, t7; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  131) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  132) 	/* \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  133) 	 * P-function \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  134) 	 */ \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  135) 	vpxor x5, x0, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  136) 	vpxor x6, x1, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  137) 	vpxor x7, x2, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  138) 	vpxor x4, x3, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  139) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  140) 	vpxor x2, x4, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  141) 	vpxor x3, x5, x5; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  142) 	vpxor x0, x6, x6; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  143) 	vpxor x1, x7, x7; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  144) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  145) 	vpxor x7, x0, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  146) 	vpxor x4, x1, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  147) 	vpxor x5, x2, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  148) 	vpxor x6, x3, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  149) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  150) 	vpxor x3, x4, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  151) 	vpxor x0, x5, x5; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  152) 	vpxor x1, x6, x6; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  153) 	vpxor x2, x7, x7; /* note: high and low parts swapped */ \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  154) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  155) 	/* \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  156) 	 * Add key material and result to CD (x becomes new CD) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  157) 	 */ \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  158) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  159) 	vpxor t3, x4, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  160) 	vpxor 0 * 16(mem_cd), x4, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  161) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  162) 	vpxor t2, x5, x5; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  163) 	vpxor 1 * 16(mem_cd), x5, x5; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  164) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  165) 	vpsrldq $1, t5, t3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  166) 	vpshufb t6, t5, t5; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  167) 	vpshufb t6, t3, t6; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  168) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  169) 	vpxor t1, x6, x6; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  170) 	vpxor 2 * 16(mem_cd), x6, x6; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  171) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  172) 	vpxor t0, x7, x7; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  173) 	vpxor 3 * 16(mem_cd), x7, x7; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  174) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  175) 	vpxor t7, x0, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  176) 	vpxor 4 * 16(mem_cd), x0, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  177) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  178) 	vpxor t6, x1, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  179) 	vpxor 5 * 16(mem_cd), x1, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  180) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  181) 	vpxor t5, x2, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  182) 	vpxor 6 * 16(mem_cd), x2, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  183) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  184) 	vpxor t4, x3, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  185) 	vpxor 7 * 16(mem_cd), x3, x3;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  186) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  187) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  188)  * Size optimization... with inlined roundsm16, binary would be over 5 times
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  189)  * larger and would only be 0.5% faster (on sandy-bridge).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  190)  */
/*
 * Internal (non-C-ABI) helper wrapping one roundsm16 invocation, kept
 * out-of-line purely for code size (see comment above).
 *   in:  %xmm0..%xmm7 = byte-sliced AB state,
 *        %rcx = pointer to in-memory CD state,
 *        %r9  = pointer to the 64-bit round subkey
 *   out: %xmm0..%xmm7 = new byte-sliced CD state
 *   clobbers: %xmm8..%xmm15
 * NOTE(review): bare `ret` matches this 5.10-era tree; current upstream
 * uses the RET macro (return-thunk aware) from asm/nospec-branch.h.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  191) .align 8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  192) SYM_FUNC_START_LOCAL(roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  193) 	roundsm16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  194) 		  %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  195) 		  %rcx, (%r9));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  196) 	ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  197) SYM_FUNC_END(roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  198) 
/*
 * Internal (non-C-ABI) helper: same as the _cd variant above but with the
 * register halves swapped (x4..x7/x0..x3) so it transforms the other half
 * of the Feistel state.
 *   in:  %xmm0..%xmm7 = byte-sliced state, %rax = pointer to in-memory
 *        destination state, %r9 = pointer to the 64-bit round subkey
 *   out: %xmm0..%xmm7 updated
 *   clobbers: %xmm8..%xmm15
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  199) .align 8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  200) SYM_FUNC_START_LOCAL(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  201) 	roundsm16(%xmm4, %xmm5, %xmm6, %xmm7, %xmm0, %xmm1, %xmm2, %xmm3,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  202) 		  %xmm12, %xmm13, %xmm14, %xmm15, %xmm8, %xmm9, %xmm10, %xmm11,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  203) 		  %rax, (%r9));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  204) 	ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  205) SYM_FUNC_END(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  206) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  207) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  208)  * IN/OUT:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  209)  *  x0..x7: byte-sliced AB state preloaded
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  210)  *  mem_ab: byte-sliced AB state in memory
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  211)  *  mem_cd: byte-sliced CD state in memory
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  212)  */
/*
 * two_roundsm16: run two Camellia rounds (AB->CD then CD->AB) on the
 * 16-way byte-sliced state.  Loads the subkey pointer for round i into
 * %r9, calls the out-of-line _cd round helper, spills the new CD state
 * to mem_cd (note the x4..x7/x0..x3 ordering matching the helper's
 * swapped register roles), then runs the _ab helper with the key index
 * advanced by `dir` (+1 for encryption, -1 for decryption).  store_ab is
 * a macro callback (store_ab_state or dummy_store) deciding whether the
 * resulting AB state is written back to mem_ab.  Clobbers %r9.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  213) #define two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  214) 		      y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  215) 	leaq (key_table + (i) * 8)(CTX), %r9; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  216) 	call roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  217) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  218) 	vmovdqu x4, 0 * 16(mem_cd); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  219) 	vmovdqu x5, 1 * 16(mem_cd); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  220) 	vmovdqu x6, 2 * 16(mem_cd); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  221) 	vmovdqu x7, 3 * 16(mem_cd); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  222) 	vmovdqu x0, 4 * 16(mem_cd); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  223) 	vmovdqu x1, 5 * 16(mem_cd); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  224) 	vmovdqu x2, 6 * 16(mem_cd); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  225) 	vmovdqu x3, 7 * 16(mem_cd); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  226) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  227) 	leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  228) 	call roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  229) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  230) 	store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  231) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  232) #define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  233) 
/*
 * store_ab callback for two_roundsm16: spill the eight 16-byte AB state
 * slices x0..x7 to consecutive slots at mem_ab (unaligned stores).
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  234) #define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  235) 	/* Store new AB state */ \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  236) 	vmovdqu x0, 0 * 16(mem_ab); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  237) 	vmovdqu x1, 1 * 16(mem_ab); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  238) 	vmovdqu x2, 2 * 16(mem_ab); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  239) 	vmovdqu x3, 3 * 16(mem_ab); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  240) 	vmovdqu x4, 4 * 16(mem_ab); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  241) 	vmovdqu x5, 5 * 16(mem_ab); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  242) 	vmovdqu x6, 6 * 16(mem_ab); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  243) 	vmovdqu x7, 7 * 16(mem_ab);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  244) 
/*
 * enc_rounds16: six encryption rounds (three two_roundsm16 double-rounds)
 * with ascending subkey indices i+2, i+4, i+6 and dir = +1.  The final
 * double-round uses dummy_store: the AB result stays in registers only.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  245) #define enc_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  246) 		      y6, y7, mem_ab, mem_cd, i) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  247) 	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  248) 		      y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  249) 	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  250) 		      y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  251) 	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  252) 		      y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  253) 
/*
 * dec_rounds16: six decryption rounds — mirror of enc_rounds16 with
 * descending subkey indices i+7, i+5, i+3 and dir = -1; the last
 * double-round again skips the AB write-back via dummy_store.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  254) #define dec_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  255) 		      y6, y7, mem_ab, mem_cd, i) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  256) 	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  257) 		      y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  258) 	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  259) 		      y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  260) 	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  261) 		      y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  262) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  263) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  264)  * IN:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  265)  *  v0..3: byte-sliced 32-bit integers
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  266)  * OUT:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  267)  *  v0..3: (IN <<< 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  268)  */
/*
 * rol32_1_16: rotate byte-sliced 32-bit values left by 1 bit.
 * Per byte lane of each slice: vpcmpgtb against `zero` yields 0xff where
 * the byte's MSB is set, vpabsb normalizes that to 0x01, and vpaddb
 * doubles the byte (logical shift left by 1).  Each extracted carry bit
 * is then OR-ed into the next-more-significant slice (v0->v1, v1->v2,
 * v2->v3, v3 wraps into v0), completing the 32-bit rotate.
 * `zero` must hold all-zero bytes; t0..t2 are clobbered.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  269) #define rol32_1_16(v0, v1, v2, v3, t0, t1, t2, zero) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  270) 	vpcmpgtb v0, zero, t0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  271) 	vpaddb v0, v0, v0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  272) 	vpabsb t0, t0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  273) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  274) 	vpcmpgtb v1, zero, t1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  275) 	vpaddb v1, v1, v1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  276) 	vpabsb t1, t1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  277) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  278) 	vpcmpgtb v2, zero, t2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  279) 	vpaddb v2, v2, v2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  280) 	vpabsb t2, t2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  281) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  282) 	vpor t0, v1, v1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  283) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  284) 	vpcmpgtb v3, zero, t0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  285) 	vpaddb v3, v3, v3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  286) 	vpabsb t0, t0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  287) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  288) 	vpor t1, v2, v2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  289) 	vpor t2, v3, v3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  290) 	vpor t0, v0, v0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  291) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  292) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  293)  * IN:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  294)  *   r: byte-sliced AB state in memory
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  295)  *   l: byte-sliced CD state in memory
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  296)  * OUT:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  297)  *   x0..x7: new byte-sliced CD state
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  298)  */
/*
 * fls16: Camellia FL/FL^-1 layer for 16 byte-sliced blocks.
 * l and r point to the two 8 x 16-byte state halves in memory; l0..l7
 * cache the l-half slices in registers (updated in place and written
 * back).  kll/klr/krl/krr are 32-bit subkey operands, each loaded with
 * vmovd and byte-splatted across a slice via vpsrldq + vpshufb against
 * the zeroed tt0.  Per the inline pseudo-code below this computes:
 *   lr ^= rol32(kll & ll, 1);   rl ^= (krr | rr);
 *   rr ^= rol32(krl & rl, 1);   ll ^= (klr | lr);
 * t0..t3 and tt0..tt3 are clobbered; all results are stored to memory.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  299) #define fls16(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  300) 	      tt1, tt2, tt3, kll, klr, krl, krr) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  301) 	/* \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  302) 	 * t0 = kll; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  303) 	 * t0 &= ll; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  304) 	 * lr ^= rol32(t0, 1); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  305) 	 */ \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  306) 	vpxor tt0, tt0, tt0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  307) 	vmovd kll, t0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  308) 	vpshufb tt0, t0, t3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  309) 	vpsrldq $1, t0, t0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  310) 	vpshufb tt0, t0, t2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  311) 	vpsrldq $1, t0, t0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  312) 	vpshufb tt0, t0, t1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  313) 	vpsrldq $1, t0, t0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  314) 	vpshufb tt0, t0, t0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  315) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  316) 	vpand l0, t0, t0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  317) 	vpand l1, t1, t1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  318) 	vpand l2, t2, t2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  319) 	vpand l3, t3, t3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  320) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  321) 	rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  322) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  323) 	vpxor l4, t0, l4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  324) 	vmovdqu l4, 4 * 16(l); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  325) 	vpxor l5, t1, l5; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  326) 	vmovdqu l5, 5 * 16(l); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  327) 	vpxor l6, t2, l6; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  328) 	vmovdqu l6, 6 * 16(l); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  329) 	vpxor l7, t3, l7; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  330) 	vmovdqu l7, 7 * 16(l); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  331) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  332) 	/* \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  333) 	 * t2 = krr; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  334) 	 * t2 |= rr; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  335) 	 * rl ^= t2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  336) 	 */ \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  337) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  338) 	vmovd krr, t0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  339) 	vpshufb tt0, t0, t3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  340) 	vpsrldq $1, t0, t0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  341) 	vpshufb tt0, t0, t2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  342) 	vpsrldq $1, t0, t0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  343) 	vpshufb tt0, t0, t1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  344) 	vpsrldq $1, t0, t0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  345) 	vpshufb tt0, t0, t0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  346) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  347) 	vpor 4 * 16(r), t0, t0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  348) 	vpor 5 * 16(r), t1, t1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  349) 	vpor 6 * 16(r), t2, t2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  350) 	vpor 7 * 16(r), t3, t3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  351) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  352) 	vpxor 0 * 16(r), t0, t0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  353) 	vpxor 1 * 16(r), t1, t1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  354) 	vpxor 2 * 16(r), t2, t2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  355) 	vpxor 3 * 16(r), t3, t3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  356) 	vmovdqu t0, 0 * 16(r); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  357) 	vmovdqu t1, 1 * 16(r); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  358) 	vmovdqu t2, 2 * 16(r); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  359) 	vmovdqu t3, 3 * 16(r); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  360) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  361) 	/* \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  362) 	 * t2 = krl; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  363) 	 * t2 &= rl; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  364) 	 * rr ^= rol32(t2, 1); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  365) 	 */ \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  366) 	vmovd krl, t0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  367) 	vpshufb tt0, t0, t3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  368) 	vpsrldq $1, t0, t0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  369) 	vpshufb tt0, t0, t2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  370) 	vpsrldq $1, t0, t0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  371) 	vpshufb tt0, t0, t1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  372) 	vpsrldq $1, t0, t0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  373) 	vpshufb tt0, t0, t0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  374) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  375) 	vpand 0 * 16(r), t0, t0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  376) 	vpand 1 * 16(r), t1, t1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  377) 	vpand 2 * 16(r), t2, t2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  378) 	vpand 3 * 16(r), t3, t3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  379) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  380) 	rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  381) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  382) 	vpxor 4 * 16(r), t0, t0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  383) 	vpxor 5 * 16(r), t1, t1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  384) 	vpxor 6 * 16(r), t2, t2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  385) 	vpxor 7 * 16(r), t3, t3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  386) 	vmovdqu t0, 4 * 16(r); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  387) 	vmovdqu t1, 5 * 16(r); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  388) 	vmovdqu t2, 6 * 16(r); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  389) 	vmovdqu t3, 7 * 16(r); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  390) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  391) 	/* \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  392) 	 * t0 = klr; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  393) 	 * t0 |= lr; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  394) 	 * ll ^= t0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  395) 	 */ \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  396) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  397) 	vmovd klr, t0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  398) 	vpshufb tt0, t0, t3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  399) 	vpsrldq $1, t0, t0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  400) 	vpshufb tt0, t0, t2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  401) 	vpsrldq $1, t0, t0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  402) 	vpshufb tt0, t0, t1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  403) 	vpsrldq $1, t0, t0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  404) 	vpshufb tt0, t0, t0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  405) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  406) 	vpor l4, t0, t0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  407) 	vpor l5, t1, t1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  408) 	vpor l6, t2, t2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  409) 	vpor l7, t3, t3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  410) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  411) 	vpxor l0, t0, l0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  412) 	vmovdqu l0, 0 * 16(l); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  413) 	vpxor l1, t1, l1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  414) 	vmovdqu l1, 1 * 16(l); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  415) 	vpxor l2, t2, l2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  416) 	vmovdqu l2, 2 * 16(l); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  417) 	vpxor l3, t3, l3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  418) 	vmovdqu l3, 3 * 16(l);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  419) 
/*
 * transpose_4x4: transpose a 4x4 matrix of 32-bit elements held in
 * x0..x3 (one row per register), in place, using the standard
 * unpack-dword / unpack-qword two-stage pattern.  t1 and t2 are scratch.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  420) #define transpose_4x4(x0, x1, x2, x3, t1, t2) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  421) 	vpunpckhdq x1, x0, t2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  422) 	vpunpckldq x1, x0, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  423) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  424) 	vpunpckldq x3, x2, t1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  425) 	vpunpckhdq x3, x2, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  426) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  427) 	vpunpckhqdq t1, x0, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  428) 	vpunpcklqdq t1, x0, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  429) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  430) 	vpunpckhqdq x2, t2, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  431) 	vpunpcklqdq x2, t2, x2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  432) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  433) #define byteslice_16x16b(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  434) 			 b3, c3, d3, st0, st1) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  435) 	vmovdqu d2, st0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  436) 	vmovdqu d3, st1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  437) 	transpose_4x4(a0, a1, a2, a3, d2, d3); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  438) 	transpose_4x4(b0, b1, b2, b3, d2, d3); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  439) 	vmovdqu st0, d2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  440) 	vmovdqu st1, d3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  441) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  442) 	vmovdqu a0, st0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  443) 	vmovdqu a1, st1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  444) 	transpose_4x4(c0, c1, c2, c3, a0, a1); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  445) 	transpose_4x4(d0, d1, d2, d3, a0, a1); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  446) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  447) 	vmovdqu .Lshufb_16x16b, a0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  448) 	vmovdqu st1, a1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  449) 	vpshufb a0, a2, a2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  450) 	vpshufb a0, a3, a3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  451) 	vpshufb a0, b0, b0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  452) 	vpshufb a0, b1, b1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  453) 	vpshufb a0, b2, b2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  454) 	vpshufb a0, b3, b3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  455) 	vpshufb a0, a1, a1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  456) 	vpshufb a0, c0, c0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  457) 	vpshufb a0, c1, c1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  458) 	vpshufb a0, c2, c2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  459) 	vpshufb a0, c3, c3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  460) 	vpshufb a0, d0, d0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  461) 	vpshufb a0, d1, d1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  462) 	vpshufb a0, d2, d2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  463) 	vpshufb a0, d3, d3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  464) 	vmovdqu d3, st1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  465) 	vmovdqu st0, d3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  466) 	vpshufb a0, d3, a0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  467) 	vmovdqu d2, st0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  468) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  469) 	transpose_4x4(a0, b0, c0, d0, d2, d3); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  470) 	transpose_4x4(a1, b1, c1, d1, d2, d3); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  471) 	vmovdqu st0, d2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  472) 	vmovdqu st1, d3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  473) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  474) 	vmovdqu b0, st0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  475) 	vmovdqu b1, st1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  476) 	transpose_4x4(a2, b2, c2, d2, b0, b1); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  477) 	transpose_4x4(a3, b3, c3, d3, b0, b1); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  478) 	vmovdqu st0, b0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  479) 	vmovdqu st1, b1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  480) 	/* does not adjust output bytes inside vectors */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  481) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  482) /* load blocks to registers and apply pre-whitening */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  483) #define inpack16_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  484) 		     y6, y7, rio, key) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  485) 	vmovq key, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  486) 	vpshufb .Lpack_bswap, x0, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  487) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  488) 	vpxor 0 * 16(rio), x0, y7; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  489) 	vpxor 1 * 16(rio), x0, y6; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  490) 	vpxor 2 * 16(rio), x0, y5; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  491) 	vpxor 3 * 16(rio), x0, y4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  492) 	vpxor 4 * 16(rio), x0, y3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  493) 	vpxor 5 * 16(rio), x0, y2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  494) 	vpxor 6 * 16(rio), x0, y1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  495) 	vpxor 7 * 16(rio), x0, y0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  496) 	vpxor 8 * 16(rio), x0, x7; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  497) 	vpxor 9 * 16(rio), x0, x6; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  498) 	vpxor 10 * 16(rio), x0, x5; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  499) 	vpxor 11 * 16(rio), x0, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  500) 	vpxor 12 * 16(rio), x0, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  501) 	vpxor 13 * 16(rio), x0, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  502) 	vpxor 14 * 16(rio), x0, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  503) 	vpxor 15 * 16(rio), x0, x0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  504) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  505) /* byteslice pre-whitened blocks and store to temporary memory */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  506) #define inpack16_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  507) 		      y6, y7, mem_ab, mem_cd) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  508) 	byteslice_16x16b(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  509) 			 y5, y6, y7, (mem_ab), (mem_cd)); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  510) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  511) 	vmovdqu x0, 0 * 16(mem_ab); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  512) 	vmovdqu x1, 1 * 16(mem_ab); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  513) 	vmovdqu x2, 2 * 16(mem_ab); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  514) 	vmovdqu x3, 3 * 16(mem_ab); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  515) 	vmovdqu x4, 4 * 16(mem_ab); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  516) 	vmovdqu x5, 5 * 16(mem_ab); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  517) 	vmovdqu x6, 6 * 16(mem_ab); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  518) 	vmovdqu x7, 7 * 16(mem_ab); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  519) 	vmovdqu y0, 0 * 16(mem_cd); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  520) 	vmovdqu y1, 1 * 16(mem_cd); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  521) 	vmovdqu y2, 2 * 16(mem_cd); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  522) 	vmovdqu y3, 3 * 16(mem_cd); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  523) 	vmovdqu y4, 4 * 16(mem_cd); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  524) 	vmovdqu y5, 5 * 16(mem_cd); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  525) 	vmovdqu y6, 6 * 16(mem_cd); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  526) 	vmovdqu y7, 7 * 16(mem_cd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  527) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  528) /* de-byteslice, apply post-whitening and store blocks */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  529) #define outunpack16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  530) 		    y5, y6, y7, key, stack_tmp0, stack_tmp1) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  531) 	byteslice_16x16b(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, y3, \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  532) 			 y7, x3, x7, stack_tmp0, stack_tmp1); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  533) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  534) 	vmovdqu x0, stack_tmp0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  535) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  536) 	vmovq key, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  537) 	vpshufb .Lpack_bswap, x0, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  538) 	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  539) 	vpxor x0, y7, y7; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  540) 	vpxor x0, y6, y6; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  541) 	vpxor x0, y5, y5; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  542) 	vpxor x0, y4, y4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  543) 	vpxor x0, y3, y3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  544) 	vpxor x0, y2, y2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  545) 	vpxor x0, y1, y1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  546) 	vpxor x0, y0, y0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  547) 	vpxor x0, x7, x7; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  548) 	vpxor x0, x6, x6; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  549) 	vpxor x0, x5, x5; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  550) 	vpxor x0, x4, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  551) 	vpxor x0, x3, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  552) 	vpxor x0, x2, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  553) 	vpxor x0, x1, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  554) 	vpxor stack_tmp0, x0, x0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  555) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  556) #define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  557) 		     y6, y7, rio) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  558) 	vmovdqu x0, 0 * 16(rio); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  559) 	vmovdqu x1, 1 * 16(rio); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  560) 	vmovdqu x2, 2 * 16(rio); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  561) 	vmovdqu x3, 3 * 16(rio); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  562) 	vmovdqu x4, 4 * 16(rio); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  563) 	vmovdqu x5, 5 * 16(rio); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  564) 	vmovdqu x6, 6 * 16(rio); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  565) 	vmovdqu x7, 7 * 16(rio); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  566) 	vmovdqu y0, 8 * 16(rio); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  567) 	vmovdqu y1, 9 * 16(rio); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  568) 	vmovdqu y2, 10 * 16(rio); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  569) 	vmovdqu y3, 11 * 16(rio); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  570) 	vmovdqu y4, 12 * 16(rio); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  571) 	vmovdqu y5, 13 * 16(rio); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  572) 	vmovdqu y6, 14 * 16(rio); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  573) 	vmovdqu y7, 15 * 16(rio);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  574) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  575) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  576) /* NB: section is mergeable, all elements must be aligned 16-byte blocks */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  577) .section	.rodata.cst16, "aM", @progbits, 16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  578) .align 16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  579) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  580) #define SHUFB_BYTES(idx) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  581) 	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  582) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  583) .Lshufb_16x16b:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  584) 	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  585) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  586) .Lpack_bswap:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  587) 	.long 0x00010203
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  588) 	.long 0x04050607
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  589) 	.long 0x80808080
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  590) 	.long 0x80808080
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  591) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  592) /* For CTR-mode IV byteswap */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  593) .Lbswap128_mask:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  594) 	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  595) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  596) /* For XTS mode IV generation */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  597) .Lxts_gf128mul_and_shl1_mask:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  598) 	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  599) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  600) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  601)  * pre-SubByte transform
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  602)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  603)  * pre-lookup for sbox1, sbox2, sbox3:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  604)  *   swap_bitendianness(
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  605)  *       isom_map_camellia_to_aes(
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  606)  *           camellia_f(
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  607)  *               swap_bitendianess(in)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  608)  *           )
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  609)  *       )
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  610)  *   )
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  611)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  612)  * (note: '⊕ 0xc5' inside camellia_f())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  613)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  614) .Lpre_tf_lo_s1:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  615) 	.byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  616) 	.byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  617) .Lpre_tf_hi_s1:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  618) 	.byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  619) 	.byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  620) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  621) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  622)  * pre-SubByte transform
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  623)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  624)  * pre-lookup for sbox4:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  625)  *   swap_bitendianness(
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  626)  *       isom_map_camellia_to_aes(
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  627)  *           camellia_f(
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  628)  *               swap_bitendianess(in <<< 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  629)  *           )
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  630)  *       )
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  631)  *   )
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  632)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  633)  * (note: '⊕ 0xc5' inside camellia_f())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  634)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  635) .Lpre_tf_lo_s4:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  636) 	.byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  637) 	.byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  638) .Lpre_tf_hi_s4:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  639) 	.byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  640) 	.byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  641) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  642) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  643)  * post-SubByte transform
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  644)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  645)  * post-lookup for sbox1, sbox4:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  646)  *  swap_bitendianness(
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  647)  *      camellia_h(
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  648)  *          isom_map_aes_to_camellia(
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  649)  *              swap_bitendianness(
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  650)  *                  aes_inverse_affine_transform(in)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  651)  *              )
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  652)  *          )
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  653)  *      )
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  654)  *  )
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  655)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  656)  * (note: '⊕ 0x6e' inside camellia_h())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  657)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  658) .Lpost_tf_lo_s1:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  659) 	.byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  660) 	.byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  661) .Lpost_tf_hi_s1:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  662) 	.byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  663) 	.byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  664) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  665) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  666)  * post-SubByte transform
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  667)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  668)  * post-lookup for sbox2:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  669)  *  swap_bitendianness(
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  670)  *      camellia_h(
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  671)  *          isom_map_aes_to_camellia(
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  672)  *              swap_bitendianness(
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  673)  *                  aes_inverse_affine_transform(in)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  674)  *              )
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  675)  *          )
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  676)  *      )
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  677)  *  ) <<< 1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  678)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  679)  * (note: '⊕ 0x6e' inside camellia_h())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  680)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  681) .Lpost_tf_lo_s2:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  682) 	.byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  683) 	.byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  684) .Lpost_tf_hi_s2:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  685) 	.byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  686) 	.byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  687) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  688) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  689)  * post-SubByte transform
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  690)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  691)  * post-lookup for sbox3:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  692)  *  swap_bitendianness(
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  693)  *      camellia_h(
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  694)  *          isom_map_aes_to_camellia(
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  695)  *              swap_bitendianness(
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  696)  *                  aes_inverse_affine_transform(in)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  697)  *              )
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  698)  *          )
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  699)  *      )
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  700)  *  ) >>> 1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  701)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  702)  * (note: '⊕ 0x6e' inside camellia_h())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  703)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  704) .Lpost_tf_lo_s3:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  705) 	.byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  706) 	.byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  707) .Lpost_tf_hi_s3:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  708) 	.byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  709) 	.byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  710) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  711) /* For isolating SubBytes from AESENCLAST, inverse shift row */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  712) .Linv_shift_row:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  713) 	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  714) 	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  715) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  716) /* 4-bit mask */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  717) .section	.rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  718) .align 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  719) .L0f0f0f0f:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  720) 	.long 0x0f0f0f0f
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  721) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  722) .text
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  723) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  724) .align 8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  725) SYM_FUNC_START_LOCAL(__camellia_enc_blk16)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  726) 	/* input:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  727) 	 *	%rdi: ctx, CTX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  728) 	 *	%rax: temporary storage, 256 bytes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  729) 	 *	%xmm0..%xmm15: 16 plaintext blocks
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  730) 	 * output:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  731) 	 *	%xmm0..%xmm15: 16 encrypted blocks, order swapped:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  732) 	 *       7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  733) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  734) 	FRAME_BEGIN
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  735) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  736) 	leaq 8 * 16(%rax), %rcx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  737) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  738) 	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  739) 		      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  740) 		      %xmm15, %rax, %rcx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  741) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  742) 	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  743) 		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  744) 		     %xmm15, %rax, %rcx, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  745) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  746) 	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  747) 	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  748) 	      %xmm15,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  749) 	      ((key_table + (8) * 8) + 0)(CTX),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  750) 	      ((key_table + (8) * 8) + 4)(CTX),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  751) 	      ((key_table + (8) * 8) + 8)(CTX),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  752) 	      ((key_table + (8) * 8) + 12)(CTX));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  753) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  754) 	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  755) 		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  756) 		     %xmm15, %rax, %rcx, 8);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  757) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  758) 	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  759) 	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  760) 	      %xmm15,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  761) 	      ((key_table + (16) * 8) + 0)(CTX),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  762) 	      ((key_table + (16) * 8) + 4)(CTX),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  763) 	      ((key_table + (16) * 8) + 8)(CTX),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  764) 	      ((key_table + (16) * 8) + 12)(CTX));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  765) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  766) 	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  767) 		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  768) 		     %xmm15, %rax, %rcx, 16);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  769) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  770) 	movl $24, %r8d;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  771) 	cmpl $16, key_length(CTX);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  772) 	jne .Lenc_max32;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  773) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  774) .Lenc_done:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  775) 	/* load CD for output */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  776) 	vmovdqu 0 * 16(%rcx), %xmm8;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  777) 	vmovdqu 1 * 16(%rcx), %xmm9;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  778) 	vmovdqu 2 * 16(%rcx), %xmm10;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  779) 	vmovdqu 3 * 16(%rcx), %xmm11;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  780) 	vmovdqu 4 * 16(%rcx), %xmm12;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  781) 	vmovdqu 5 * 16(%rcx), %xmm13;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  782) 	vmovdqu 6 * 16(%rcx), %xmm14;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  783) 	vmovdqu 7 * 16(%rcx), %xmm15;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  784) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  785) 	outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  786) 		    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  787) 		    %xmm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 16(%rax));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  788) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  789) 	FRAME_END
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  790) 	ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  791) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  792) .align 8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  793) .Lenc_max32:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  794) 	movl $32, %r8d;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  795) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  796) 	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  797) 	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  798) 	      %xmm15,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  799) 	      ((key_table + (24) * 8) + 0)(CTX),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  800) 	      ((key_table + (24) * 8) + 4)(CTX),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  801) 	      ((key_table + (24) * 8) + 8)(CTX),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  802) 	      ((key_table + (24) * 8) + 12)(CTX));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  803) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  804) 	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  805) 		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  806) 		     %xmm15, %rax, %rcx, 24);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  807) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  808) 	jmp .Lenc_done;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  809) SYM_FUNC_END(__camellia_enc_blk16)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  810) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  811) .align 8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  812) SYM_FUNC_START_LOCAL(__camellia_dec_blk16)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  813) 	/* input:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  814) 	 *	%rdi: ctx, CTX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  815) 	 *	%rax: temporary storage, 256 bytes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  816) 	 *	%r8d: 24 for 16 byte key, 32 for larger
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  817) 	 *	%xmm0..%xmm15: 16 encrypted blocks
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  818) 	 * output:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  819) 	 *	%xmm0..%xmm15: 16 plaintext blocks, order swapped:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  820) 	 *       7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  821) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  822) 	FRAME_BEGIN
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  823) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  824) 	leaq 8 * 16(%rax), %rcx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  825) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  826) 	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  827) 		      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  828) 		      %xmm15, %rax, %rcx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  829) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  830) 	cmpl $32, %r8d;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  831) 	je .Ldec_max32;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  832) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  833) .Ldec_max24:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  834) 	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  835) 		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  836) 		     %xmm15, %rax, %rcx, 16);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  837) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  838) 	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  839) 	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  840) 	      %xmm15,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  841) 	      ((key_table + (16) * 8) + 8)(CTX),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  842) 	      ((key_table + (16) * 8) + 12)(CTX),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  843) 	      ((key_table + (16) * 8) + 0)(CTX),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  844) 	      ((key_table + (16) * 8) + 4)(CTX));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  845) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  846) 	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  847) 		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  848) 		     %xmm15, %rax, %rcx, 8);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  849) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  850) 	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  851) 	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  852) 	      %xmm15,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  853) 	      ((key_table + (8) * 8) + 8)(CTX),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  854) 	      ((key_table + (8) * 8) + 12)(CTX),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  855) 	      ((key_table + (8) * 8) + 0)(CTX),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  856) 	      ((key_table + (8) * 8) + 4)(CTX));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  857) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  858) 	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  859) 		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  860) 		     %xmm15, %rax, %rcx, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  861) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  862) 	/* load CD for output */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  863) 	vmovdqu 0 * 16(%rcx), %xmm8;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  864) 	vmovdqu 1 * 16(%rcx), %xmm9;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  865) 	vmovdqu 2 * 16(%rcx), %xmm10;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  866) 	vmovdqu 3 * 16(%rcx), %xmm11;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  867) 	vmovdqu 4 * 16(%rcx), %xmm12;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  868) 	vmovdqu 5 * 16(%rcx), %xmm13;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  869) 	vmovdqu 6 * 16(%rcx), %xmm14;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  870) 	vmovdqu 7 * 16(%rcx), %xmm15;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  871) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  872) 	outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  873) 		    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  874) 		    %xmm15, (key_table)(CTX), (%rax), 1 * 16(%rax));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  875) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  876) 	FRAME_END
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  877) 	ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  878) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  879) .align 8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  880) .Ldec_max32:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  881) 	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  882) 		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  883) 		     %xmm15, %rax, %rcx, 24);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  884) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  885) 	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  886) 	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  887) 	      %xmm15,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  888) 	      ((key_table + (24) * 8) + 8)(CTX),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  889) 	      ((key_table + (24) * 8) + 12)(CTX),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  890) 	      ((key_table + (24) * 8) + 0)(CTX),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  891) 	      ((key_table + (24) * 8) + 4)(CTX));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  892) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  893) 	jmp .Ldec_max24;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  894) SYM_FUNC_END(__camellia_dec_blk16)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  895) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  896) SYM_FUNC_START(camellia_ecb_enc_16way)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  897) 	/* input:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  898) 	 *	%rdi: ctx, CTX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  899) 	 *	%rsi: dst (16 blocks)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  900) 	 *	%rdx: src (16 blocks)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  901) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  902) 	 FRAME_BEGIN
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  903) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  904) 	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  905) 		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  906) 		     %xmm15, %rdx, (key_table)(CTX));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  907) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  908) 	/* now dst can be used as temporary buffer (even in src == dst case) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  909) 	movq	%rsi, %rax;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  910) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  911) 	call __camellia_enc_blk16;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  912) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  913) 	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  914) 		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  915) 		     %xmm8, %rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  916) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  917) 	FRAME_END
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  918) 	ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  919) SYM_FUNC_END(camellia_ecb_enc_16way)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  920) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  921) SYM_FUNC_START(camellia_ecb_dec_16way)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  922) 	/* input:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  923) 	 *	%rdi: ctx, CTX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  924) 	 *	%rsi: dst (16 blocks)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  925) 	 *	%rdx: src (16 blocks)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  926) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  927) 	 FRAME_BEGIN
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  928) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  929) 	cmpl $16, key_length(CTX);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  930) 	movl $32, %r8d;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  931) 	movl $24, %eax;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  932) 	cmovel %eax, %r8d; /* max */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  933) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  934) 	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  935) 		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  936) 		     %xmm15, %rdx, (key_table)(CTX, %r8, 8));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  937) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  938) 	/* now dst can be used as temporary buffer (even in src == dst case) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  939) 	movq	%rsi, %rax;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  940) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  941) 	call __camellia_dec_blk16;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  942) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  943) 	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  944) 		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  945) 		     %xmm8, %rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  946) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  947) 	FRAME_END
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  948) 	ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  949) SYM_FUNC_END(camellia_ecb_dec_16way)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  950) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  951) SYM_FUNC_START(camellia_cbc_dec_16way)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  952) 	/* input:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  953) 	 *	%rdi: ctx, CTX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  954) 	 *	%rsi: dst (16 blocks)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  955) 	 *	%rdx: src (16 blocks)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  956) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  957) 	FRAME_BEGIN
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  958) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  959) 	cmpl $16, key_length(CTX);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  960) 	movl $32, %r8d;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  961) 	movl $24, %eax;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  962) 	cmovel %eax, %r8d; /* max */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  963) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  964) 	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  965) 		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  966) 		     %xmm15, %rdx, (key_table)(CTX, %r8, 8));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  967) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  968) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  969) 	 * dst might still be in-use (in case dst == src), so use stack for
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  970) 	 * temporary storage.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  971) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  972) 	subq $(16 * 16), %rsp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  973) 	movq %rsp, %rax;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  974) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  975) 	call __camellia_dec_blk16;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  976) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  977) 	addq $(16 * 16), %rsp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  978) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  979) 	vpxor (0 * 16)(%rdx), %xmm6, %xmm6;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  980) 	vpxor (1 * 16)(%rdx), %xmm5, %xmm5;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  981) 	vpxor (2 * 16)(%rdx), %xmm4, %xmm4;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  982) 	vpxor (3 * 16)(%rdx), %xmm3, %xmm3;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  983) 	vpxor (4 * 16)(%rdx), %xmm2, %xmm2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  984) 	vpxor (5 * 16)(%rdx), %xmm1, %xmm1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  985) 	vpxor (6 * 16)(%rdx), %xmm0, %xmm0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  986) 	vpxor (7 * 16)(%rdx), %xmm15, %xmm15;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  987) 	vpxor (8 * 16)(%rdx), %xmm14, %xmm14;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  988) 	vpxor (9 * 16)(%rdx), %xmm13, %xmm13;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  989) 	vpxor (10 * 16)(%rdx), %xmm12, %xmm12;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  990) 	vpxor (11 * 16)(%rdx), %xmm11, %xmm11;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  991) 	vpxor (12 * 16)(%rdx), %xmm10, %xmm10;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  992) 	vpxor (13 * 16)(%rdx), %xmm9, %xmm9;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  993) 	vpxor (14 * 16)(%rdx), %xmm8, %xmm8;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  994) 	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  995) 		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  996) 		     %xmm8, %rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  997) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  998) 	FRAME_END
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  999) 	ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1000) SYM_FUNC_END(camellia_cbc_dec_16way)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1001) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1002) #define inc_le128(x, minus_one, tmp) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1003) 	vpcmpeqq minus_one, x, tmp; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1004) 	vpsubq minus_one, x, x; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1005) 	vpslldq $8, tmp, tmp; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1006) 	vpsubq tmp, x, x;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1007) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1008) SYM_FUNC_START(camellia_ctr_16way)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1009) 	/* input:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1010) 	 *	%rdi: ctx, CTX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1011) 	 *	%rsi: dst (16 blocks)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1012) 	 *	%rdx: src (16 blocks)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1013) 	 *	%rcx: iv (little endian, 128bit)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1014) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1015) 	FRAME_BEGIN
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1016) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1017) 	subq $(16 * 16), %rsp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1018) 	movq %rsp, %rax;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1019) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1020) 	vmovdqa .Lbswap128_mask, %xmm14;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1021) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1022) 	/* load IV and byteswap */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1023) 	vmovdqu (%rcx), %xmm0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1024) 	vpshufb %xmm14, %xmm0, %xmm15;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1025) 	vmovdqu %xmm15, 15 * 16(%rax);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1026) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1027) 	vpcmpeqd %xmm15, %xmm15, %xmm15;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1028) 	vpsrldq $8, %xmm15, %xmm15; /* low: -1, high: 0 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1029) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1030) 	/* construct IVs */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1031) 	inc_le128(%xmm0, %xmm15, %xmm13);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1032) 	vpshufb %xmm14, %xmm0, %xmm13;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1033) 	vmovdqu %xmm13, 14 * 16(%rax);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1034) 	inc_le128(%xmm0, %xmm15, %xmm13);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1035) 	vpshufb %xmm14, %xmm0, %xmm13;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1036) 	vmovdqu %xmm13, 13 * 16(%rax);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1037) 	inc_le128(%xmm0, %xmm15, %xmm13);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1038) 	vpshufb %xmm14, %xmm0, %xmm12;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1039) 	inc_le128(%xmm0, %xmm15, %xmm13);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1040) 	vpshufb %xmm14, %xmm0, %xmm11;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1041) 	inc_le128(%xmm0, %xmm15, %xmm13);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1042) 	vpshufb %xmm14, %xmm0, %xmm10;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1043) 	inc_le128(%xmm0, %xmm15, %xmm13);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1044) 	vpshufb %xmm14, %xmm0, %xmm9;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1045) 	inc_le128(%xmm0, %xmm15, %xmm13);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1046) 	vpshufb %xmm14, %xmm0, %xmm8;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1047) 	inc_le128(%xmm0, %xmm15, %xmm13);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1048) 	vpshufb %xmm14, %xmm0, %xmm7;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1049) 	inc_le128(%xmm0, %xmm15, %xmm13);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1050) 	vpshufb %xmm14, %xmm0, %xmm6;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1051) 	inc_le128(%xmm0, %xmm15, %xmm13);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1052) 	vpshufb %xmm14, %xmm0, %xmm5;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1053) 	inc_le128(%xmm0, %xmm15, %xmm13);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1054) 	vpshufb %xmm14, %xmm0, %xmm4;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1055) 	inc_le128(%xmm0, %xmm15, %xmm13);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1056) 	vpshufb %xmm14, %xmm0, %xmm3;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1057) 	inc_le128(%xmm0, %xmm15, %xmm13);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1058) 	vpshufb %xmm14, %xmm0, %xmm2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1059) 	inc_le128(%xmm0, %xmm15, %xmm13);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1060) 	vpshufb %xmm14, %xmm0, %xmm1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1061) 	inc_le128(%xmm0, %xmm15, %xmm13);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1062) 	vmovdqa %xmm0, %xmm13;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1063) 	vpshufb %xmm14, %xmm0, %xmm0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1064) 	inc_le128(%xmm13, %xmm15, %xmm14);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1065) 	vmovdqu %xmm13, (%rcx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1066) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1067) 	/* inpack16_pre: */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1068) 	vmovq (key_table)(CTX), %xmm15;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1069) 	vpshufb .Lpack_bswap, %xmm15, %xmm15;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1070) 	vpxor %xmm0, %xmm15, %xmm0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1071) 	vpxor %xmm1, %xmm15, %xmm1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1072) 	vpxor %xmm2, %xmm15, %xmm2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1073) 	vpxor %xmm3, %xmm15, %xmm3;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1074) 	vpxor %xmm4, %xmm15, %xmm4;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1075) 	vpxor %xmm5, %xmm15, %xmm5;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1076) 	vpxor %xmm6, %xmm15, %xmm6;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1077) 	vpxor %xmm7, %xmm15, %xmm7;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1078) 	vpxor %xmm8, %xmm15, %xmm8;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1079) 	vpxor %xmm9, %xmm15, %xmm9;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1080) 	vpxor %xmm10, %xmm15, %xmm10;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1081) 	vpxor %xmm11, %xmm15, %xmm11;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1082) 	vpxor %xmm12, %xmm15, %xmm12;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1083) 	vpxor 13 * 16(%rax), %xmm15, %xmm13;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1084) 	vpxor 14 * 16(%rax), %xmm15, %xmm14;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1085) 	vpxor 15 * 16(%rax), %xmm15, %xmm15;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1086) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1087) 	call __camellia_enc_blk16;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1088) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1089) 	addq $(16 * 16), %rsp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1090) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1091) 	vpxor 0 * 16(%rdx), %xmm7, %xmm7;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1092) 	vpxor 1 * 16(%rdx), %xmm6, %xmm6;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1093) 	vpxor 2 * 16(%rdx), %xmm5, %xmm5;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1094) 	vpxor 3 * 16(%rdx), %xmm4, %xmm4;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1095) 	vpxor 4 * 16(%rdx), %xmm3, %xmm3;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1096) 	vpxor 5 * 16(%rdx), %xmm2, %xmm2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1097) 	vpxor 6 * 16(%rdx), %xmm1, %xmm1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1098) 	vpxor 7 * 16(%rdx), %xmm0, %xmm0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1099) 	vpxor 8 * 16(%rdx), %xmm15, %xmm15;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1100) 	vpxor 9 * 16(%rdx), %xmm14, %xmm14;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1101) 	vpxor 10 * 16(%rdx), %xmm13, %xmm13;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1102) 	vpxor 11 * 16(%rdx), %xmm12, %xmm12;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1103) 	vpxor 12 * 16(%rdx), %xmm11, %xmm11;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1104) 	vpxor 13 * 16(%rdx), %xmm10, %xmm10;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1105) 	vpxor 14 * 16(%rdx), %xmm9, %xmm9;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1106) 	vpxor 15 * 16(%rdx), %xmm8, %xmm8;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1107) 	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1108) 		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1109) 		     %xmm8, %rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1110) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1111) 	FRAME_END
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1112) 	ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1113) SYM_FUNC_END(camellia_ctr_16way)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1114) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1115) #define gf128mul_x_ble(iv, mask, tmp) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1116) 	vpsrad $31, iv, tmp; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1117) 	vpaddq iv, iv, iv; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1118) 	vpshufd $0x13, tmp, tmp; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1119) 	vpand mask, tmp, tmp; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1120) 	vpxor tmp, iv, iv;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1121) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1122) .align 8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1123) SYM_FUNC_START_LOCAL(camellia_xts_crypt_16way)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1124) 	/* input:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1125) 	 *	%rdi: ctx, CTX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1126) 	 *	%rsi: dst (16 blocks)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1127) 	 *	%rdx: src (16 blocks)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1128) 	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1129) 	 *	%r8: index for input whitening key
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1130) 	 *	%r9: pointer to  __camellia_enc_blk16 or __camellia_dec_blk16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1131) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1132) 	FRAME_BEGIN
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1133) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1134) 	subq $(16 * 16), %rsp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1135) 	movq %rsp, %rax;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1136) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1137) 	vmovdqa .Lxts_gf128mul_and_shl1_mask, %xmm14;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1138) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1139) 	/* load IV */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1140) 	vmovdqu (%rcx), %xmm0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1141) 	vpxor 0 * 16(%rdx), %xmm0, %xmm15;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1142) 	vmovdqu %xmm15, 15 * 16(%rax);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1143) 	vmovdqu %xmm0, 0 * 16(%rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1144) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1145) 	/* construct IVs */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1146) 	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1147) 	vpxor 1 * 16(%rdx), %xmm0, %xmm15;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1148) 	vmovdqu %xmm15, 14 * 16(%rax);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1149) 	vmovdqu %xmm0, 1 * 16(%rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1150) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1151) 	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1152) 	vpxor 2 * 16(%rdx), %xmm0, %xmm13;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1153) 	vmovdqu %xmm0, 2 * 16(%rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1154) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1155) 	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1156) 	vpxor 3 * 16(%rdx), %xmm0, %xmm12;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1157) 	vmovdqu %xmm0, 3 * 16(%rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1158) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1159) 	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1160) 	vpxor 4 * 16(%rdx), %xmm0, %xmm11;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1161) 	vmovdqu %xmm0, 4 * 16(%rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1162) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1163) 	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1164) 	vpxor 5 * 16(%rdx), %xmm0, %xmm10;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1165) 	vmovdqu %xmm0, 5 * 16(%rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1166) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1167) 	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1168) 	vpxor 6 * 16(%rdx), %xmm0, %xmm9;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1169) 	vmovdqu %xmm0, 6 * 16(%rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1170) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1171) 	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1172) 	vpxor 7 * 16(%rdx), %xmm0, %xmm8;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1173) 	vmovdqu %xmm0, 7 * 16(%rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1174) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1175) 	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1176) 	vpxor 8 * 16(%rdx), %xmm0, %xmm7;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1177) 	vmovdqu %xmm0, 8 * 16(%rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1178) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1179) 	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1180) 	vpxor 9 * 16(%rdx), %xmm0, %xmm6;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1181) 	vmovdqu %xmm0, 9 * 16(%rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1182) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1183) 	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1184) 	vpxor 10 * 16(%rdx), %xmm0, %xmm5;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1185) 	vmovdqu %xmm0, 10 * 16(%rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1186) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1187) 	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1188) 	vpxor 11 * 16(%rdx), %xmm0, %xmm4;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1189) 	vmovdqu %xmm0, 11 * 16(%rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1190) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1191) 	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1192) 	vpxor 12 * 16(%rdx), %xmm0, %xmm3;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1193) 	vmovdqu %xmm0, 12 * 16(%rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1194) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1195) 	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1196) 	vpxor 13 * 16(%rdx), %xmm0, %xmm2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1197) 	vmovdqu %xmm0, 13 * 16(%rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1198) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1199) 	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1200) 	vpxor 14 * 16(%rdx), %xmm0, %xmm1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1201) 	vmovdqu %xmm0, 14 * 16(%rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1202) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1203) 	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1204) 	vpxor 15 * 16(%rdx), %xmm0, %xmm15;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1205) 	vmovdqu %xmm15, 0 * 16(%rax);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1206) 	vmovdqu %xmm0, 15 * 16(%rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1207) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1208) 	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1209) 	vmovdqu %xmm0, (%rcx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1210) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1211) 	/* inpack16_pre: */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1212) 	vmovq (key_table)(CTX, %r8, 8), %xmm15;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1213) 	vpshufb .Lpack_bswap, %xmm15, %xmm15;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1214) 	vpxor 0 * 16(%rax), %xmm15, %xmm0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1215) 	vpxor %xmm1, %xmm15, %xmm1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1216) 	vpxor %xmm2, %xmm15, %xmm2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1217) 	vpxor %xmm3, %xmm15, %xmm3;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1218) 	vpxor %xmm4, %xmm15, %xmm4;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1219) 	vpxor %xmm5, %xmm15, %xmm5;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1220) 	vpxor %xmm6, %xmm15, %xmm6;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1221) 	vpxor %xmm7, %xmm15, %xmm7;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1222) 	vpxor %xmm8, %xmm15, %xmm8;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1223) 	vpxor %xmm9, %xmm15, %xmm9;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1224) 	vpxor %xmm10, %xmm15, %xmm10;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1225) 	vpxor %xmm11, %xmm15, %xmm11;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1226) 	vpxor %xmm12, %xmm15, %xmm12;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1227) 	vpxor %xmm13, %xmm15, %xmm13;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1228) 	vpxor 14 * 16(%rax), %xmm15, %xmm14;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1229) 	vpxor 15 * 16(%rax), %xmm15, %xmm15;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1230) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1231) 	CALL_NOSPEC r9;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1232) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1233) 	addq $(16 * 16), %rsp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1234) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1235) 	vpxor 0 * 16(%rsi), %xmm7, %xmm7;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1236) 	vpxor 1 * 16(%rsi), %xmm6, %xmm6;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1237) 	vpxor 2 * 16(%rsi), %xmm5, %xmm5;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1238) 	vpxor 3 * 16(%rsi), %xmm4, %xmm4;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1239) 	vpxor 4 * 16(%rsi), %xmm3, %xmm3;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1240) 	vpxor 5 * 16(%rsi), %xmm2, %xmm2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1241) 	vpxor 6 * 16(%rsi), %xmm1, %xmm1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1242) 	vpxor 7 * 16(%rsi), %xmm0, %xmm0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1243) 	vpxor 8 * 16(%rsi), %xmm15, %xmm15;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1244) 	vpxor 9 * 16(%rsi), %xmm14, %xmm14;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1245) 	vpxor 10 * 16(%rsi), %xmm13, %xmm13;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1246) 	vpxor 11 * 16(%rsi), %xmm12, %xmm12;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1247) 	vpxor 12 * 16(%rsi), %xmm11, %xmm11;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1248) 	vpxor 13 * 16(%rsi), %xmm10, %xmm10;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1249) 	vpxor 14 * 16(%rsi), %xmm9, %xmm9;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1250) 	vpxor 15 * 16(%rsi), %xmm8, %xmm8;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1251) 	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1252) 		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1253) 		     %xmm8, %rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1254) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1255) 	FRAME_END
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1256) 	ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1257) SYM_FUNC_END(camellia_xts_crypt_16way)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1258) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1259) SYM_FUNC_START(camellia_xts_enc_16way)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1260) 	/* input:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1261) 	 *	%rdi: ctx, CTX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1262) 	 *	%rsi: dst (16 blocks)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1263) 	 *	%rdx: src (16 blocks)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1264) 	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1265) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1266) 	xorl %r8d, %r8d; /* input whitening key, 0 for enc */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1267) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1268) 	leaq __camellia_enc_blk16, %r9;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1269) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1270) 	jmp camellia_xts_crypt_16way;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1271) SYM_FUNC_END(camellia_xts_enc_16way)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1272) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1273) SYM_FUNC_START(camellia_xts_dec_16way)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1274) 	/* input:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1275) 	 *	%rdi: ctx, CTX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1276) 	 *	%rsi: dst (16 blocks)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1277) 	 *	%rdx: src (16 blocks)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1278) 	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1279) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1280) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1281) 	cmpl $16, key_length(CTX);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1282) 	movl $32, %r8d;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1283) 	movl $24, %eax;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1284) 	cmovel %eax, %r8d;  /* input whitening key, last for dec */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1285) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1286) 	leaq __camellia_dec_blk16, %r9;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1287) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1288) 	jmp camellia_xts_crypt_16way;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1289) SYM_FUNC_END(camellia_xts_dec_16way)