/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * x86_64/AVX2/AES-NI assembler implementation of Camellia
 *
 * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include <asm/nospec-branch.h>

#define CAMELLIA_TABLE_BYTE_LEN 272

/* struct camellia_ctx: */
#define key_table 0
#define key_length CAMELLIA_TABLE_BYTE_LEN

/* register macros */
#define CTX %rdi
#define RIO %r8

/**********************************************************************
  helper macros
 **********************************************************************/
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
	vpand x, mask4bit, tmp0; \
	vpandn x, mask4bit, x; \
	vpsrld $4, x, x; \
	\
	vpshufb tmp0, lo_t, tmp0; \
	vpshufb x, hi_t, x; \
	vpxor tmp0, x, x;
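
/*
 * filter_8bit() computes a byte->byte mapping of the form
 * f(x) = g(x & 0x0f) ^ h(x >> 4) by combining two 16-entry vpshufb
 * table lookups.  Rough per-byte C sketch (illustrative only; lo_t and
 * hi_t hold the 16-entry tables, replicated into each 128-bit lane):
 *
 *	out = lo_t[in & 0x0f] ^ hi_t[(in >> 4) & 0x0f];
 *
 * Every affine transform over GF(2) splits this way, which is what the
 * pre-/post-S-box filter tables below rely on.
 */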

#define ymm0_x xmm0
#define ymm1_x xmm1
#define ymm2_x xmm2
#define ymm3_x xmm3
#define ymm4_x xmm4
#define ymm5_x xmm5
#define ymm6_x xmm6
#define ymm7_x xmm7
#define ymm8_x xmm8
#define ymm9_x xmm9
#define ymm10_x xmm10
#define ymm11_x xmm11
#define ymm12_x xmm12
#define ymm13_x xmm13
#define ymm14_x xmm14
#define ymm15_x xmm15
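
/*
 * The ymmN_x aliases name the low 128-bit (xmm) half of each ymm
 * register; roundsm32() reaches them through 'reg##_x' token pasting.
 * They are needed because the vaesenclast used here is a 128-bit AES-NI
 * instruction, so each 256-bit vector is run through AES as two halves,
 * bracketed by vextracti128/vinserti128.
 */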

/**********************************************************************
  32-way camellia
 **********************************************************************/

/*
 * IN:
 *  x0..x7: byte-sliced AB state
 *  mem_cd: register pointer storing CD state
 *  key: index for key material
 * OUT:
 *  x0..x7: new byte-sliced CD state
 */
#define roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
		  t7, mem_cd, key) \
	/* \
	 * S-function with AES subbytes \
	 */ \
	vbroadcasti128 .Linv_shift_row, t4; \
	vpbroadcastd .L0f0f0f0f, t7; \
	vbroadcasti128 .Lpre_tf_lo_s1, t5; \
	vbroadcasti128 .Lpre_tf_hi_s1, t6; \
	vbroadcasti128 .Lpre_tf_lo_s4, t2; \
	vbroadcasti128 .Lpre_tf_hi_s4, t3; \
	\
	/* AES inverse shift rows */ \
	vpshufb t4, x0, x0; \
	vpshufb t4, x7, x7; \
	vpshufb t4, x3, x3; \
	vpshufb t4, x6, x6; \
	vpshufb t4, x2, x2; \
	vpshufb t4, x5, x5; \
	vpshufb t4, x1, x1; \
	vpshufb t4, x4, x4; \
	\
	/* prefilter sboxes 1, 2 and 3 */ \
	/* prefilter sbox 4 */ \
	filter_8bit(x0, t5, t6, t7, t4); \
	filter_8bit(x7, t5, t6, t7, t4); \
	vextracti128 $1, x0, t0##_x; \
	vextracti128 $1, x7, t1##_x; \
	filter_8bit(x3, t2, t3, t7, t4); \
	filter_8bit(x6, t2, t3, t7, t4); \
	vextracti128 $1, x3, t3##_x; \
	vextracti128 $1, x6, t2##_x; \
	filter_8bit(x2, t5, t6, t7, t4); \
	filter_8bit(x5, t5, t6, t7, t4); \
	filter_8bit(x1, t5, t6, t7, t4); \
	filter_8bit(x4, t5, t6, t7, t4); \
	\
	vpxor t4##_x, t4##_x, t4##_x; \
	\
	/* AES subbytes + AES shift rows */ \
	vextracti128 $1, x2, t6##_x; \
	vextracti128 $1, x5, t5##_x; \
	vaesenclast t4##_x, x0##_x, x0##_x; \
	vaesenclast t4##_x, t0##_x, t0##_x; \
	vinserti128 $1, t0##_x, x0, x0; \
	vaesenclast t4##_x, x7##_x, x7##_x; \
	vaesenclast t4##_x, t1##_x, t1##_x; \
	vinserti128 $1, t1##_x, x7, x7; \
	vaesenclast t4##_x, x3##_x, x3##_x; \
	vaesenclast t4##_x, t3##_x, t3##_x; \
	vinserti128 $1, t3##_x, x3, x3; \
	vaesenclast t4##_x, x6##_x, x6##_x; \
	vaesenclast t4##_x, t2##_x, t2##_x; \
	vinserti128 $1, t2##_x, x6, x6; \
	vextracti128 $1, x1, t3##_x; \
	vextracti128 $1, x4, t2##_x; \
	vbroadcasti128 .Lpost_tf_lo_s1, t0; \
	vbroadcasti128 .Lpost_tf_hi_s1, t1; \
	vaesenclast t4##_x, x2##_x, x2##_x; \
	vaesenclast t4##_x, t6##_x, t6##_x; \
	vinserti128 $1, t6##_x, x2, x2; \
	vaesenclast t4##_x, x5##_x, x5##_x; \
	vaesenclast t4##_x, t5##_x, t5##_x; \
	vinserti128 $1, t5##_x, x5, x5; \
	vaesenclast t4##_x, x1##_x, x1##_x; \
	vaesenclast t4##_x, t3##_x, t3##_x; \
	vinserti128 $1, t3##_x, x1, x1; \
	vaesenclast t4##_x, x4##_x, x4##_x; \
	vaesenclast t4##_x, t2##_x, t2##_x; \
	vinserti128 $1, t2##_x, x4, x4; \
	\
	/* postfilter sboxes 1 and 4 */ \
	vbroadcasti128 .Lpost_tf_lo_s3, t2; \
	vbroadcasti128 .Lpost_tf_hi_s3, t3; \
	filter_8bit(x0, t0, t1, t7, t6); \
	filter_8bit(x7, t0, t1, t7, t6); \
	filter_8bit(x3, t0, t1, t7, t6); \
	filter_8bit(x6, t0, t1, t7, t6); \
	\
	/* postfilter sbox 3 */ \
	vbroadcasti128 .Lpost_tf_lo_s2, t4; \
	vbroadcasti128 .Lpost_tf_hi_s2, t5; \
	filter_8bit(x2, t2, t3, t7, t6); \
	filter_8bit(x5, t2, t3, t7, t6); \
	\
	vpbroadcastq key, t0; /* higher 64-bit duplicate ignored */ \
	\
	/* postfilter sbox 2 */ \
	filter_8bit(x1, t4, t5, t7, t2); \
	filter_8bit(x4, t4, t5, t7, t2); \
	vpxor t7, t7, t7; \
	\
	vpsrldq $1, t0, t1; \
	vpsrldq $2, t0, t2; \
	vpshufb t7, t1, t1; \
	vpsrldq $3, t0, t3; \
	\
	/* P-function */ \
	vpxor x5, x0, x0; \
	vpxor x6, x1, x1; \
	vpxor x7, x2, x2; \
	vpxor x4, x3, x3; \
	\
	vpshufb t7, t2, t2; \
	vpsrldq $4, t0, t4; \
	vpshufb t7, t3, t3; \
	vpsrldq $5, t0, t5; \
	vpshufb t7, t4, t4; \
	\
	vpxor x2, x4, x4; \
	vpxor x3, x5, x5; \
	vpxor x0, x6, x6; \
	vpxor x1, x7, x7; \
	\
	vpsrldq $6, t0, t6; \
	vpshufb t7, t5, t5; \
	vpshufb t7, t6, t6; \
	\
	vpxor x7, x0, x0; \
	vpxor x4, x1, x1; \
	vpxor x5, x2, x2; \
	vpxor x6, x3, x3; \
	\
	vpxor x3, x4, x4; \
	vpxor x0, x5, x5; \
	vpxor x1, x6, x6; \
	vpxor x2, x7, x7; /* note: high and low parts swapped */ \
	\
	/* Add key material and result to CD (x becomes new CD) */ \
	\
	vpxor t6, x1, x1; \
	vpxor 5 * 32(mem_cd), x1, x1; \
	\
	vpsrldq $7, t0, t6; \
	vpshufb t7, t0, t0; \
	vpshufb t7, t6, t7; \
	\
	vpxor t7, x0, x0; \
	vpxor 4 * 32(mem_cd), x0, x0; \
	\
	vpxor t5, x2, x2; \
	vpxor 6 * 32(mem_cd), x2, x2; \
	\
	vpxor t4, x3, x3; \
	vpxor 7 * 32(mem_cd), x3, x3; \
	\
	vpxor t3, x4, x4; \
	vpxor 0 * 32(mem_cd), x4, x4; \
	\
	vpxor t2, x5, x5; \
	vpxor 1 * 32(mem_cd), x5, x5; \
	\
	vpxor t1, x6, x6; \
	vpxor 2 * 32(mem_cd), x6, x6; \
	\
	vpxor t0, x7, x7; \
	vpxor 3 * 32(mem_cd), x7, x7;

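/*
 * Key-material handling in roundsm32(): vpbroadcastq loads the 64-bit
 * round subkey into both 128-bit lanes, and the vpsrldq/vpshufb pairs
 * above splat each individual subkey byte across a whole vector
 * (vpshufb with an all-zero control replicates byte 0 everywhere).  In
 * byte-sliced form the scalar "xor 64-bit subkey into the state" thus
 * becomes eight vector XORs, one replicated subkey byte per state
 * vector, merged with the XOR against the old CD state:
 *
 *	x[i] = F_out[i] ^ splat(k[j]) ^ mem_cd[i];
 */
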
/*
 * Size optimization... with inlined roundsm32 the binary would be over
 * 5 times larger and only marginally faster.
 */
.align 8
SYM_FUNC_START_LOCAL(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)
	roundsm32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		  %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15,
		  %rcx, (%r9));
	RET;
SYM_FUNC_END(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)

.align 8
SYM_FUNC_START_LOCAL(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
	roundsm32(%ymm4, %ymm5, %ymm6, %ymm7, %ymm0, %ymm1, %ymm2, %ymm3,
		  %ymm12, %ymm13, %ymm14, %ymm15, %ymm8, %ymm9, %ymm10, %ymm11,
		  %rax, (%r9));
	RET;
SYM_FUNC_END(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)

/*
 * IN/OUT:
 *  x0..x7: byte-sliced AB state preloaded
 *  mem_ab: byte-sliced AB state in memory
 *  mem_cd: byte-sliced CD state in memory
 */
#define two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
	leaq (key_table + (i) * 8)(CTX), %r9; \
	call roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \
	\
	vmovdqu x0, 4 * 32(mem_cd); \
	vmovdqu x1, 5 * 32(mem_cd); \
	vmovdqu x2, 6 * 32(mem_cd); \
	vmovdqu x3, 7 * 32(mem_cd); \
	vmovdqu x4, 0 * 32(mem_cd); \
	vmovdqu x5, 1 * 32(mem_cd); \
	vmovdqu x6, 2 * 32(mem_cd); \
	vmovdqu x7, 3 * 32(mem_cd); \
	\
	leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \
	call roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \
	\
	store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);

#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */

#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
	/* Store new AB state */ \
	vmovdqu x4, 4 * 32(mem_ab); \
	vmovdqu x5, 5 * 32(mem_ab); \
	vmovdqu x6, 6 * 32(mem_ab); \
	vmovdqu x7, 7 * 32(mem_ab); \
	vmovdqu x0, 0 * 32(mem_ab); \
	vmovdqu x1, 1 * 32(mem_ab); \
	vmovdqu x2, 2 * 32(mem_ab); \
	vmovdqu x3, 3 * 32(mem_ab);

#define enc_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, mem_ab, mem_cd, i) \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);

#define dec_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, mem_ab, mem_cd, i) \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);

/*
 * IN:
 *  v0..3: byte-sliced 32-bit integers
 * OUT:
 *  v0..3: (IN <<< 1)
 */
#define rol32_1_32(v0, v1, v2, v3, t0, t1, t2, zero) \
	vpcmpgtb v0, zero, t0; \
	vpaddb v0, v0, v0; \
	vpabsb t0, t0; \
	\
	vpcmpgtb v1, zero, t1; \
	vpaddb v1, v1, v1; \
	vpabsb t1, t1; \
	\
	vpcmpgtb v2, zero, t2; \
	vpaddb v2, v2, v2; \
	vpabsb t2, t2; \
	\
	vpor t0, v1, v1; \
	\
	vpcmpgtb v3, zero, t0; \
	vpaddb v3, v3, v3; \
	vpabsb t0, t0; \
	\
	vpor t1, v2, v2; \
	vpor t2, v3, v3; \
	vpor t0, v0, v0;
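
/*
 * How rol32_1_32() works: v0..v3 hold the four byte slices of 32-bit
 * words, 32 words per vector.  For each slice, vpcmpgtb against zero
 * yields 0xff for bytes whose top bit is set (signed compare), vpabsb
 * turns that into 0x01, and vpaddb doubles the byte (a left shift by
 * one); ORing each extracted carry into the next slice completes the
 * per-word rotate.  Per-byte sketch:
 *
 *	carry(b) = b >> 7;
 *	v0' = (v0 << 1) | carry(v3);
 *	v1' = (v1 << 1) | carry(v0);
 *	v2' = (v2 << 1) | carry(v1);
 *	v3' = (v3 << 1) | carry(v2);
 */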

/*
 * IN:
 *  r: byte-sliced AB state in memory
 *  l: byte-sliced CD state in memory
 * OUT:
 *  x0..x7: new byte-sliced CD state
 */
#define fls32(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
	      tt1, tt2, tt3, kll, klr, krl, krr) \
	/* \
	 * t0 = kll; \
	 * t0 &= ll; \
	 * lr ^= rol32(t0, 1); \
	 */ \
	vpbroadcastd kll, t0; /* only lowest 32-bit used */ \
	vpxor tt0, tt0, tt0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpand l0, t0, t0; \
	vpand l1, t1, t1; \
	vpand l2, t2, t2; \
	vpand l3, t3, t3; \
	\
	rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
	\
	vpxor l4, t0, l4; \
	vpbroadcastd krr, t0; /* only lowest 32-bit used */ \
	vmovdqu l4, 4 * 32(l); \
	vpxor l5, t1, l5; \
	vmovdqu l5, 5 * 32(l); \
	vpxor l6, t2, l6; \
	vmovdqu l6, 6 * 32(l); \
	vpxor l7, t3, l7; \
	vmovdqu l7, 7 * 32(l); \
	\
	/* \
	 * t2 = krr; \
	 * t2 |= rr; \
	 * rl ^= t2; \
	 */ \
	\
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpor 4 * 32(r), t0, t0; \
	vpor 5 * 32(r), t1, t1; \
	vpor 6 * 32(r), t2, t2; \
	vpor 7 * 32(r), t3, t3; \
	\
	vpxor 0 * 32(r), t0, t0; \
	vpxor 1 * 32(r), t1, t1; \
	vpxor 2 * 32(r), t2, t2; \
	vpxor 3 * 32(r), t3, t3; \
	vmovdqu t0, 0 * 32(r); \
	vpbroadcastd krl, t0; /* only lowest 32-bit used */ \
	vmovdqu t1, 1 * 32(r); \
	vmovdqu t2, 2 * 32(r); \
	vmovdqu t3, 3 * 32(r); \
	\
	/* \
	 * t2 = krl; \
	 * t2 &= rl; \
	 * rr ^= rol32(t2, 1); \
	 */ \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpand 0 * 32(r), t0, t0; \
	vpand 1 * 32(r), t1, t1; \
	vpand 2 * 32(r), t2, t2; \
	vpand 3 * 32(r), t3, t3; \
	\
	rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
	\
	vpxor 4 * 32(r), t0, t0; \
	vpxor 5 * 32(r), t1, t1; \
	vpxor 6 * 32(r), t2, t2; \
	vpxor 7 * 32(r), t3, t3; \
	vmovdqu t0, 4 * 32(r); \
	vpbroadcastd klr, t0; /* only lowest 32-bit used */ \
	vmovdqu t1, 5 * 32(r); \
	vmovdqu t2, 6 * 32(r); \
	vmovdqu t3, 7 * 32(r); \
	\
	/* \
	 * t0 = klr; \
	 * t0 |= lr; \
	 * ll ^= t0; \
	 */ \
	\
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpor l4, t0, t0; \
	vpor l5, t1, t1; \
	vpor l6, t2, t2; \
	vpor l7, t3, t3; \
	\
	vpxor l0, t0, l0; \
	vmovdqu l0, 0 * 32(l); \
	vpxor l1, t1, l1; \
	vmovdqu l1, 1 * 32(l); \
	vpxor l2, t2, l2; \
	vmovdqu l2, 2 * 32(l); \
	vpxor l3, t3, l3; \
	vmovdqu l3, 3 * 32(l);
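
/*
 * fls32() is the Camellia FL/FL⁻¹ layer for all 32 blocks at once in
 * byte-sliced form.  Per block it performs the four scalar steps shown
 * in the inline pseudocode fragments above:
 *
 *	lr ^= rol32(ll & kll, 1);
 *	rl ^= (rr | krr);
 *	rr ^= rol32(rl & krl, 1);
 *	ll ^= (lr | klr);
 *
 * The vpbroadcastd + vpsrldq/vpshufb sequences splat each byte of the
 * 32-bit subkey words across a vector, mirroring the subkey handling
 * in roundsm32().
 */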

#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x1, x0, x0; \
	\
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x2; \
	\
	vpunpckhqdq t1, x0, x1; \
	vpunpcklqdq t1, x0, x0; \
	\
	vpunpckhqdq x2, t2, x3; \
	vpunpcklqdq x2, t2, x2;
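
/*
 * transpose_4x4() transposes a 4x4 matrix of 32-bit elements (held in
 * x0..x3, independently within each 128-bit lane) with the usual
 * unpack-dword / unpack-qword two-step:
 *
 *	[ a0 a1 a2 a3 ]      [ a0 b0 c0 d0 ]
 *	[ b0 b1 b2 b3 ]  ->  [ a1 b1 c1 d1 ]
 *	[ c0 c1 c2 c3 ]      [ a2 b2 c2 d2 ]
 *	[ d0 d1 d2 d3 ]      [ a3 b3 c3 d3 ]
 */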

#define byteslice_16x16b_fast(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, \
			      a3, b3, c3, d3, st0, st1) \
	vmovdqu d2, st0; \
	vmovdqu d3, st1; \
	transpose_4x4(a0, a1, a2, a3, d2, d3); \
	transpose_4x4(b0, b1, b2, b3, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu a0, st0; \
	vmovdqu a1, st1; \
	transpose_4x4(c0, c1, c2, c3, a0, a1); \
	transpose_4x4(d0, d1, d2, d3, a0, a1); \
	\
	vbroadcasti128 .Lshufb_16x16b, a0; \
	vmovdqu st1, a1; \
	vpshufb a0, a2, a2; \
	vpshufb a0, a3, a3; \
	vpshufb a0, b0, b0; \
	vpshufb a0, b1, b1; \
	vpshufb a0, b2, b2; \
	vpshufb a0, b3, b3; \
	vpshufb a0, a1, a1; \
	vpshufb a0, c0, c0; \
	vpshufb a0, c1, c1; \
	vpshufb a0, c2, c2; \
	vpshufb a0, c3, c3; \
	vpshufb a0, d0, d0; \
	vpshufb a0, d1, d1; \
	vpshufb a0, d2, d2; \
	vpshufb a0, d3, d3; \
	vmovdqu d3, st1; \
	vmovdqu st0, d3; \
	vpshufb a0, d3, a0; \
	vmovdqu d2, st0; \
	\
	transpose_4x4(a0, b0, c0, d0, d2, d3); \
	transpose_4x4(a1, b1, c1, d1, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu b0, st0; \
	vmovdqu b1, st1; \
	transpose_4x4(a2, b2, c2, d2, b0, b1); \
	transpose_4x4(a3, b3, c3, d3, b0, b1); \
	vmovdqu st0, b0; \
	vmovdqu st1, b1; \
	/* does not adjust output bytes inside vectors */
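
/*
 * byteslice_16x16b_fast() is a 16x16 byte-matrix transpose, performed
 * independently in each 128-bit lane: on input every vector holds whole
 * 16-byte blocks; on output vector i holds byte i of all 32 blocks.
 * This byte-sliced layout is what lets a single vpshufb or vaesenclast
 * act as 32 parallel per-byte s-box lookups.  It is composed of the
 * 4x4 dword transposes above plus the per-dword byte shuffle
 * .Lshufb_16x16b; as its trailing note says, the byte order inside
 * each vector is left as-is.
 */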

/* load blocks to registers and apply pre-whitening */
#define inpack32_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, rio, key) \
	vpbroadcastq key, x0; \
	vpshufb .Lpack_bswap, x0, x0; \
	\
	vpxor 0 * 32(rio), x0, y7; \
	vpxor 1 * 32(rio), x0, y6; \
	vpxor 2 * 32(rio), x0, y5; \
	vpxor 3 * 32(rio), x0, y4; \
	vpxor 4 * 32(rio), x0, y3; \
	vpxor 5 * 32(rio), x0, y2; \
	vpxor 6 * 32(rio), x0, y1; \
	vpxor 7 * 32(rio), x0, y0; \
	vpxor 8 * 32(rio), x0, x7; \
	vpxor 9 * 32(rio), x0, x6; \
	vpxor 10 * 32(rio), x0, x5; \
	vpxor 11 * 32(rio), x0, x4; \
	vpxor 12 * 32(rio), x0, x3; \
	vpxor 13 * 32(rio), x0, x2; \
	vpxor 14 * 32(rio), x0, x1; \
	vpxor 15 * 32(rio), x0, x0;

/* byteslice pre-whitened blocks and store to temporary memory */
#define inpack32_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd) \
	byteslice_16x16b_fast(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, \
			      y4, y5, y6, y7, (mem_ab), (mem_cd)); \
	\
	vmovdqu x0, 0 * 32(mem_ab); \
	vmovdqu x1, 1 * 32(mem_ab); \
	vmovdqu x2, 2 * 32(mem_ab); \
	vmovdqu x3, 3 * 32(mem_ab); \
	vmovdqu x4, 4 * 32(mem_ab); \
	vmovdqu x5, 5 * 32(mem_ab); \
	vmovdqu x6, 6 * 32(mem_ab); \
	vmovdqu x7, 7 * 32(mem_ab); \
	vmovdqu y0, 0 * 32(mem_cd); \
	vmovdqu y1, 1 * 32(mem_cd); \
	vmovdqu y2, 2 * 32(mem_cd); \
	vmovdqu y3, 3 * 32(mem_cd); \
	vmovdqu y4, 4 * 32(mem_cd); \
	vmovdqu y5, 5 * 32(mem_cd); \
	vmovdqu y6, 6 * 32(mem_cd); \
	vmovdqu y7, 7 * 32(mem_cd);

/* de-byteslice, apply post-whitening and store blocks */
#define outunpack32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
		    y5, y6, y7, key, stack_tmp0, stack_tmp1) \
	byteslice_16x16b_fast(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, \
			      y3, y7, x3, x7, stack_tmp0, stack_tmp1); \
	\
	vmovdqu x0, stack_tmp0; \
	\
	vpbroadcastq key, x0; \
	vpshufb .Lpack_bswap, x0, x0; \
	\
	vpxor x0, y7, y7; \
	vpxor x0, y6, y6; \
	vpxor x0, y5, y5; \
	vpxor x0, y4, y4; \
	vpxor x0, y3, y3; \
	vpxor x0, y2, y2; \
	vpxor x0, y1, y1; \
	vpxor x0, y0, y0; \
	vpxor x0, x7, x7; \
	vpxor x0, x6, x6; \
	vpxor x0, x5, x5; \
	vpxor x0, x4, x4; \
	vpxor x0, x3, x3; \
	vpxor x0, x2, x2; \
	vpxor x0, x1, x1; \
	vpxor stack_tmp0, x0, x0;

#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, rio) \
	vmovdqu x0, 0 * 32(rio); \
	vmovdqu x1, 1 * 32(rio); \
	vmovdqu x2, 2 * 32(rio); \
	vmovdqu x3, 3 * 32(rio); \
	vmovdqu x4, 4 * 32(rio); \
	vmovdqu x5, 5 * 32(rio); \
	vmovdqu x6, 6 * 32(rio); \
	vmovdqu x7, 7 * 32(rio); \
	vmovdqu y0, 8 * 32(rio); \
	vmovdqu y1, 9 * 32(rio); \
	vmovdqu y2, 10 * 32(rio); \
	vmovdqu y3, 11 * 32(rio); \
	vmovdqu y4, 12 * 32(rio); \
	vmovdqu y5, 13 * 32(rio); \
	vmovdqu y6, 14 * 32(rio); \
	vmovdqu y7, 15 * 32(rio);


.section .rodata.cst32.shufb_16x16b, "aM", @progbits, 32
.align 32
#define SHUFB_BYTES(idx) \
	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
.Lshufb_16x16b:
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
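
/*
 * Each .byte line above expands to the lane-local index pattern
 * 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, i.e. a 4x4
 * byte transpose within every dword quartet, complementing the
 * dword-level transpose_4x4() used by byteslice_16x16b_fast().
 */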

.section .rodata.cst32.pack_bswap, "aM", @progbits, 32
.align 32
.Lpack_bswap:
	.long 0x00010203, 0x04050607, 0x80808080, 0x80808080
	.long 0x00010203, 0x04050607, 0x80808080, 0x80808080

/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
.section .rodata.cst16, "aM", @progbits, 16
.align 16

/* For CTR-mode IV byteswap */
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

/* For XTS mode */
.Lxts_gf128mul_and_shl1_mask_0:
	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
.Lxts_gf128mul_and_shl1_mask_1:
	.byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0
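
/*
 * The gf128mul_and_shl1 masks support multiplying the 128-bit XTS
 * tweak by x (mask_0) and, it appears, by x^2 (mask_1) in GF(2^128):
 * 0x87 is the reduction term from the XTS polynomial
 * x^128 + x^7 + x^2 + x + 1 for a one-bit shift, 0x0e/2 the
 * corresponding values for a two-bit shift.
 */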

/*
 * pre-SubByte transform
 *
 * pre-lookup for sbox1, sbox2, sbox3:
 *   swap_bitendianness(
 *       isom_map_camellia_to_aes(
 *           camellia_f(
 *               swap_bitendianness(in)
 *           )
 *       )
 *   )
 *
 * (note: '⊕ 0xc5' inside camellia_f())
 */
.Lpre_tf_lo_s1:
	.byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
	.byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
.Lpre_tf_hi_s1:
	.byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
	.byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23

/*
 * pre-SubByte transform
 *
 * pre-lookup for sbox4:
 *   swap_bitendianness(
 *       isom_map_camellia_to_aes(
 *           camellia_f(
 *               swap_bitendianness(in <<< 1)
 *           )
 *       )
 *   )
 *
 * (note: '⊕ 0xc5' inside camellia_f())
 */
.Lpre_tf_lo_s4:
	.byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
	.byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
.Lpre_tf_hi_s4:
	.byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
	.byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf

/*
 * post-SubByte transform
 *
 * post-lookup for sbox1, sbox4:
 *  swap_bitendianness(
 *      camellia_h(
 *          isom_map_aes_to_camellia(
 *              swap_bitendianness(
 *                  aes_inverse_affine_transform(in)
 *              )
 *          )
 *      )
 *  )
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s1:
	.byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
	.byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
.Lpost_tf_hi_s1:
	.byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
	.byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c

/*
 * post-SubByte transform
 *
 * post-lookup for sbox2:
 *  swap_bitendianness(
 *      camellia_h(
 *          isom_map_aes_to_camellia(
 *              swap_bitendianness(
 *                  aes_inverse_affine_transform(in)
 *              )
 *          )
 *      )
 *  ) <<< 1
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s2:
	.byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
	.byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
.Lpost_tf_hi_s2:
	.byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
	.byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18

/*
 * post-SubByte transform
 *
 * post-lookup for sbox3:
 *  swap_bitendianness(
 *      camellia_h(
 *          isom_map_aes_to_camellia(
 *              swap_bitendianness(
 *                  aes_inverse_affine_transform(in)
 *              )
 *          )
 *      )
 *  ) >>> 1
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s3:
	.byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
	.byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
.Lpost_tf_hi_s3:
	.byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
	.byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06

/* For isolating SubBytes from AESENCLAST, inverse shift row */
.Linv_shift_row:
	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03

.section .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
.align 4
/* 4-bit mask */
.L0f0f0f0f:
	.long 0x0f0f0f0f

.text

.align 8
SYM_FUNC_START_LOCAL(__camellia_enc_blk32)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rax: temporary storage, 512 bytes
	 *	%ymm0..%ymm15: 32 plaintext blocks
	 * output:
	 *	%ymm0..%ymm15: 32 encrypted blocks, order swapped:
	 *	7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 771) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 772) FRAME_BEGIN
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 773)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 774) leaq 8 * 32(%rax), %rcx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 775)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 776) inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 777) %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 778) %ymm15, %rax, %rcx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 779)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 780) enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 781) %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 782) %ymm15, %rax, %rcx, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 783)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 784) fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 785) %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 786) %ymm15,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 787) ((key_table + (8) * 8) + 0)(CTX),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 788) ((key_table + (8) * 8) + 4)(CTX),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 789) ((key_table + (8) * 8) + 8)(CTX),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 790) ((key_table + (8) * 8) + 12)(CTX));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 791)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 792) enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 793) %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 794) %ymm15, %rax, %rcx, 8);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 795)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 796) fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 797) %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 798) %ymm15,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 799) ((key_table + (16) * 8) + 0)(CTX),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 800) ((key_table + (16) * 8) + 4)(CTX),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 801) ((key_table + (16) * 8) + 8)(CTX),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 802) ((key_table + (16) * 8) + 12)(CTX));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 803)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 804) enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 805) %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 806) %ymm15, %rax, %rcx, 16);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 807)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 808) movl $24, %r8d;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 809) cmpl $16, key_length(CTX);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 810) jne .Lenc_max32;
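	/*
	 * Camellia-128 (key_length == 16) falls through to .Lenc_done with
	 * %r8d = 24: output whitening is read from key_table slot 24 and
	 * the cipher ends after the third block of rounds. Longer keys
	 * branch to .Lenc_max32 for one more FL/FL⁻¹ layer plus a fourth
	 * block of rounds, with whitening taken from slot 32.
	 */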
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 811)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 812) .Lenc_done:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 813) /* load CD for output */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 814) vmovdqu 0 * 32(%rcx), %ymm8;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 815) vmovdqu 1 * 32(%rcx), %ymm9;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 816) vmovdqu 2 * 32(%rcx), %ymm10;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 817) vmovdqu 3 * 32(%rcx), %ymm11;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 818) vmovdqu 4 * 32(%rcx), %ymm12;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 819) vmovdqu 5 * 32(%rcx), %ymm13;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 820) vmovdqu 6 * 32(%rcx), %ymm14;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 821) vmovdqu 7 * 32(%rcx), %ymm15;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 822)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 823) outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 824) %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 825) %ymm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 32(%rax));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 826)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 827) FRAME_END
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 828) ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 829)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 830) .align 8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 831) .Lenc_max32:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 832) movl $32, %r8d;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 833)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 834) fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 835) %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 836) %ymm15,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 837) ((key_table + (24) * 8) + 0)(CTX),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 838) ((key_table + (24) * 8) + 4)(CTX),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 839) ((key_table + (24) * 8) + 8)(CTX),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 840) ((key_table + (24) * 8) + 12)(CTX));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 841)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 842) enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 843) %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 844) %ymm15, %rax, %rcx, 24);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 845)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 846) jmp .Lenc_done;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 847) SYM_FUNC_END(__camellia_enc_blk32)
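/*
 * Rough C-level sketch of the flow above, assuming the usual Camellia
 * grouping of six Feistel rounds between FL layers (rounds6, fl_layer and
 * whiten are hypothetical stand-ins for the enc_rounds32/fls32/outunpack32
 * macros; k[] indexes 8-byte key_table slots):
 *
 *	rounds6(state);  fl_layer(state, k[8], k[9]);
 *	rounds6(state);  fl_layer(state, k[16], k[17]);
 *	rounds6(state);
 *	if (key_bits > 128) {
 *		fl_layer(state, k[24], k[25]);
 *		rounds6(state);
 *	}
 *	whiten(state, key_bits == 128 ? &k[24] : &k[32]);
 */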
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 848)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 849) .align 8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 850) SYM_FUNC_START_LOCAL(__camellia_dec_blk32)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 851) /* input:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 852) * %rdi: ctx, CTX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 853) * %rax: temporary storage, 512 bytes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 854) * %r8d: 24 for 16-byte key, 32 for larger
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 855) * %ymm0..%ymm15: 32 encrypted blocks
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 856) * output:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 857) * %ymm0..%ymm15: 32 plaintext blocks, order swapped:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 858) * 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 859) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 860) FRAME_BEGIN
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 861)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 862) leaq 8 * 32(%rax), %rcx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 863)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 864) inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 865) %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 866) %ymm15, %rax, %rcx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 867)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 868) cmpl $32, %r8d;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 869) je .Ldec_max32;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 870)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 871) .Ldec_max24:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 872) dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 873) %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 874) %ymm15, %rax, %rcx, 16);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 875)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 876) fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 877) %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 878) %ymm15,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 879) ((key_table + (16) * 8) + 8)(CTX),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 880) ((key_table + (16) * 8) + 12)(CTX),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 881) ((key_table + (16) * 8) + 0)(CTX),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 882) ((key_table + (16) * 8) + 4)(CTX));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 883)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 884) dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 885) %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 886) %ymm15, %rax, %rcx, 8);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 887)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 888) fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 889) %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 890) %ymm15,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 891) ((key_table + (8) * 8) + 8)(CTX),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 892) ((key_table + (8) * 8) + 12)(CTX),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 893) ((key_table + (8) * 8) + 0)(CTX),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 894) ((key_table + (8) * 8) + 4)(CTX));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 895)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 896) dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 897) %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 898) %ymm15, %rax, %rcx, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 899)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 900) /* load CD for output */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 901) vmovdqu 0 * 32(%rcx), %ymm8;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 902) vmovdqu 1 * 32(%rcx), %ymm9;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 903) vmovdqu 2 * 32(%rcx), %ymm10;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 904) vmovdqu 3 * 32(%rcx), %ymm11;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 905) vmovdqu 4 * 32(%rcx), %ymm12;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 906) vmovdqu 5 * 32(%rcx), %ymm13;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 907) vmovdqu 6 * 32(%rcx), %ymm14;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 908) vmovdqu 7 * 32(%rcx), %ymm15;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 909)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 910) outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 911) %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 912) %ymm15, (key_table)(CTX), (%rax), 1 * 32(%rax));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 913)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 914) FRAME_END
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 915) ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 916)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 917) .align 8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 918) .Ldec_max32:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 919) dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 920) %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 921) %ymm15, %rax, %rcx, 24);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 922)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 923) fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 924) %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 925) %ymm15,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 926) ((key_table + (24) * 8) + 8)(CTX),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 927) ((key_table + (24) * 8) + 12)(CTX),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 928) ((key_table + (24) * 8) + 0)(CTX),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 929) ((key_table + (24) * 8) + 4)(CTX));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 930)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 931) jmp .Ldec_max24;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 932) SYM_FUNC_END(__camellia_dec_blk32)
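/*
 * Decryption replays the schedule backwards: dec_rounds32 runs at key
 * offsets 24/16/8/0, and each fls32 call above swaps its two 8-byte key
 * halves (+8/+12 passed before +0/+4) relative to the encryption path,
 * since FL and FL⁻¹ trade places when inverting. Output whitening is
 * taken from the start of key_table, i.e. the encryption prewhitening
 * key.
 */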
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 933)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 934) SYM_FUNC_START(camellia_ecb_enc_32way)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 935) /* input:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 936) * %rdi: ctx, CTX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 937) * %rsi: dst (32 blocks)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 938) * %rdx: src (32 blocks)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 939) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 940) FRAME_BEGIN
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 941)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 942) vzeroupper;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 943)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 944) inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 945) %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 946) %ymm15, %rdx, (key_table)(CTX));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 947)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 948) /* now dst can be used as temporary buffer (even in src == dst case) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 949) movq %rsi, %rax;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 950)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 951) call __camellia_enc_blk32;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 952)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 953) write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 954) %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 955) %ymm8, %rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 956)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 957) vzeroupper;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 958)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 959) FRAME_END
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 960) ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 961) SYM_FUNC_END(camellia_ecb_enc_32way)
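/*
 * C-side view, as an assumption for orientation (the authoritative
 * prototype lives in the glue code):
 *
 *	asmlinkage void camellia_ecb_enc_32way(struct camellia_ctx *ctx,
 *					       u8 *dst, const u8 *src);
 *
 * dst and src each span 32 * 16 bytes and may alias; the routine spills
 * through dst, so no separate scratch buffer is required.
 */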
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 962)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 963) SYM_FUNC_START(camellia_ecb_dec_32way)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 964) /* input:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 965) * %rdi: ctx, CTX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 966) * %rsi: dst (32 blocks)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 967) * %rdx: src (32 blocks)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 968) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 969) FRAME_BEGIN
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 970)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 971) vzeroupper;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 972)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 973) cmpl $16, key_length(CTX);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 974) movl $32, %r8d;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 975) movl $24, %eax;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 976) cmovel %eax, %r8d; /* max */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 977)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 978) inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 979) %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 980) %ymm15, %rdx, (key_table)(CTX, %r8, 8));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 981)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 982) /* now dst can be used as temporary buffer (even in src == dst case) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 983) movq %rsi, %rax;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 984)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 985) call __camellia_dec_blk32;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 986)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 987) write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 988) %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 989) %ymm8, %rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 990)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 991) vzeroupper;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 992)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 993) FRAME_END
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 994) ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 995) SYM_FUNC_END(camellia_ecb_dec_32way)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 996)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 997) SYM_FUNC_START(camellia_cbc_dec_32way)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 998) /* input:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 999) * %rdi: ctx, CTX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1000) * %rsi: dst (32 blocks)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1001) * %rdx: src (32 blocks)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1002) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1003) FRAME_BEGIN
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1004)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1005) vzeroupper;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1006)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1007) cmpl $16, key_length(CTX);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1008) movl $32, %r8d;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1009) movl $24, %eax;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1010) cmovel %eax, %r8d; /* max */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1011)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1012) inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1013) %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1014) %ymm15, %rdx, (key_table)(CTX, %r8, 8));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1015)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1016) movq %rsp, %r10;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1017) cmpq %rsi, %rdx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1018) je .Lcbc_dec_use_stack;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1019)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1020) /* dst can be used as temporary storage, src is not overwritten. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1021) movq %rsi, %rax;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1022) jmp .Lcbc_dec_continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1023)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1024) .Lcbc_dec_use_stack:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1025) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1026) * dst is still in use (because dst == src), so use the stack for temporary
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1027) * storage.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1028) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1029) subq $(16 * 32), %rsp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1030) movq %rsp, %rax;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1031)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1032) .Lcbc_dec_continue:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1033) call __camellia_dec_blk32;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1034)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1035) vmovdqu %ymm7, (%rax);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1036) vpxor %ymm7, %ymm7, %ymm7;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1037) vinserti128 $1, (%rdx), %ymm7, %ymm7;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1038) vpxor (%rax), %ymm7, %ymm7;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1039) movq %r10, %rsp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1040) vpxor (0 * 32 + 16)(%rdx), %ymm6, %ymm6;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1041) vpxor (1 * 32 + 16)(%rdx), %ymm5, %ymm5;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1042) vpxor (2 * 32 + 16)(%rdx), %ymm4, %ymm4;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1043) vpxor (3 * 32 + 16)(%rdx), %ymm3, %ymm3;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1044) vpxor (4 * 32 + 16)(%rdx), %ymm2, %ymm2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1045) vpxor (5 * 32 + 16)(%rdx), %ymm1, %ymm1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1046) vpxor (6 * 32 + 16)(%rdx), %ymm0, %ymm0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1047) vpxor (7 * 32 + 16)(%rdx), %ymm15, %ymm15;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1048) vpxor (8 * 32 + 16)(%rdx), %ymm14, %ymm14;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1049) vpxor (9 * 32 + 16)(%rdx), %ymm13, %ymm13;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1050) vpxor (10 * 32 + 16)(%rdx), %ymm12, %ymm12;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1051) vpxor (11 * 32 + 16)(%rdx), %ymm11, %ymm11;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1052) vpxor (12 * 32 + 16)(%rdx), %ymm10, %ymm10;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1053) vpxor (13 * 32 + 16)(%rdx), %ymm9, %ymm9;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1054) vpxor (14 * 32 + 16)(%rdx), %ymm8, %ymm8;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1055) write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1056) %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1057) %ymm8, %rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1058)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1059) vzeroupper;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1060)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1061) FRAME_END
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1062) ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1063) SYM_FUNC_END(camellia_cbc_dec_32way)
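/*
 * CBC decryption: P[i] = D(C[i]) ^ C[i-1]. The vinserti128 sequence above
 * xors block 1 (the high lane of %ymm7) with ciphertext block 0 while
 * leaving block 0 as plain D(C[0]); xoring block 0 with the IV is left to
 * the C glue (an assumption based on the usual split). Roughly:
 *
 *	for (i = 31; i >= 1; i--)
 *		dst[i] = decrypt(ctx, src[i]) ^ src[i - 1];
 *	dst[0] = decrypt(ctx, src[0]);
 *
 * with the caller then xoring the IV into dst[0].
 */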
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1064)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1065) #define inc_le128(x, minus_one, tmp) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1066) vpcmpeqq minus_one, x, tmp; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1067) vpsubq minus_one, x, x; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1068) vpslldq $8, tmp, tmp; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1069) vpsubq tmp, x, x;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1070)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1071) #define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1072) vpcmpeqq minus_one, x, tmp1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1073) vpcmpeqq minus_two, x, tmp2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1074) vpsubq minus_two, x, x; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1075) vpor tmp2, tmp1, tmp1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1076) vpslldq $8, tmp1, tmp1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1077) vpsubq tmp1, x, x;
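/*
 * Both macros implement 128-bit little-endian counter arithmetic with
 * 64-bit SIMD ops: minus_one/minus_two hold {-1, 0} / {-2, 0} per 128-bit
 * lane, so the unconditional vpsubq adds 1 (or 2) to the low qword only.
 * vpcmpeqq flags the low-qword values that are about to wrap, vpslldq
 * moves that all-ones mask up into the high-qword position, and the final
 * vpsubq turns it into a carry. Scalar equivalent per lane, with n = 1
 * or 2:
 *
 *	x[0] += n;
 *	if (x[0] < n)	wrapped, so carry
 *		x[1]++;
 */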
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1078)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1079) SYM_FUNC_START(camellia_ctr_32way)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1080) /* input:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1081) * %rdi: ctx, CTX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1082) * %rsi: dst (32 blocks)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1083) * %rdx: src (32 blocks)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1084) * %rcx: iv (little-endian, 128-bit)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1085) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1086) FRAME_BEGIN
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1087)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1088) vzeroupper;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1089)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1090) movq %rsp, %r10;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1091) cmpq %rsi, %rdx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1092) je .Lctr_use_stack;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1093)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1094) /* dst can be used as temporary storage, src is not overwritten. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1095) movq %rsi, %rax;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1096) jmp .Lctr_continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1097)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1098) .Lctr_use_stack:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1099) subq $(16 * 32), %rsp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1100) movq %rsp, %rax;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1101)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1102) .Lctr_continue:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1103) vpcmpeqd %ymm15, %ymm15, %ymm15;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1104) vpsrldq $8, %ymm15, %ymm15; /* ab: -1:0 ; cd: -1:0 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1105) vpaddq %ymm15, %ymm15, %ymm12; /* ab: -2:0 ; cd: -2:0 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1106)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1107) /* load IV and byteswap */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1108) vmovdqu (%rcx), %xmm0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1109) vmovdqa %xmm0, %xmm1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1110) inc_le128(%xmm0, %xmm15, %xmm14);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1111) vbroadcasti128 .Lbswap128_mask, %ymm14;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1112) vinserti128 $1, %xmm0, %ymm1, %ymm0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1113) vpshufb %ymm14, %ymm0, %ymm13;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1114) vmovdqu %ymm13, 15 * 32(%rax);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1115)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1116) /* construct IVs */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1117) add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); /* ab:le2 ; cd:le3 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1118) vpshufb %ymm14, %ymm0, %ymm13;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1119) vmovdqu %ymm13, 14 * 32(%rax);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1120) add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1121) vpshufb %ymm14, %ymm0, %ymm13;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1122) vmovdqu %ymm13, 13 * 32(%rax);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1123) add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1124) vpshufb %ymm14, %ymm0, %ymm13;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1125) vmovdqu %ymm13, 12 * 32(%rax);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1126) add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1127) vpshufb %ymm14, %ymm0, %ymm13;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1128) vmovdqu %ymm13, 11 * 32(%rax);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1129) add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1130) vpshufb %ymm14, %ymm0, %ymm10;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1131) add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1132) vpshufb %ymm14, %ymm0, %ymm9;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1133) add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1134) vpshufb %ymm14, %ymm0, %ymm8;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1135) add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1136) vpshufb %ymm14, %ymm0, %ymm7;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1137) add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1138) vpshufb %ymm14, %ymm0, %ymm6;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1139) add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1140) vpshufb %ymm14, %ymm0, %ymm5;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1141) add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1142) vpshufb %ymm14, %ymm0, %ymm4;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1143) add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1144) vpshufb %ymm14, %ymm0, %ymm3;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1145) add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1146) vpshufb %ymm14, %ymm0, %ymm2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1147) add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1148) vpshufb %ymm14, %ymm0, %ymm1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1149) add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1150) vextracti128 $1, %ymm0, %xmm13;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1151) vpshufb %ymm14, %ymm0, %ymm0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1152) inc_le128(%xmm13, %xmm15, %xmm14);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1153) vmovdqu %xmm13, (%rcx);
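	/*
	 * %ymm0 held the last pair of counters (iv+30, iv+31); lane 1 was
	 * extracted and bumped once more to iv+32, which is stored back
	 * through %rcx so the caller's counter is ready for the next
	 * 32-block batch.
	 */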
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1154)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1155) /* inpack32_pre: */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1156) vpbroadcastq (key_table)(CTX), %ymm15;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1157) vpshufb .Lpack_bswap, %ymm15, %ymm15;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1158) vpxor %ymm0, %ymm15, %ymm0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1159) vpxor %ymm1, %ymm15, %ymm1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1160) vpxor %ymm2, %ymm15, %ymm2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1161) vpxor %ymm3, %ymm15, %ymm3;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1162) vpxor %ymm4, %ymm15, %ymm4;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1163) vpxor %ymm5, %ymm15, %ymm5;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1164) vpxor %ymm6, %ymm15, %ymm6;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1165) vpxor %ymm7, %ymm15, %ymm7;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1166) vpxor %ymm8, %ymm15, %ymm8;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1167) vpxor %ymm9, %ymm15, %ymm9;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1168) vpxor %ymm10, %ymm15, %ymm10;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1169) vpxor 11 * 32(%rax), %ymm15, %ymm11;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1170) vpxor 12 * 32(%rax), %ymm15, %ymm12;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1171) vpxor 13 * 32(%rax), %ymm15, %ymm13;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1172) vpxor 14 * 32(%rax), %ymm15, %ymm14;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1173) vpxor 15 * 32(%rax), %ymm15, %ymm15;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1174)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1175) call __camellia_enc_blk32;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1176)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1177) movq %r10, %rsp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1178)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1179) vpxor 0 * 32(%rdx), %ymm7, %ymm7;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1180) vpxor 1 * 32(%rdx), %ymm6, %ymm6;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1181) vpxor 2 * 32(%rdx), %ymm5, %ymm5;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1182) vpxor 3 * 32(%rdx), %ymm4, %ymm4;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1183) vpxor 4 * 32(%rdx), %ymm3, %ymm3;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1184) vpxor 5 * 32(%rdx), %ymm2, %ymm2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1185) vpxor 6 * 32(%rdx), %ymm1, %ymm1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1186) vpxor 7 * 32(%rdx), %ymm0, %ymm0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1187) vpxor 8 * 32(%rdx), %ymm15, %ymm15;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1188) vpxor 9 * 32(%rdx), %ymm14, %ymm14;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1189) vpxor 10 * 32(%rdx), %ymm13, %ymm13;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1190) vpxor 11 * 32(%rdx), %ymm12, %ymm12;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1191) vpxor 12 * 32(%rdx), %ymm11, %ymm11;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1192) vpxor 13 * 32(%rdx), %ymm10, %ymm10;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1193) vpxor 14 * 32(%rdx), %ymm9, %ymm9;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1194) vpxor 15 * 32(%rdx), %ymm8, %ymm8;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1195) write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1196) %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1197) %ymm8, %rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1198)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1199) vzeroupper;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1200)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1201) FRAME_END
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1202) ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1203) SYM_FUNC_END(camellia_ctr_32way)
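/*
 * Net effect in rough C (bswap128/encrypt are illustrative stand-ins, not
 * real helpers): the 32 big-endian counter blocks prepared above are the
 * "plaintext" fed to __camellia_enc_blk32, and the result is xored with
 * the source to form the CTR output:
 *
 *	for (i = 0; i < 32; i++)
 *		dst[i] = src[i] ^ encrypt(ctx, bswap128(ctr + i));
 *	ctr += 32;
 */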
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1204)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1205) #define gf128mul_x_ble(iv, mask, tmp) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1206) vpsrad $31, iv, tmp; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1207) vpaddq iv, iv, iv; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1208) vpshufd $0x13, tmp, tmp; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1209) vpand mask, tmp, tmp; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1210) vpxor tmp, iv, iv;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1211)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1212) #define gf128mul_x2_ble(iv, mask1, mask2, tmp0, tmp1) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1213) vpsrad $31, iv, tmp0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1214) vpaddq iv, iv, tmp1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1215) vpsllq $2, iv, iv; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1216) vpshufd $0x13, tmp0, tmp0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1217) vpsrad $31, tmp1, tmp1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1218) vpand mask2, tmp0, tmp0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1219) vpshufd $0x13, tmp1, tmp1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1220) vpxor tmp0, iv, iv; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1221) vpand mask1, tmp1, tmp1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1222) vpxor tmp1, iv, iv;
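/*
 * gf128mul_x_ble multiplies the XTS tweak by x (α) in GF(2¹²⁸) with the
 * block held little-endian: shift the 128-bit value left by one bit and,
 * if a bit falls off the top, fold in the reduction constant 0x87. The
 * vpsrad/vpshufd pair routes the two carry bits (bit 63 into the high
 * qword, bit 127 into the reduction) that the per-qword vpaddq doubling
 * cannot propagate by itself. Scalar sketch of one doubling:
 *
 *	carry = iv.hi >> 63;
 *	iv.hi = (iv.hi << 1) | (iv.lo >> 63);
 *	iv.lo = (iv.lo << 1) ^ (carry ? 0x87 : 0);
 *
 * gf128mul_x2_ble fuses two such doublings (multiplication by α²).
 */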
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1223)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1224) .align 8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1225) SYM_FUNC_START_LOCAL(camellia_xts_crypt_32way)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1226) /* input:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1227) * %rdi: ctx, CTX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1228) * %rsi: dst (32 blocks)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1229) * %rdx: src (32 blocks)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1230) * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1231) * %r8: index for input whitening key
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1232) * %r9: pointer to __camellia_enc_blk32 or __camellia_dec_blk32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1233) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1234) FRAME_BEGIN
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1235)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1236) vzeroupper;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1237)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1238) subq $(16 * 32), %rsp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1239) movq %rsp, %rax;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1240)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1241) vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_0, %ymm12;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1242)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1243) /* load IV and construct second IV */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1244) vmovdqu (%rcx), %xmm0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1245) vmovdqa %xmm0, %xmm15;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1246) gf128mul_x_ble(%xmm0, %xmm12, %xmm13);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1247) vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_1, %ymm13;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1248) vinserti128 $1, %xmm0, %ymm15, %ymm0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1249) vpxor 0 * 32(%rdx), %ymm0, %ymm15;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1250) vmovdqu %ymm15, 15 * 32(%rax);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1251) vmovdqu %ymm0, 0 * 32(%rsi);
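	/*
	 * Two stores per tweak from here on: tweak ^ source goes to
	 * registers, or to the stack scratch area when registers run
	 * short, while the raw tweaks are parked in dst (%rsi). That lets
	 * the post-encryption vpxor pass below re-read the tweaks from
	 * dst for the second XTS xor without recomputing them.
	 */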
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1252)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1253) /* construct IVs */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1254) gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1255) vpxor 1 * 32(%rdx), %ymm0, %ymm15;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1256) vmovdqu %ymm15, 14 * 32(%rax);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1257) vmovdqu %ymm0, 1 * 32(%rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1258)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1259) gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1260) vpxor 2 * 32(%rdx), %ymm0, %ymm15;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1261) vmovdqu %ymm15, 13 * 32(%rax);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1262) vmovdqu %ymm0, 2 * 32(%rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1263)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1264) gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1265) vpxor 3 * 32(%rdx), %ymm0, %ymm15;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1266) vmovdqu %ymm15, 12 * 32(%rax);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1267) vmovdqu %ymm0, 3 * 32(%rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1268)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1269) gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1270) vpxor 4 * 32(%rdx), %ymm0, %ymm11;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1271) vmovdqu %ymm0, 4 * 32(%rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1272)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1273) gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1274) vpxor 5 * 32(%rdx), %ymm0, %ymm10;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1275) vmovdqu %ymm0, 5 * 32(%rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1276)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1277) gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1278) vpxor 6 * 32(%rdx), %ymm0, %ymm9;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1279) vmovdqu %ymm0, 6 * 32(%rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1280)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1281) gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1282) vpxor 7 * 32(%rdx), %ymm0, %ymm8;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1283) vmovdqu %ymm0, 7 * 32(%rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1284)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1285) gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1286) vpxor 8 * 32(%rdx), %ymm0, %ymm7;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1287) vmovdqu %ymm0, 8 * 32(%rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1288)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1289) gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1290) vpxor 9 * 32(%rdx), %ymm0, %ymm6;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1291) vmovdqu %ymm0, 9 * 32(%rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1292)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1293) gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1294) vpxor 10 * 32(%rdx), %ymm0, %ymm5;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1295) vmovdqu %ymm0, 10 * 32(%rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1296)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1297) gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1298) vpxor 11 * 32(%rdx), %ymm0, %ymm4;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1299) vmovdqu %ymm0, 11 * 32(%rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1300)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1301) gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1302) vpxor 12 * 32(%rdx), %ymm0, %ymm3;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1303) vmovdqu %ymm0, 12 * 32(%rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1304)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1305) gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1306) vpxor 13 * 32(%rdx), %ymm0, %ymm2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1307) vmovdqu %ymm0, 13 * 32(%rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1308)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1309) gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1310) vpxor 14 * 32(%rdx), %ymm0, %ymm1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1311) vmovdqu %ymm0, 14 * 32(%rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1312)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1313) gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1314) vpxor 15 * 32(%rdx), %ymm0, %ymm15;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1315) vmovdqu %ymm15, 0 * 32(%rax);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1316) vmovdqu %ymm0, 15 * 32(%rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1317)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1318) vextracti128 $1, %ymm0, %xmm0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1319) gf128mul_x_ble(%xmm0, %xmm12, %xmm15);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1320) vmovdqu %xmm0, (%rcx);
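	/*
	 * The high lane of the last tweak pair is t * α³¹; one more
	 * doubling gives t * α³², written back through %rcx as the
	 * starting tweak for the next call.
	 */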
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1321)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1322) /* inpack32_pre: */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1323) vpbroadcastq (key_table)(CTX, %r8, 8), %ymm15;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1324) vpshufb .Lpack_bswap, %ymm15, %ymm15;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1325) vpxor 0 * 32(%rax), %ymm15, %ymm0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1326) vpxor %ymm1, %ymm15, %ymm1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1327) vpxor %ymm2, %ymm15, %ymm2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1328) vpxor %ymm3, %ymm15, %ymm3;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1329) vpxor %ymm4, %ymm15, %ymm4;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1330) vpxor %ymm5, %ymm15, %ymm5;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1331) vpxor %ymm6, %ymm15, %ymm6;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1332) vpxor %ymm7, %ymm15, %ymm7;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1333) vpxor %ymm8, %ymm15, %ymm8;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1334) vpxor %ymm9, %ymm15, %ymm9;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1335) vpxor %ymm10, %ymm15, %ymm10;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1336) vpxor %ymm11, %ymm15, %ymm11;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1337) vpxor 12 * 32(%rax), %ymm15, %ymm12;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1338) vpxor 13 * 32(%rax), %ymm15, %ymm13;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1339) vpxor 14 * 32(%rax), %ymm15, %ymm14;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1340) vpxor 15 * 32(%rax), %ymm15, %ymm15;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1341)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1342) CALL_NOSPEC r9;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1343)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1344) addq $(16 * 32), %rsp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1345)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1346) vpxor 0 * 32(%rsi), %ymm7, %ymm7;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1347) vpxor 1 * 32(%rsi), %ymm6, %ymm6;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1348) vpxor 2 * 32(%rsi), %ymm5, %ymm5;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1349) vpxor 3 * 32(%rsi), %ymm4, %ymm4;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1350) vpxor 4 * 32(%rsi), %ymm3, %ymm3;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1351) vpxor 5 * 32(%rsi), %ymm2, %ymm2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1352) vpxor 6 * 32(%rsi), %ymm1, %ymm1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1353) vpxor 7 * 32(%rsi), %ymm0, %ymm0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1354) vpxor 8 * 32(%rsi), %ymm15, %ymm15;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1355) vpxor 9 * 32(%rsi), %ymm14, %ymm14;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1356) vpxor 10 * 32(%rsi), %ymm13, %ymm13;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1357) vpxor 11 * 32(%rsi), %ymm12, %ymm12;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1358) vpxor 12 * 32(%rsi), %ymm11, %ymm11;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1359) vpxor 13 * 32(%rsi), %ymm10, %ymm10;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1360) vpxor 14 * 32(%rsi), %ymm9, %ymm9;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1361) vpxor 15 * 32(%rsi), %ymm8, %ymm8;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1362) write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1363) %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1364) %ymm8, %rsi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1365)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1366) vzeroupper;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1367)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1368) FRAME_END
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1369) ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1370) SYM_FUNC_END(camellia_xts_crypt_32way)
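/*
 * Per 16-byte block this implements out = F(in ^ T) ^ T with
 * T(i+1) = T(i) * α, where F is the encryption or decryption primitive
 * passed in %r9. Illustrative C:
 *
 *	for (i = 0; i < 32; i++) {
 *		dst[i] = F(ctx, src[i] ^ T) ^ T;
 *		T = gf128mul_x_ble(T);
 *	}
 */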
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1371)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1372) SYM_FUNC_START(camellia_xts_enc_32way)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1373) /* input:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1374) * %rdi: ctx, CTX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1375) * %rsi: dst (32 blocks)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1376) * %rdx: src (32 blocks)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1377) * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1378) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1379)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1380) xorl %r8d, %r8d; /* input whitening key, 0 for enc */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1381)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1382) leaq __camellia_enc_blk32, %r9;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1383)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1384) jmp camellia_xts_crypt_32way;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1385) SYM_FUNC_END(camellia_xts_enc_32way)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1386)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1387) SYM_FUNC_START(camellia_xts_dec_32way)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1388) /* input:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1389) * %rdi: ctx, CTX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1390) * %rsi: dst (32 blocks)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1391) * %rdx: src (32 blocks)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1392) * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1393) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1394)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1395) cmpl $16, key_length(CTX);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1396) movl $32, %r8d;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1397) movl $24, %eax;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1398) cmovel %eax, %r8d; /* input whitening key, last for dec */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1399)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1400) leaq __camellia_dec_blk32, %r9;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1401)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1402) jmp camellia_xts_crypt_32way;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1403) SYM_FUNC_END(camellia_xts_dec_32way)