/*
 * x86_64/AVX/AES-NI assembler implementation of Camellia
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 */

/*
 * Version licensed under 2-clause BSD License is available at:
 *	http://koti.mbnet.fi/axh/crypto/camellia-BSD-1.2.0-aesni1.tar.xz
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include <asm/nospec-branch.h>

#define CAMELLIA_TABLE_BYTE_LEN 272

/* struct camellia_ctx: */
#define key_table 0
#define key_length CAMELLIA_TABLE_BYTE_LEN

/* register macros */
#define CTX %rdi

/**********************************************************************
  16-way camellia
 **********************************************************************/
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
	vpand x, mask4bit, tmp0; \
	vpandn x, mask4bit, x; \
	vpsrld $4, x, x; \
	\
	vpshufb tmp0, lo_t, tmp0; \
	vpshufb x, hi_t, x; \
	vpxor tmp0, x, x;
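
/*
 * For reference, filter_8bit() above is a split-nibble table lookup.
 * Roughly, per byte (a C sketch; lo_t/hi_t hold 16-entry tables):
 *
 *	x = lo_t[x & 0x0f] ^ hi_t[(x >> 4) & 0x0f];
 *
 * vpshufb performs the sixteen 4-bit lookups of one table in parallel;
 * mask4bit is expected to hold 0x0f in every byte, and vpsrld is safe
 * here because the masked-out low nibble bits are already zero.
 */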

/*
 * IN:
 *  x0..x7: byte-sliced AB state
 *  mem_cd: register pointer storing CD state
 *  key: index for key material
 * OUT:
 *  x0..x7: new byte-sliced CD state
 */
#define roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
		  t7, mem_cd, key) \
	/* \
	 * S-function with AES subbytes \
	 */ \
	vmovdqa .Linv_shift_row, t4; \
	vbroadcastss .L0f0f0f0f, t7; \
	vmovdqa .Lpre_tf_lo_s1, t0; \
	vmovdqa .Lpre_tf_hi_s1, t1; \
	\
	/* AES inverse shift rows */ \
	vpshufb t4, x0, x0; \
	vpshufb t4, x7, x7; \
	vpshufb t4, x1, x1; \
	vpshufb t4, x4, x4; \
	vpshufb t4, x2, x2; \
	vpshufb t4, x5, x5; \
	vpshufb t4, x3, x3; \
	vpshufb t4, x6, x6; \
	\
	/* prefilter sboxes 1, 2 and 3 */ \
	vmovdqa .Lpre_tf_lo_s4, t2; \
	vmovdqa .Lpre_tf_hi_s4, t3; \
	filter_8bit(x0, t0, t1, t7, t6); \
	filter_8bit(x7, t0, t1, t7, t6); \
	filter_8bit(x1, t0, t1, t7, t6); \
	filter_8bit(x4, t0, t1, t7, t6); \
	filter_8bit(x2, t0, t1, t7, t6); \
	filter_8bit(x5, t0, t1, t7, t6); \
	\
	/* prefilter sbox 4 */ \
	vpxor t4, t4, t4; \
	filter_8bit(x3, t2, t3, t7, t6); \
	filter_8bit(x6, t2, t3, t7, t6); \
	\
	/* AES subbytes + AES shift rows */ \
	vmovdqa .Lpost_tf_lo_s1, t0; \
	vmovdqa .Lpost_tf_hi_s1, t1; \
	vaesenclast t4, x0, x0; \
	vaesenclast t4, x7, x7; \
	vaesenclast t4, x1, x1; \
	vaesenclast t4, x4, x4; \
	vaesenclast t4, x2, x2; \
	vaesenclast t4, x5, x5; \
	vaesenclast t4, x3, x3; \
	vaesenclast t4, x6, x6; \
	\
	/* postfilter sboxes 1 and 4 */ \
	vmovdqa .Lpost_tf_lo_s3, t2; \
	vmovdqa .Lpost_tf_hi_s3, t3; \
	filter_8bit(x0, t0, t1, t7, t6); \
	filter_8bit(x7, t0, t1, t7, t6); \
	filter_8bit(x3, t0, t1, t7, t6); \
	filter_8bit(x6, t0, t1, t7, t6); \
	\
	/* postfilter sbox 3 */ \
	vmovdqa .Lpost_tf_lo_s2, t4; \
	vmovdqa .Lpost_tf_hi_s2, t5; \
	filter_8bit(x2, t2, t3, t7, t6); \
	filter_8bit(x5, t2, t3, t7, t6); \
	\
	vpxor t6, t6, t6; \
	vmovq key, t0; \
	\
	/* postfilter sbox 2 */ \
	filter_8bit(x1, t4, t5, t7, t2); \
	filter_8bit(x4, t4, t5, t7, t2); \
	\
	vpsrldq $5, t0, t5; \
	vpsrldq $1, t0, t1; \
	vpsrldq $2, t0, t2; \
	vpsrldq $3, t0, t3; \
	vpsrldq $4, t0, t4; \
	vpshufb t6, t0, t0; \
	vpshufb t6, t1, t1; \
	vpshufb t6, t2, t2; \
	vpshufb t6, t3, t3; \
	vpshufb t6, t4, t4; \
	vpsrldq $2, t5, t7; \
	vpshufb t6, t7, t7; \
	\
	/* \
	 * P-function \
	 */ \
	vpxor x5, x0, x0; \
	vpxor x6, x1, x1; \
	vpxor x7, x2, x2; \
	vpxor x4, x3, x3; \
	\
	vpxor x2, x4, x4; \
	vpxor x3, x5, x5; \
	vpxor x0, x6, x6; \
	vpxor x1, x7, x7; \
	\
	vpxor x7, x0, x0; \
	vpxor x4, x1, x1; \
	vpxor x5, x2, x2; \
	vpxor x6, x3, x3; \
	\
	vpxor x3, x4, x4; \
	vpxor x0, x5, x5; \
	vpxor x1, x6, x6; \
	vpxor x2, x7, x7; /* note: high and low parts swapped */ \
	\
	/* \
	 * Add key material and result to CD (x becomes new CD) \
	 */ \
	\
	vpxor t3, x4, x4; \
	vpxor 0 * 16(mem_cd), x4, x4; \
	\
	vpxor t2, x5, x5; \
	vpxor 1 * 16(mem_cd), x5, x5; \
	\
	vpsrldq $1, t5, t3; \
	vpshufb t6, t5, t5; \
	vpshufb t6, t3, t6; \
	\
	vpxor t1, x6, x6; \
	vpxor 2 * 16(mem_cd), x6, x6; \
	\
	vpxor t0, x7, x7; \
	vpxor 3 * 16(mem_cd), x7, x7; \
	\
	vpxor t7, x0, x0; \
	vpxor 4 * 16(mem_cd), x0, x0; \
	\
	vpxor t6, x1, x1; \
	vpxor 5 * 16(mem_cd), x1, x1; \
	\
	vpxor t5, x2, x2; \
	vpxor 6 * 16(mem_cd), x2, x2; \
	\
	vpxor t4, x3, x3; \
	vpxor 7 * 16(mem_cd), x3, x3;
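
/*
 * Note on the key handling above: vmovq places the 64-bit subkey in the
 * low quadword of t0, and the vpsrldq/vpshufb sequence (with t6 zeroed
 * as the shuffle mask) broadcasts each of its eight bytes into a
 * register of its own; key byte k[i] then gets XORed into byte-slice
 * x[7 - i] together with the old CD state.
 */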

/*
 * Size optimization... with inlined roundsm16 the binary would be over
 * 5 times larger and only 0.5% faster (on Sandy Bridge).
 */
.align 8
SYM_FUNC_START_LOCAL(roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)
	roundsm16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		  %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
		  %rcx, (%r9));
	ret;
SYM_FUNC_END(roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)

.align 8
SYM_FUNC_START_LOCAL(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
	roundsm16(%xmm4, %xmm5, %xmm6, %xmm7, %xmm0, %xmm1, %xmm2, %xmm3,
		  %xmm12, %xmm13, %xmm14, %xmm15, %xmm8, %xmm9, %xmm10, %xmm11,
		  %rax, (%r9));
	ret;
SYM_FUNC_END(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)

/*
 * IN/OUT:
 *  x0..x7: byte-sliced AB state preloaded
 *  mem_ab: byte-sliced AB state in memory
 *  mem_cd: byte-sliced CD state in memory
 */
#define two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
	leaq (key_table + (i) * 8)(CTX), %r9; \
	call roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \
	\
	vmovdqu x4, 0 * 16(mem_cd); \
	vmovdqu x5, 1 * 16(mem_cd); \
	vmovdqu x6, 2 * 16(mem_cd); \
	vmovdqu x7, 3 * 16(mem_cd); \
	vmovdqu x0, 4 * 16(mem_cd); \
	vmovdqu x1, 5 * 16(mem_cd); \
	vmovdqu x2, 6 * 16(mem_cd); \
	vmovdqu x3, 7 * 16(mem_cd); \
	\
	leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \
	call roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \
	\
	store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);
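
/*
 * Each two_roundsm16() invocation is one Feistel pair: the first call
 * derives the new CD state from AB (written back to mem_cd above), the
 * second derives the new AB state from CD and leaves it in registers;
 * store_ab selects whether that AB state is also written to mem_ab
 * (store_ab_state below) or not (dummy_store).
 */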

#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */

#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
	/* Store new AB state */ \
	vmovdqu x0, 0 * 16(mem_ab); \
	vmovdqu x1, 1 * 16(mem_ab); \
	vmovdqu x2, 2 * 16(mem_ab); \
	vmovdqu x3, 3 * 16(mem_ab); \
	vmovdqu x4, 4 * 16(mem_ab); \
	vmovdqu x5, 5 * 16(mem_ab); \
	vmovdqu x6, 6 * 16(mem_ab); \
	vmovdqu x7, 7 * 16(mem_ab);

#define enc_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, mem_ab, mem_cd, i) \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);

#define dec_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, mem_ab, mem_cd, i) \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);

/*
 * IN:
 *  v0..3: byte-sliced 32-bit integers
 * OUT:
 *  v0..3: (IN <<< 1)
 */
#define rol32_1_16(v0, v1, v2, v3, t0, t1, t2, zero) \
	vpcmpgtb v0, zero, t0; \
	vpaddb v0, v0, v0; \
	vpabsb t0, t0; \
	\
	vpcmpgtb v1, zero, t1; \
	vpaddb v1, v1, v1; \
	vpabsb t1, t1; \
	\
	vpcmpgtb v2, zero, t2; \
	vpaddb v2, v2, v2; \
	vpabsb t2, t2; \
	\
	vpor t0, v1, v1; \
	\
	vpcmpgtb v3, zero, t0; \
	vpaddb v3, v3, v3; \
	vpabsb t0, t0; \
	\
	vpor t1, v2, v2; \
	vpor t2, v3, v3; \
	vpor t0, v0, v0;
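
/*
 * Rough view of rol32_1_16(): vpcmpgtb against zero yields 0xff in
 * every byte whose sign bit is set and vpabsb turns that into 0x01, so
 * t = MSB of each byte; vpaddb v, v, v is the left shift by one.  The
 * carry then moves up one byte slice (v0 -> v1 -> v2 -> v3) and wraps
 * around from v3 to v0, giving a 1-bit rotate of each 32-bit integer
 * spread across the four slices.
 */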

/*
 * IN:
 *  l: pointer to byte-sliced AB state in memory
 *  r: pointer to byte-sliced CD state in memory
 * OUT:
 *  l0..l7: new byte-sliced AB state
 */
#define fls16(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
	      tt1, tt2, tt3, kll, klr, krl, krr) \
	/* \
	 * t0 = kll; \
	 * t0 &= ll; \
	 * lr ^= rol32(t0, 1); \
	 */ \
	vpxor tt0, tt0, tt0; \
	vmovd kll, t0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpand l0, t0, t0; \
	vpand l1, t1, t1; \
	vpand l2, t2, t2; \
	vpand l3, t3, t3; \
	\
	rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
	\
	vpxor l4, t0, l4; \
	vmovdqu l4, 4 * 16(l); \
	vpxor l5, t1, l5; \
	vmovdqu l5, 5 * 16(l); \
	vpxor l6, t2, l6; \
	vmovdqu l6, 6 * 16(l); \
	vpxor l7, t3, l7; \
	vmovdqu l7, 7 * 16(l); \
	\
	/* \
	 * t2 = krr; \
	 * t2 |= rr; \
	 * rl ^= t2; \
	 */ \
	\
	vmovd krr, t0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpor 4 * 16(r), t0, t0; \
	vpor 5 * 16(r), t1, t1; \
	vpor 6 * 16(r), t2, t2; \
	vpor 7 * 16(r), t3, t3; \
	\
	vpxor 0 * 16(r), t0, t0; \
	vpxor 1 * 16(r), t1, t1; \
	vpxor 2 * 16(r), t2, t2; \
	vpxor 3 * 16(r), t3, t3; \
	vmovdqu t0, 0 * 16(r); \
	vmovdqu t1, 1 * 16(r); \
	vmovdqu t2, 2 * 16(r); \
	vmovdqu t3, 3 * 16(r); \
	\
	/* \
	 * t2 = krl; \
	 * t2 &= rl; \
	 * rr ^= rol32(t2, 1); \
	 */ \
	vmovd krl, t0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpand 0 * 16(r), t0, t0; \
	vpand 1 * 16(r), t1, t1; \
	vpand 2 * 16(r), t2, t2; \
	vpand 3 * 16(r), t3, t3; \
	\
	rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
	\
	vpxor 4 * 16(r), t0, t0; \
	vpxor 5 * 16(r), t1, t1; \
	vpxor 6 * 16(r), t2, t2; \
	vpxor 7 * 16(r), t3, t3; \
	vmovdqu t0, 4 * 16(r); \
	vmovdqu t1, 5 * 16(r); \
	vmovdqu t2, 6 * 16(r); \
	vmovdqu t3, 7 * 16(r); \
	\
	/* \
	 * t0 = klr; \
	 * t0 |= lr; \
	 * ll ^= t0; \
	 */ \
	\
	vmovd klr, t0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpor l4, t0, t0; \
	vpor l5, t1, t1; \
	vpor l6, t2, t2; \
	vpor l7, t3, t3; \
	\
	vpxor l0, t0, l0; \
	vmovdqu l0, 0 * 16(l); \
	vpxor l1, t1, l1; \
	vmovdqu l1, 1 * 16(l); \
	vpxor l2, t2, l2; \
	vmovdqu l2, 2 * 16(l); \
	vpxor l3, t3, l3; \
	vmovdqu l3, 3 * 16(l);

#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x1, x0, x0; \
	\
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x2; \
	\
	vpunpckhqdq t1, x0, x1; \
	vpunpcklqdq t1, x0, x0; \
	\
	vpunpckhqdq x2, t2, x3; \
	vpunpcklqdq x2, t2, x2;
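
/*
 * transpose_4x4() is the usual two-level unpack transpose of 32-bit
 * lanes: with x0 = {a0 a1 a2 a3} .. x3 = {d0 d1 d2 d3} on input, the
 * dword unpacks build {a0 b0 a1 b1}-style pairs and the qword unpacks
 * merge them, leaving x0 = {a0 b0 c0 d0} .. x3 = {a3 b3 c3 d3}.
 */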

#define byteslice_16x16b(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, \
			 b3, c3, d3, st0, st1) \
	vmovdqu d2, st0; \
	vmovdqu d3, st1; \
	transpose_4x4(a0, a1, a2, a3, d2, d3); \
	transpose_4x4(b0, b1, b2, b3, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu a0, st0; \
	vmovdqu a1, st1; \
	transpose_4x4(c0, c1, c2, c3, a0, a1); \
	transpose_4x4(d0, d1, d2, d3, a0, a1); \
	\
	vmovdqu .Lshufb_16x16b, a0; \
	vmovdqu st1, a1; \
	vpshufb a0, a2, a2; \
	vpshufb a0, a3, a3; \
	vpshufb a0, b0, b0; \
	vpshufb a0, b1, b1; \
	vpshufb a0, b2, b2; \
	vpshufb a0, b3, b3; \
	vpshufb a0, a1, a1; \
	vpshufb a0, c0, c0; \
	vpshufb a0, c1, c1; \
	vpshufb a0, c2, c2; \
	vpshufb a0, c3, c3; \
	vpshufb a0, d0, d0; \
	vpshufb a0, d1, d1; \
	vpshufb a0, d2, d2; \
	vpshufb a0, d3, d3; \
	vmovdqu d3, st1; \
	vmovdqu st0, d3; \
	vpshufb a0, d3, a0; \
	vmovdqu d2, st0; \
	\
	transpose_4x4(a0, b0, c0, d0, d2, d3); \
	transpose_4x4(a1, b1, c1, d1, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu b0, st0; \
	vmovdqu b1, st1; \
	transpose_4x4(a2, b2, c2, d2, b0, b1); \
	transpose_4x4(a3, b3, c3, d3, b0, b1); \
	vmovdqu st0, b0; \
	vmovdqu st1, b1; \
	/* does not adjust output bytes inside vectors */

/* load blocks to registers and apply pre-whitening */
#define inpack16_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, rio, key) \
	vmovq key, x0; \
	vpshufb .Lpack_bswap, x0, x0; \
	\
	vpxor 0 * 16(rio), x0, y7; \
	vpxor 1 * 16(rio), x0, y6; \
	vpxor 2 * 16(rio), x0, y5; \
	vpxor 3 * 16(rio), x0, y4; \
	vpxor 4 * 16(rio), x0, y3; \
	vpxor 5 * 16(rio), x0, y2; \
	vpxor 6 * 16(rio), x0, y1; \
	vpxor 7 * 16(rio), x0, y0; \
	vpxor 8 * 16(rio), x0, x7; \
	vpxor 9 * 16(rio), x0, x6; \
	vpxor 10 * 16(rio), x0, x5; \
	vpxor 11 * 16(rio), x0, x4; \
	vpxor 12 * 16(rio), x0, x3; \
	vpxor 13 * 16(rio), x0, x2; \
	vpxor 14 * 16(rio), x0, x1; \
	vpxor 15 * 16(rio), x0, x0;
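
/*
 * Note: .Lpack_bswap byte-swaps each 32-bit half of the 64-bit
 * whitening key and zeroes the high quadword (0x80 bytes in a vpshufb
 * mask select zero), so the XORs above only modify the first eight
 * bytes of every block; presumably the other half of Camellia's
 * 128-bit whitening is folded into the subkeys by the key schedule.
 */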

/* byteslice pre-whitened blocks and store to temporary memory */
#define inpack16_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd) \
	byteslice_16x16b(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
			 y5, y6, y7, (mem_ab), (mem_cd)); \
	\
	vmovdqu x0, 0 * 16(mem_ab); \
	vmovdqu x1, 1 * 16(mem_ab); \
	vmovdqu x2, 2 * 16(mem_ab); \
	vmovdqu x3, 3 * 16(mem_ab); \
	vmovdqu x4, 4 * 16(mem_ab); \
	vmovdqu x5, 5 * 16(mem_ab); \
	vmovdqu x6, 6 * 16(mem_ab); \
	vmovdqu x7, 7 * 16(mem_ab); \
	vmovdqu y0, 0 * 16(mem_cd); \
	vmovdqu y1, 1 * 16(mem_cd); \
	vmovdqu y2, 2 * 16(mem_cd); \
	vmovdqu y3, 3 * 16(mem_cd); \
	vmovdqu y4, 4 * 16(mem_cd); \
	vmovdqu y5, 5 * 16(mem_cd); \
	vmovdqu y6, 6 * 16(mem_cd); \
	vmovdqu y7, 7 * 16(mem_cd);

/* de-byteslice, apply post-whitening and store blocks */
#define outunpack16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
		    y5, y6, y7, key, stack_tmp0, stack_tmp1) \
	byteslice_16x16b(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, y3, \
			 y7, x3, x7, stack_tmp0, stack_tmp1); \
	\
	vmovdqu x0, stack_tmp0; \
	\
	vmovq key, x0; \
	vpshufb .Lpack_bswap, x0, x0; \
	\
	vpxor x0, y7, y7; \
	vpxor x0, y6, y6; \
	vpxor x0, y5, y5; \
	vpxor x0, y4, y4; \
	vpxor x0, y3, y3; \
	vpxor x0, y2, y2; \
	vpxor x0, y1, y1; \
	vpxor x0, y0, y0; \
	vpxor x0, x7, x7; \
	vpxor x0, x6, x6; \
	vpxor x0, x5, x5; \
	vpxor x0, x4, x4; \
	vpxor x0, x3, x3; \
	vpxor x0, x2, x2; \
	vpxor x0, x1, x1; \
	vpxor stack_tmp0, x0, x0;

#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, rio) \
	vmovdqu x0, 0 * 16(rio); \
	vmovdqu x1, 1 * 16(rio); \
	vmovdqu x2, 2 * 16(rio); \
	vmovdqu x3, 3 * 16(rio); \
	vmovdqu x4, 4 * 16(rio); \
	vmovdqu x5, 5 * 16(rio); \
	vmovdqu x6, 6 * 16(rio); \
	vmovdqu x7, 7 * 16(rio); \
	vmovdqu y0, 8 * 16(rio); \
	vmovdqu y1, 9 * 16(rio); \
	vmovdqu y2, 10 * 16(rio); \
	vmovdqu y3, 11 * 16(rio); \
	vmovdqu y4, 12 * 16(rio); \
	vmovdqu y5, 13 * 16(rio); \
	vmovdqu y6, 14 * 16(rio); \
	vmovdqu y7, 15 * 16(rio);


/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
.section .rodata.cst16, "aM", @progbits, 16
.align 16

#define SHUFB_BYTES(idx) \
	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)

.Lshufb_16x16b:
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);

.Lpack_bswap:
	.long 0x00010203
	.long 0x04050607
	.long 0x80808080
	.long 0x80808080

/* For CTR-mode IV byteswap */
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

/* For XTS mode IV generation */
.Lxts_gf128mul_and_shl1_mask:
	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
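
/*
 * The mask above encodes doubling in GF(2^128) for the XTS tweak.
 * Conceptually (128-bit arithmetic, a rough sketch):
 *
 *	new_tweak = (tweak << 1) ^ ((tweak >> 127) ? 0x87 : 0);
 *
 * i.e. shift left by one and, on carry out of bit 127, reduce by the
 * polynomial x^128 + x^7 + x^2 + x + 1 (the 0x87 byte).
 */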

/*
 * pre-SubByte transform
 *
 * pre-lookup for sbox1, sbox2, sbox3:
 *   swap_bitendianness(
 *       isom_map_camellia_to_aes(
 *           camellia_f(
 *               swap_bitendianness(in)
 *           )
 *       )
 *   )
 *
 * (note: '⊕ 0xc5' inside camellia_f())
 */
.Lpre_tf_lo_s1:
	.byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
	.byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
.Lpre_tf_hi_s1:
	.byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
	.byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23

/*
 * pre-SubByte transform
 *
 * pre-lookup for sbox4:
 *   swap_bitendianness(
 *       isom_map_camellia_to_aes(
 *           camellia_f(
 *               swap_bitendianness(in <<< 1)
 *           )
 *       )
 *   )
 *
 * (note: '⊕ 0xc5' inside camellia_f())
 */
.Lpre_tf_lo_s4:
	.byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
	.byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
.Lpre_tf_hi_s4:
	.byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
	.byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf

/*
 * post-SubByte transform
 *
 * post-lookup for sbox1, sbox4:
 *   swap_bitendianness(
 *       camellia_h(
 *           isom_map_aes_to_camellia(
 *               swap_bitendianness(
 *                   aes_inverse_affine_transform(in)
 *               )
 *           )
 *       )
 *   )
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s1:
	.byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
	.byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
.Lpost_tf_hi_s1:
	.byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
	.byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c

/*
 * post-SubByte transform
 *
 * post-lookup for sbox2:
 *   swap_bitendianness(
 *       camellia_h(
 *           isom_map_aes_to_camellia(
 *               swap_bitendianness(
 *                   aes_inverse_affine_transform(in)
 *               )
 *           )
 *       )
 *   ) <<< 1
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s2:
	.byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
	.byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
.Lpost_tf_hi_s2:
	.byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
	.byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18

/*
 * post-SubByte transform
 *
 * post-lookup for sbox3:
 *   swap_bitendianness(
 *       camellia_h(
 *           isom_map_aes_to_camellia(
 *               swap_bitendianness(
 *                   aes_inverse_affine_transform(in)
 *               )
 *           )
 *       )
 *   ) >>> 1
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s3:
	.byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
	.byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
.Lpost_tf_hi_s3:
	.byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
	.byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06

/* For isolating SubBytes from AESENCLAST, inverse shift row */
.Linv_shift_row:
	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
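
/*
 * Why this works: AESENCLAST with an all-zero round key performs
 * ShiftRows, SubBytes and a no-op AddRoundKey.  Shuffling the input
 * with the inverse ShiftRows permutation above cancels the ShiftRows
 * step, leaving a bare 16-byte-parallel SubBytes for the S-function.
 */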

/* 4-bit mask */
.section .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
.align 4
.L0f0f0f0f:
	.long 0x0f0f0f0f

.text

.align 8
SYM_FUNC_START_LOCAL(__camellia_enc_blk16)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rax: temporary storage, 256 bytes
	 *	%xmm0..%xmm15: 16 plaintext blocks
	 * output:
	 *	%xmm0..%xmm15: 16 encrypted blocks, order swapped:
	 *	 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
	 */
	FRAME_BEGIN

	leaq 8 * 16(%rax), %rcx;

	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		      %xmm15, %rax, %rcx);

	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 0);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (8) * 8) + 0)(CTX),
	      ((key_table + (8) * 8) + 4)(CTX),
	      ((key_table + (8) * 8) + 8)(CTX),
	      ((key_table + (8) * 8) + 12)(CTX));

	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 8);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (16) * 8) + 0)(CTX),
	      ((key_table + (16) * 8) + 4)(CTX),
	      ((key_table + (16) * 8) + 8)(CTX),
	      ((key_table + (16) * 8) + 12)(CTX));

	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 16);

	movl $24, %r8d;
	cmpl $16, key_length(CTX);
	jne .Lenc_max32;
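
	/* Camellia-128 (18 rounds, two FL/FL⁻¹ layers) ends here; %r8d
	 * holds the key-table index of the output whitening key: entry
	 * 24 of the 26 subkeys for 128-bit keys, entry 32 of 34
	 * otherwise (.Lenc_max32 runs the extra 6-round block first).
	 */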

.Lenc_done:
	/* load CD for output */
	vmovdqu 0 * 16(%rcx), %xmm8;
	vmovdqu 1 * 16(%rcx), %xmm9;
	vmovdqu 2 * 16(%rcx), %xmm10;
	vmovdqu 3 * 16(%rcx), %xmm11;
	vmovdqu 4 * 16(%rcx), %xmm12;
	vmovdqu 5 * 16(%rcx), %xmm13;
	vmovdqu 6 * 16(%rcx), %xmm14;
	vmovdqu 7 * 16(%rcx), %xmm15;

	outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		    %xmm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 16(%rax));

	FRAME_END
	ret;

.align 8
.Lenc_max32:
	movl $32, %r8d;

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (24) * 8) + 0)(CTX),
	      ((key_table + (24) * 8) + 4)(CTX),
	      ((key_table + (24) * 8) + 8)(CTX),
	      ((key_table + (24) * 8) + 12)(CTX));

	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 24);

	jmp .Lenc_done;
SYM_FUNC_END(__camellia_enc_blk16)

^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 811) .align 8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 812) SYM_FUNC_START_LOCAL(__camellia_dec_blk16)
	/* input:
	 * %rdi: ctx, CTX
	 * %rax: temporary storage, 256 bytes
	 * %r8d: 24 for 16-byte key, 32 for larger
	 * %xmm0..%xmm15: 16 encrypted blocks
	 * output:
	 * %xmm0..%xmm15: 16 plaintext blocks, order swapped:
	 *  7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
	 */
	FRAME_BEGIN

	leaq 8 * 16(%rax), %rcx;

	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		      %xmm15, %rax, %rcx);

	cmpl $32, %r8d;
	je .Ldec_max32;

.Ldec_max24:
	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 16);

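	/*
	 * Note: relative to the encryption path, the two 64-bit halves of
	 * each FL subkey pair are passed in swapped order (+8, +12 before
	 * +0, +4): FL and FL⁻¹ trade places when the cipher is run
	 * backwards.
	 */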
	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (16) * 8) + 8)(CTX),
	      ((key_table + (16) * 8) + 12)(CTX),
	      ((key_table + (16) * 8) + 0)(CTX),
	      ((key_table + (16) * 8) + 4)(CTX));

	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 8);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (8) * 8) + 8)(CTX),
	      ((key_table + (8) * 8) + 12)(CTX),
	      ((key_table + (8) * 8) + 0)(CTX),
	      ((key_table + (8) * 8) + 4)(CTX));

	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 0);

	/* load CD for output */
	vmovdqu 0 * 16(%rcx), %xmm8;
	vmovdqu 1 * 16(%rcx), %xmm9;
	vmovdqu 2 * 16(%rcx), %xmm10;
	vmovdqu 3 * 16(%rcx), %xmm11;
	vmovdqu 4 * 16(%rcx), %xmm12;
	vmovdqu 5 * 16(%rcx), %xmm13;
	vmovdqu 6 * 16(%rcx), %xmm14;
	vmovdqu 7 * 16(%rcx), %xmm15;

	outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		    %xmm15, (key_table)(CTX), (%rax), 1 * 16(%rax));

	FRAME_END
	ret;

.align 8
.Ldec_max32:
	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 24);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (24) * 8) + 8)(CTX),
	      ((key_table + (24) * 8) + 12)(CTX),
	      ((key_table + (24) * 8) + 0)(CTX),
	      ((key_table + (24) * 8) + 4)(CTX));

	jmp .Ldec_max24;
SYM_FUNC_END(__camellia_dec_blk16)

SYM_FUNC_START(camellia_ecb_enc_16way)
	/* input:
	 * %rdi: ctx, CTX
	 * %rsi: dst (16 blocks)
	 * %rdx: src (16 blocks)
	 */
	FRAME_BEGIN

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx, (key_table)(CTX));

	/* now dst can be used as a temporary buffer (even when src == dst) */
	movq %rsi, %rax;

	call __camellia_enc_blk16;

	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
		     %xmm8, %rsi);

	FRAME_END
	ret;
SYM_FUNC_END(camellia_ecb_enc_16way)

SYM_FUNC_START(camellia_ecb_dec_16way)
	/* input:
	 * %rdi: ctx, CTX
	 * %rsi: dst (16 blocks)
	 * %rdx: src (16 blocks)
	 */
	FRAME_BEGIN

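	/*
	 * Select the input whitening-key index without a branch:
	 * %r8d = (key_length == 16) ? 24 : 32.
	 */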
	cmpl $16, key_length(CTX);
	movl $32, %r8d;
	movl $24, %eax;
	cmovel %eax, %r8d; /* max */

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx, (key_table)(CTX, %r8, 8));

	/* now dst can be used as a temporary buffer (even when src == dst) */
	movq %rsi, %rax;

	call __camellia_dec_blk16;

	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
		     %xmm8, %rsi);

	FRAME_END
	ret;
SYM_FUNC_END(camellia_ecb_dec_16way)

SYM_FUNC_START(camellia_cbc_dec_16way)
	/* input:
	 * %rdi: ctx, CTX
	 * %rsi: dst (16 blocks)
	 * %rdx: src (16 blocks)
	 */
	FRAME_BEGIN

	cmpl $16, key_length(CTX);
	movl $32, %r8d;
	movl $24, %eax;
	cmovel %eax, %r8d; /* max */

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx, (key_table)(CTX, %r8, 8));

	/*
	 * dst might still be in use (when dst == src), so use the stack for
	 * temporary storage.
	 */
	subq $(16 * 16), %rsp;
	movq %rsp, %rax;

	call __camellia_dec_blk16;

	addq $(16 * 16), %rsp;

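	/*
	 * CBC chaining: xor each plaintext block with the previous
	 * ciphertext block, still intact at (%rdx).  The first block
	 * (%xmm7) is left untouched; its xor with the IV is left to the
	 * caller.
	 */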
	vpxor (0 * 16)(%rdx), %xmm6, %xmm6;
	vpxor (1 * 16)(%rdx), %xmm5, %xmm5;
	vpxor (2 * 16)(%rdx), %xmm4, %xmm4;
	vpxor (3 * 16)(%rdx), %xmm3, %xmm3;
	vpxor (4 * 16)(%rdx), %xmm2, %xmm2;
	vpxor (5 * 16)(%rdx), %xmm1, %xmm1;
	vpxor (6 * 16)(%rdx), %xmm0, %xmm0;
	vpxor (7 * 16)(%rdx), %xmm15, %xmm15;
	vpxor (8 * 16)(%rdx), %xmm14, %xmm14;
	vpxor (9 * 16)(%rdx), %xmm13, %xmm13;
	vpxor (10 * 16)(%rdx), %xmm12, %xmm12;
	vpxor (11 * 16)(%rdx), %xmm11, %xmm11;
	vpxor (12 * 16)(%rdx), %xmm10, %xmm10;
	vpxor (13 * 16)(%rdx), %xmm9, %xmm9;
	vpxor (14 * 16)(%rdx), %xmm8, %xmm8;
	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
		     %xmm8, %rsi);

	FRAME_END
	ret;
SYM_FUNC_END(camellia_cbc_dec_16way)

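/*
 * Branchless 128-bit little-endian increment.  With minus_one =
 * {low: -1, high: 0}: vpcmpeqq sets tmp.low to all-ones iff x.low == -1
 * (i.e. it is about to wrap); vpsubq (x - minus_one) adds 1 to the low
 * qword; vpslldq $8 moves the wrap mask up into the high-qword lane; the
 * final vpsubq subtracts -1 from (adds 1 to) the high qword exactly when
 * the low qword overflowed.  Example: {0xffffffffffffffff, 5} -> {0, 6}.
 */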
#define inc_le128(x, minus_one, tmp) \
	vpcmpeqq minus_one, x, tmp; \
	vpsubq minus_one, x, x; \
	vpslldq $8, tmp, tmp; \
	vpsubq tmp, x, x;

SYM_FUNC_START(camellia_ctr_16way)
	/* input:
	 * %rdi: ctx, CTX
	 * %rsi: dst (16 blocks)
	 * %rdx: src (16 blocks)
	 * %rcx: iv (little-endian, 128-bit)
	 */
	FRAME_BEGIN

	subq $(16 * 16), %rsp;
	movq %rsp, %rax;

	vmovdqa .Lbswap128_mask, %xmm14;

	/* load IV and byteswap */
	vmovdqu (%rcx), %xmm0;
	vpshufb %xmm14, %xmm0, %xmm15;
	vmovdqu %xmm15, 15 * 16(%rax);

	vpcmpeqd %xmm15, %xmm15, %xmm15;
	vpsrldq $8, %xmm15, %xmm15; /* low: -1, high: 0 */

	/* construct IVs */
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm13;
	vmovdqu %xmm13, 14 * 16(%rax);
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm13;
	vmovdqu %xmm13, 13 * 16(%rax);
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm12;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm11;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm10;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm9;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm8;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm7;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm6;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm5;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm4;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm3;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm2;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm1;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vmovdqa %xmm0, %xmm13;
	vpshufb %xmm14, %xmm0, %xmm0;
	inc_le128(%xmm13, %xmm15, %xmm14);
	vmovdqu %xmm13, (%rcx);
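	/*
	 * Three of the sixteen counters were staged on the stack because
	 * %xmm13..%xmm15 are still busy as scratch, the carry constant and
	 * the byte-swap mask; inpack16_pre below xors them in straight
	 * from 13..15 * 16(%rax).  The counter was bumped once more and
	 * stored back to (%rcx) as the IV for the next 16-block chunk.
	 */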

	/* inpack16_pre: */
	vmovq (key_table)(CTX), %xmm15;
	vpshufb .Lpack_bswap, %xmm15, %xmm15;
	vpxor %xmm0, %xmm15, %xmm0;
	vpxor %xmm1, %xmm15, %xmm1;
	vpxor %xmm2, %xmm15, %xmm2;
	vpxor %xmm3, %xmm15, %xmm3;
	vpxor %xmm4, %xmm15, %xmm4;
	vpxor %xmm5, %xmm15, %xmm5;
	vpxor %xmm6, %xmm15, %xmm6;
	vpxor %xmm7, %xmm15, %xmm7;
	vpxor %xmm8, %xmm15, %xmm8;
	vpxor %xmm9, %xmm15, %xmm9;
	vpxor %xmm10, %xmm15, %xmm10;
	vpxor %xmm11, %xmm15, %xmm11;
	vpxor %xmm12, %xmm15, %xmm12;
	vpxor 13 * 16(%rax), %xmm15, %xmm13;
	vpxor 14 * 16(%rax), %xmm15, %xmm14;
	vpxor 15 * 16(%rax), %xmm15, %xmm15;

	call __camellia_enc_blk16;

	addq $(16 * 16), %rsp;

	vpxor 0 * 16(%rdx), %xmm7, %xmm7;
	vpxor 1 * 16(%rdx), %xmm6, %xmm6;
	vpxor 2 * 16(%rdx), %xmm5, %xmm5;
	vpxor 3 * 16(%rdx), %xmm4, %xmm4;
	vpxor 4 * 16(%rdx), %xmm3, %xmm3;
	vpxor 5 * 16(%rdx), %xmm2, %xmm2;
	vpxor 6 * 16(%rdx), %xmm1, %xmm1;
	vpxor 7 * 16(%rdx), %xmm0, %xmm0;
	vpxor 8 * 16(%rdx), %xmm15, %xmm15;
	vpxor 9 * 16(%rdx), %xmm14, %xmm14;
	vpxor 10 * 16(%rdx), %xmm13, %xmm13;
	vpxor 11 * 16(%rdx), %xmm12, %xmm12;
	vpxor 12 * 16(%rdx), %xmm11, %xmm11;
	vpxor 13 * 16(%rdx), %xmm10, %xmm10;
	vpxor 14 * 16(%rdx), %xmm9, %xmm9;
	vpxor 15 * 16(%rdx), %xmm8, %xmm8;
	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
		     %xmm8, %rsi);

	FRAME_END
	ret;
SYM_FUNC_END(camellia_ctr_16way)

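/*
 * gf128mul_x_ble doubles the XTS tweak in GF(2^128), reduction polynomial
 * x¹²⁸ + x⁷ + x² + x + 1, treating the block as little-endian.
 * Equivalent C (sketch, hi/lo being the two 64-bit halves):
 *
 *	new_hi = (hi << 1) | (lo >> 63);
 *	new_lo = (lo << 1) ^ ((hi >> 63) * 0x87);
 *
 * Vectorized: vpsrad $31 replicates the top bit of each dword; vpaddq
 * shifts both qwords left by one; vpshufd $0x13 routes the bit-63 mask
 * to the high qword (cross-qword carry) and the bit-127 mask to the low
 * byte; vpand with .Lxts_gf128mul_and_shl1_mask (defined elsewhere in
 * this file) turns those masks into the constants 0x01 and 0x87, which
 * the final vpxor folds in.
 */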
#define gf128mul_x_ble(iv, mask, tmp) \
	vpsrad $31, iv, tmp; \
	vpaddq iv, iv, iv; \
	vpshufd $0x13, tmp, tmp; \
	vpand mask, tmp, tmp; \
	vpxor tmp, iv, iv;

.align 8
SYM_FUNC_START_LOCAL(camellia_xts_crypt_16way)
	/* input:
	 * %rdi: ctx, CTX
	 * %rsi: dst (16 blocks)
	 * %rdx: src (16 blocks)
	 * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 * %r8: index for input whitening key
	 * %r9: pointer to __camellia_enc_blk16 or __camellia_dec_blk16
	 */
	FRAME_BEGIN

	subq $(16 * 16), %rsp;
	movq %rsp, %rax;

	vmovdqa .Lxts_gf128mul_and_shl1_mask, %xmm14;

	/* load IV */
	vmovdqu (%rcx), %xmm0;
	vpxor 0 * 16(%rdx), %xmm0, %xmm15;
	vmovdqu %xmm15, 15 * 16(%rax);
	vmovdqu %xmm0, 0 * 16(%rsi);

	/* construct IVs */
	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 1 * 16(%rdx), %xmm0, %xmm15;
	vmovdqu %xmm15, 14 * 16(%rax);
	vmovdqu %xmm0, 1 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 2 * 16(%rdx), %xmm0, %xmm13;
	vmovdqu %xmm0, 2 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 3 * 16(%rdx), %xmm0, %xmm12;
	vmovdqu %xmm0, 3 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 4 * 16(%rdx), %xmm0, %xmm11;
	vmovdqu %xmm0, 4 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 5 * 16(%rdx), %xmm0, %xmm10;
	vmovdqu %xmm0, 5 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 6 * 16(%rdx), %xmm0, %xmm9;
	vmovdqu %xmm0, 6 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 7 * 16(%rdx), %xmm0, %xmm8;
	vmovdqu %xmm0, 7 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 8 * 16(%rdx), %xmm0, %xmm7;
	vmovdqu %xmm0, 8 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 9 * 16(%rdx), %xmm0, %xmm6;
	vmovdqu %xmm0, 9 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 10 * 16(%rdx), %xmm0, %xmm5;
	vmovdqu %xmm0, 10 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 11 * 16(%rdx), %xmm0, %xmm4;
	vmovdqu %xmm0, 11 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 12 * 16(%rdx), %xmm0, %xmm3;
	vmovdqu %xmm0, 12 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 13 * 16(%rdx), %xmm0, %xmm2;
	vmovdqu %xmm0, 13 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 14 * 16(%rdx), %xmm0, %xmm1;
	vmovdqu %xmm0, 14 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 15 * 16(%rdx), %xmm0, %xmm15;
	vmovdqu %xmm15, 0 * 16(%rax);
	vmovdqu %xmm0, 15 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vmovdqu %xmm0, (%rcx);
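	/*
	 * The per-block tweaks were parked in dst (%rsi): each src block
	 * is read and whitened before its dst slot is overwritten, so the
	 * scheme is safe even when src == dst.  They are xored back in
	 * after the cipher call below; the tweak for the next 16-block
	 * chunk has just been saved to (%rcx).
	 */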

	/* inpack16_pre: */
	vmovq (key_table)(CTX, %r8, 8), %xmm15;
	vpshufb .Lpack_bswap, %xmm15, %xmm15;
	vpxor 0 * 16(%rax), %xmm15, %xmm0;
	vpxor %xmm1, %xmm15, %xmm1;
	vpxor %xmm2, %xmm15, %xmm2;
	vpxor %xmm3, %xmm15, %xmm3;
	vpxor %xmm4, %xmm15, %xmm4;
	vpxor %xmm5, %xmm15, %xmm5;
	vpxor %xmm6, %xmm15, %xmm6;
	vpxor %xmm7, %xmm15, %xmm7;
	vpxor %xmm8, %xmm15, %xmm8;
	vpxor %xmm9, %xmm15, %xmm9;
	vpxor %xmm10, %xmm15, %xmm10;
	vpxor %xmm11, %xmm15, %xmm11;
	vpxor %xmm12, %xmm15, %xmm12;
	vpxor %xmm13, %xmm15, %xmm13;
	vpxor 14 * 16(%rax), %xmm15, %xmm14;
	vpxor 15 * 16(%rax), %xmm15, %xmm15;

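	/* retpoline-safe indirect call into the routine selected in %r9 */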
	CALL_NOSPEC r9;

	addq $(16 * 16), %rsp;

	vpxor 0 * 16(%rsi), %xmm7, %xmm7;
	vpxor 1 * 16(%rsi), %xmm6, %xmm6;
	vpxor 2 * 16(%rsi), %xmm5, %xmm5;
	vpxor 3 * 16(%rsi), %xmm4, %xmm4;
	vpxor 4 * 16(%rsi), %xmm3, %xmm3;
	vpxor 5 * 16(%rsi), %xmm2, %xmm2;
	vpxor 6 * 16(%rsi), %xmm1, %xmm1;
	vpxor 7 * 16(%rsi), %xmm0, %xmm0;
	vpxor 8 * 16(%rsi), %xmm15, %xmm15;
	vpxor 9 * 16(%rsi), %xmm14, %xmm14;
	vpxor 10 * 16(%rsi), %xmm13, %xmm13;
	vpxor 11 * 16(%rsi), %xmm12, %xmm12;
	vpxor 12 * 16(%rsi), %xmm11, %xmm11;
	vpxor 13 * 16(%rsi), %xmm10, %xmm10;
	vpxor 14 * 16(%rsi), %xmm9, %xmm9;
	vpxor 15 * 16(%rsi), %xmm8, %xmm8;
	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
		     %xmm8, %rsi);

	FRAME_END
	ret;
SYM_FUNC_END(camellia_xts_crypt_16way)

SYM_FUNC_START(camellia_xts_enc_16way)
	/* input:
	 * %rdi: ctx, CTX
	 * %rsi: dst (16 blocks)
	 * %rdx: src (16 blocks)
	 * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */
	xorl %r8d, %r8d; /* input whitening key, 0 for enc */

	leaq __camellia_enc_blk16, %r9;

	jmp camellia_xts_crypt_16way;
SYM_FUNC_END(camellia_xts_enc_16way)

SYM_FUNC_START(camellia_xts_dec_16way)
	/* input:
	 * %rdi: ctx, CTX
	 * %rsi: dst (16 blocks)
	 * %rdx: src (16 blocks)
	 * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */

	cmpl $16, key_length(CTX);
	movl $32, %r8d;
	movl $24, %eax;
	cmovel %eax, %r8d; /* input whitening key, last for dec */

	leaq __camellia_dec_blk16, %r9;

	jmp camellia_xts_crypt_16way;
SYM_FUNC_END(camellia_xts_dec_16way)