/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Shared glue code for 128-bit block ciphers, AVX assembler macros
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 */

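/*
 * load_8way(): load eight consecutive 16-byte blocks from src into the
 * xmm registers x0..x7 (unaligned loads).
 */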
#define load_8way(src, x0, x1, x2, x3, x4, x5, x6, x7) \
	vmovdqu (0*16)(src), x0; \
	vmovdqu (1*16)(src), x1; \
	vmovdqu (2*16)(src), x2; \
	vmovdqu (3*16)(src), x3; \
	vmovdqu (4*16)(src), x4; \
	vmovdqu (5*16)(src), x5; \
	vmovdqu (6*16)(src), x6; \
	vmovdqu (7*16)(src), x7;

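/*
 * store_8way(): store x0..x7 to eight consecutive 16-byte blocks at dst
 * (unaligned stores).
 */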
#define store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
	vmovdqu x0, (0*16)(dst); \
	vmovdqu x1, (1*16)(dst); \
	vmovdqu x2, (2*16)(dst); \
	vmovdqu x3, (3*16)(dst); \
	vmovdqu x4, (4*16)(dst); \
	vmovdqu x5, (5*16)(dst); \
	vmovdqu x6, (6*16)(dst); \
	vmovdqu x7, (7*16)(dst);

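/*
 * store_cbc_8way(): CBC decryption output.  XOR each decrypted block
 * x1..x7 with the preceding ciphertext block read from src, then store
 * all eight blocks to dst.  x0 is stored as-is; XORing it with the IV
 * (or the last ciphertext block of the previous chunk) is left to the
 * caller, since that chaining value is not part of this src range.
 */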
#define store_cbc_8way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \
	vpxor (0*16)(src), x1, x1; \
	vpxor (1*16)(src), x2, x2; \
	vpxor (2*16)(src), x3, x3; \
	vpxor (3*16)(src), x4, x4; \
	vpxor (4*16)(src), x5, x5; \
	vpxor (5*16)(src), x6, x6; \
	vpxor (6*16)(src), x7, x7; \
	store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);

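/*
 * inc_le128(): increment the 128-bit little-endian value in x by one.
 * minus_one must hold {-1, 0} (all-ones low qword, zero high qword): the
 * vpsubq of minus_one adds one to the low qword, and the comparison
 * result, shifted up into the high qword, subtracts -1 from the high
 * qword exactly when the low qword was about to wrap, i.e. it propagates
 * the carry.  tmp is clobbered.
 */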
#define inc_le128(x, minus_one, tmp) \
	vpcmpeqq minus_one, x, tmp; \
	vpsubq minus_one, x, x; \
	vpslldq $8, tmp, tmp; \
	vpsubq tmp, x, x;

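/*
 * load_ctr_8way(): expand the 128-bit counter at (iv) into eight counter
 * blocks x0..x7 and write the counter advanced by eight back to (iv).
 * The counter is kept little-endian in memory (as arranged by the glue
 * code) so it can be incremented with inc_le128(); each value is then
 * byte-swapped with the bswap shuffle mask into the block actually fed
 * to the cipher.  t0..t2 are clobbered as temporaries.
 */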
#define load_ctr_8way(iv, bswap, x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2) \
	vpcmpeqd t0, t0, t0; \
	vpsrldq $8, t0, t0; /* low: -1, high: 0 */ \
	vmovdqa bswap, t1; \
	\
	/* load IV and byteswap */ \
	vmovdqu (iv), x7; \
	vpshufb t1, x7, x0; \
	\
	/* construct IVs */ \
	inc_le128(x7, t0, t2); \
	vpshufb t1, x7, x1; \
	inc_le128(x7, t0, t2); \
	vpshufb t1, x7, x2; \
	inc_le128(x7, t0, t2); \
	vpshufb t1, x7, x3; \
	inc_le128(x7, t0, t2); \
	vpshufb t1, x7, x4; \
	inc_le128(x7, t0, t2); \
	vpshufb t1, x7, x5; \
	inc_le128(x7, t0, t2); \
	vpshufb t1, x7, x6; \
	inc_le128(x7, t0, t2); \
	vmovdqa x7, t2; \
	vpshufb t1, x7, x7; \
	inc_le128(t2, t0, t1); \
	vmovdqu t2, (iv);

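/*
 * store_ctr_8way(): XOR the eight encrypted counter blocks x0..x7 with
 * the corresponding source blocks at src and store the result to dst.
 */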
#define store_ctr_8way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \
	vpxor (0*16)(src), x0, x0; \
	vpxor (1*16)(src), x1, x1; \
	vpxor (2*16)(src), x2, x2; \
	vpxor (3*16)(src), x3, x3; \
	vpxor (4*16)(src), x4, x4; \
	vpxor (5*16)(src), x5, x5; \
	vpxor (6*16)(src), x6, x6; \
	vpxor (7*16)(src), x7, x7; \
	store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);

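/*
 * gf128mul_x_ble(): multiply the 128-bit XTS tweak in iv by x in
 * GF(2^128) ("ble" tweak convention): shift the value left by one bit
 * and, if the top bit was set, fold it back in with the reduction
 * constant selected by mask.  Because the shift is done as two
 * independent 64-bit additions, mask also supplies the bit that carries
 * bit 63 into bit 64.  tmp is clobbered.
 */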
#define gf128mul_x_ble(iv, mask, tmp) \
	vpsrad $31, iv, tmp; \
	vpaddq iv, iv, iv; \
	vpshufd $0x13, tmp, tmp; \
	vpand mask, tmp, tmp; \
	vpxor tmp, iv, iv;

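/*
 * load_xts_8way(): XTS pre-processing for eight blocks.  Load the tweak
 * from (iv), XOR each source block with its tweak into x0..x7, and stash
 * the tweaks at dst so store_xts_8way() can apply them again after the
 * block cipher has run.  The tweak for the next chunk is written back to
 * (iv).  xts_gf128mul_and_shl1_mask is the constant handed to
 * gf128mul_x_ble() for the reduction and cross-qword carry.
 */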
#define load_xts_8way(iv, src, dst, x0, x1, x2, x3, x4, x5, x6, x7, tiv, t0, \
		      t1, xts_gf128mul_and_shl1_mask) \
	vmovdqa xts_gf128mul_and_shl1_mask, t0; \
	\
	/* load IV */ \
	vmovdqu (iv), tiv; \
	vpxor (0*16)(src), tiv, x0; \
	vmovdqu tiv, (0*16)(dst); \
	\
	/* construct and store IVs, also xor with source */ \
	gf128mul_x_ble(tiv, t0, t1); \
	vpxor (1*16)(src), tiv, x1; \
	vmovdqu tiv, (1*16)(dst); \
	\
	gf128mul_x_ble(tiv, t0, t1); \
	vpxor (2*16)(src), tiv, x2; \
	vmovdqu tiv, (2*16)(dst); \
	\
	gf128mul_x_ble(tiv, t0, t1); \
	vpxor (3*16)(src), tiv, x3; \
	vmovdqu tiv, (3*16)(dst); \
	\
	gf128mul_x_ble(tiv, t0, t1); \
	vpxor (4*16)(src), tiv, x4; \
	vmovdqu tiv, (4*16)(dst); \
	\
	gf128mul_x_ble(tiv, t0, t1); \
	vpxor (5*16)(src), tiv, x5; \
	vmovdqu tiv, (5*16)(dst); \
	\
	gf128mul_x_ble(tiv, t0, t1); \
	vpxor (6*16)(src), tiv, x6; \
	vmovdqu tiv, (6*16)(dst); \
	\
	gf128mul_x_ble(tiv, t0, t1); \
	vpxor (7*16)(src), tiv, x7; \
	vmovdqu tiv, (7*16)(dst); \
	\
	gf128mul_x_ble(tiv, t0, t1); \
	vmovdqu tiv, (iv);

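/*
 * store_xts_8way(): XTS post-processing.  XOR the cipher output x0..x7
 * with the tweaks that load_xts_8way() stored at dst, then overwrite dst
 * with the final blocks.
 */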
#define store_xts_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
	vpxor (0*16)(dst), x0, x0; \
	vpxor (1*16)(dst), x1, x1; \
	vpxor (2*16)(dst), x2, x2; \
	vpxor (3*16)(dst), x3, x3; \
	vpxor (4*16)(dst), x4, x4; \
	vpxor (5*16)(dst), x5, x5; \
	vpxor (6*16)(dst), x6, x6; \
	vpxor (7*16)(dst), x7, x7; \
	store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
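
/*
 * Usage sketch (illustrative only, not part of this file): a cipher
 * implementation would typically wrap these macros around its own 8-way
 * core, e.g. for ECB encryption:
 *
 *	SYM_FUNC_START(cipher_ecb_enc_8way_avx)
 *		%rdi: ctx, %rsi: dst, %rdx: src
 *		load_8way(%rdx, X0, X1, X2, X3, X4, X5, X6, X7);
 *		call __cipher_enc_blk8;
 *		store_8way(%rsi, X0, X1, X2, X3, X4, X5, X6, X7);
 *		RET
 *	SYM_FUNC_END(cipher_ecb_enc_8way_avx)
 *
 * where __cipher_enc_blk8 and the X0..X7 register aliases are
 * hypothetical names; the real ones depend on the cipher's own register
 * allocation.
 */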