^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1) /* SPDX-License-Identifier: GPL-2.0-or-later */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3) * Serpent Cipher 4-way parallel algorithm (i586/SSE2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5) * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7) * Based on crypto/serpent.c by
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8) * Copyright (C) 2002 Dag Arne Osvik <osvik@ii.uib.no>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9) * 2003 Herbert Valerio Riedel <hvr@gnu.org>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 10) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 11)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 12) #include <linux/linkage.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 13)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 14) .file "serpent-sse2-i586-asm_32.S"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 15) .text
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 16)
/*
 * Byte offsets of the stack-passed arguments, relative to %esp at function
 * entry (they are read as arg_xxx(%esp) before the stack pointer is touched).
 * Offset 0 is presumably the return address -- confirm against the calling
 * convention used by the C prototypes.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 17) #define arg_ctx 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 18) #define arg_dst 8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 19) #define arg_src 12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 20) #define arg_xor 16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 21)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 22) /**********************************************************************
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 23) 4-way SSE2 serpent
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 24) **********************************************************************/
/*
 * General-purpose register holding the ctx argument; get_key() addresses the
 * 32-bit key words relative to it.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 25) #define CTX %edx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 26)
/*
 * Five working vector registers. At any point four of them carry cipher state
 * and the fifth is scratch; which register plays which role changes from
 * macro invocation to macro invocation (see the argument order in the round
 * sequences).
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 27) #define RA %xmm0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 28) #define RB %xmm1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 29) #define RC %xmm2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 30) #define RD %xmm3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 31) #define RE %xmm4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 32)
/*
 * Fixed temporaries: used for broadcast key words in K()/LK() and handed to
 * the block load/store helpers as scratch registers.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 33) #define RT0 %xmm5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 34) #define RT1 %xmm6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 35)
/*
 * Constant register: set to all ones with "pcmpeqd RNOT, RNOT" at function
 * entry, so that "pxor RNOT, x" acts as a bitwise NOT of x in the S-box
 * macros. None of the macros in view writes to it.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 36) #define RNOT %xmm7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 37)
/*
 * get_key(i, j, t): load one 32-bit key word and replicate it.
 *
 *   i, j - the word is read from byte offset (4*i + j)*4 off CTX, i.e. word j
 *          of the i-th group of four 32-bit words
 *   t    - destination xmm register (overwritten)
 *
 * movd places the word in the low dword of t; pshufd $0 then copies that
 * dword into all four dword lanes.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 38) #define get_key(i, j, t) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 39) movd (4*(i)+(j))*4(CTX), t; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 40) pshufd $0, t, t;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 41)
/*
 * K(x0, x1, x2, x3, x4, i): key mixing only.
 *
 * XORs the four state registers with the broadcast key words of group i:
 *   x0 ^= key[4*i + 0], x1 ^= key[4*i + 1],
 *   x2 ^= key[4*i + 2], x3 ^= key[4*i + 3]
 *
 * Clobbers x4, RT0 and RT1 (they hold the broadcast key words).
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 42) #define K(x0, x1, x2, x3, x4, i) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 43) get_key(i, 0, x4); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 44) get_key(i, 1, RT0); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 45) get_key(i, 2, RT1); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 46) pxor x4, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 47) pxor RT0, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 48) pxor RT1, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 49) get_key(i, 3, x4); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 50) pxor x4, x3;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 51)
/*
 * LK(x0, x1, x2, x3, x4, i): linear mixing of the four state words followed
 * by key mixing with key group i.
 *
 * Per 32-bit lane, in this order (rol = rotate left):
 *   x0 = rol(x0, 13)
 *   x2 = rol(x2, 3)
 *   x1 = rol(x1 ^ x0 ^ x2, 1)
 *   x3 = rol(x3 ^ x2 ^ (x0 << 3), 7)
 *   x0 = rol(x0 ^ x1 ^ x3, 5)
 *   x2 = rol(x2 ^ x3 ^ (x1 << 7), 22)
 * and then x0..x3 ^= key[4*i + 0..3]. The key XORs for x1 and x3 are issued
 * before the final rotations of x0 and x2, which is equivalent because those
 * rotations do not touch x1/x3.
 *
 * Each rotation is built from a left shift, a right shift of a copy and a
 * por, with x4 holding the copy.
 *
 * Clobbers x4 and RT0.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 52) #define LK(x0, x1, x2, x3, x4, i) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 53) movdqa x0, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 54) pslld $13, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 55) psrld $(32 - 13), x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 56) por x4, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 57) pxor x0, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 58) movdqa x2, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 59) pslld $3, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 60) psrld $(32 - 3), x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 61) por x4, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 62) pxor x2, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 63) movdqa x1, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 64) pslld $1, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 65) psrld $(32 - 1), x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 66) por x4, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 67) movdqa x0, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 68) pslld $3, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 69) pxor x2, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 70) pxor x4, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 71) movdqa x3, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 72) pslld $7, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 73) psrld $(32 - 7), x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 74) por x4, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 75) movdqa x1, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 76) pslld $7, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 77) pxor x1, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 78) pxor x3, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 79) pxor x3, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 80) pxor x4, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 81) movdqa x0, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 82) get_key(i, 1, RT0); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 83) pxor RT0, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 84) get_key(i, 3, RT0); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 85) pxor RT0, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 86) pslld $5, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 87) psrld $(32 - 5), x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 88) por x4, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 89) movdqa x2, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 90) pslld $22, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 91) psrld $(32 - 22), x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 92) por x4, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 93) get_key(i, 0, RT0); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 94) pxor RT0, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 95) get_key(i, 2, RT0); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 96) pxor RT0, x2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 97)
/*
 * KL(x0, x1, x2, x3, x4, i): key mixing with key group i (via K) followed by
 * the linear mixing of LK run backwards.
 *
 * After the key XOR, per 32-bit lane (ror = rotate right):
 *   x0 = ror(x0, 5);   x2 = ror(x2, 22)
 *   x2 ^= x3 ^ (x1 << 7)
 *   x0 ^= x3 ^ x1
 *   x1 = ror(x1, 1);   x3 = ror(x3, 7)
 *   x1 ^= x0;          x3 ^= x0 << 3     (x0 still un-rotated here)
 *   x0 = ror(x0, 13)
 *   x1 ^= x2;          x3 ^= x2          (x2 still un-rotated here)
 *   x2 = ror(x2, 3)
 *
 * Rotations are built from a shift pair and a por, using x4 for the copy.
 *
 * Clobbers x4, RT0 and RT1 (the latter two through K).
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 98) #define KL(x0, x1, x2, x3, x4, i) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 99) K(x0, x1, x2, x3, x4, i); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 100) movdqa x0, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 101) psrld $5, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 102) pslld $(32 - 5), x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 103) por x4, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 104) movdqa x2, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 105) psrld $22, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 106) pslld $(32 - 22), x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 107) por x4, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 108) pxor x3, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 109) pxor x3, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 110) movdqa x1, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 111) pslld $7, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 112) pxor x1, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 113) pxor x4, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 114) movdqa x1, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 115) psrld $1, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 116) pslld $(32 - 1), x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 117) por x4, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 118) movdqa x3, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 119) psrld $7, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 120) pslld $(32 - 7), x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 121) por x4, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 122) pxor x0, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 123) movdqa x0, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 124) pslld $3, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 125) pxor x4, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 126) movdqa x0, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 127) psrld $13, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 128) pslld $(32 - 13), x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 129) por x4, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 130) pxor x2, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 131) pxor x2, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 132) movdqa x2, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 133) psrld $3, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 134) pslld $(32 - 3), x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 135) por x4, x2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 136)
/*
 * S0(x0, x1, x2, x3, x4): S-box 0, evaluated as a fixed sequence of bitwise
 * register operations (movdqa/pand/por/pxor), so every bit position of the
 * 128-bit registers is processed independently.
 *
 * x0..x3 are the input words; x4 is overwritten before it is read. All five
 * registers are modified. "pxor RNOT, x" complements x.
 *
 * The encryption routine follows S0(a, b, c, d, e) with LK(c, b, d, a, e),
 * i.e. it continues with (x2, x1, x3, x0) as the state and x4 as scratch.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 137) #define S0(x0, x1, x2, x3, x4) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 138) movdqa x3, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 139) por x0, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 140) pxor x4, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 141) pxor x2, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 142) pxor RNOT, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 143) pxor x1, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 144) pand x0, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 145) pxor x4, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 146) pxor x0, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 147) pxor x3, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 148) por x0, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 149) pxor x2, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 150) pand x1, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 151) pxor x2, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 152) pxor RNOT, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 153) pxor x4, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 154) pxor x2, x1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 155)
/*
 * S1(x0, x1, x2, x3, x4): S-box 1 as a fixed sequence of bitwise register
 * operations; every bit position is processed independently.
 *
 * x0..x3 are the input words; x4 is overwritten before it is read. All five
 * registers are modified. "pxor RNOT, x" complements x.
 *
 * The encryption routine follows S1(a, b, c, d, e) with LK(e, c, d, a, b),
 * i.e. it continues with (x4, x2, x3, x0) as the state and x1 as scratch.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 156) #define S1(x0, x1, x2, x3, x4) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 157) movdqa x1, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 158) pxor x0, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 159) pxor x3, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 160) pxor RNOT, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 161) pand x1, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 162) por x1, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 163) pxor x2, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 164) pxor x3, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 165) pxor x3, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 166) pxor x4, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 167) por x4, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 168) pxor x2, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 169) pand x0, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 170) pxor x1, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 171) por x0, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 172) pxor RNOT, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 173) pxor x2, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 174) pxor x1, x4;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 175)
/*
 * S2(x0, x1, x2, x3, x4): S-box 2 as a fixed sequence of bitwise register
 * operations; every bit position is processed independently.
 *
 * x0..x3 are the input words; x4 is overwritten before it is read. All five
 * registers are modified. "pxor RNOT, x" complements x.
 *
 * The encryption routine follows S2(a, b, c, d, e) with LK(e, b, a, d, c),
 * i.e. it continues with (x4, x1, x0, x3) as the state and x2 as scratch.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 176) #define S2(x0, x1, x2, x3, x4) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 177) pxor RNOT, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 178) pxor x0, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 179) movdqa x0, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 180) pand x2, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 181) pxor x3, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 182) por x4, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 183) pxor x1, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 184) pxor x1, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 185) pand x0, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 186) pxor x2, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 187) pand x3, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 188) por x1, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 189) pxor RNOT, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 190) pxor x0, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 191) pxor x0, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 192) pxor x2, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 193) por x2, x1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 194)
/*
 * S3(x0, x1, x2, x3, x4): S-box 3 as a fixed sequence of bitwise register
 * operations; every bit position is processed independently.
 *
 * x0..x3 are the input words; x4 is overwritten before it is read. All five
 * registers are modified. RNOT is not used by this S-box.
 *
 * The encryption routine follows S3(a, b, c, d, e) with LK(d, e, b, a, c),
 * i.e. it continues with (x3, x4, x1, x0) as the state and x2 as scratch.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 195) #define S3(x0, x1, x2, x3, x4) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 196) movdqa x1, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 197) pxor x3, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 198) por x0, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 199) pand x0, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 200) pxor x2, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 201) pxor x1, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 202) pand x3, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 203) pxor x3, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 204) por x4, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 205) pxor x3, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 206) pxor x0, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 207) pand x3, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 208) pand x4, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 209) pxor x2, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 210) por x1, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 211) pand x1, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 212) pxor x3, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 213) pxor x3, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 214) pxor x2, x3;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 215)
/*
 * S4(x0, x1, x2, x3, x4): S-box 4 as a fixed sequence of bitwise register
 * operations; every bit position is processed independently.
 *
 * x0..x3 are the input words; x4 is overwritten before it is read. All five
 * registers are modified. "pxor RNOT, x" complements x.
 *
 * The encryption routine follows S4(a, b, c, d, e) with LK(b, c, d, e, a),
 * i.e. it continues with (x1, x2, x3, x4) as the state and x0 as scratch.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 216) #define S4(x0, x1, x2, x3, x4) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 217) movdqa x3, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 218) pand x0, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 219) pxor x4, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 220) pxor x2, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 221) por x4, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 222) pxor x1, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 223) pxor x3, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 224) por x0, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 225) pxor x1, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 226) pand x0, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 227) pxor x4, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 228) pand x2, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 229) pxor x3, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 230) pxor x0, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 231) por x1, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 232) pxor RNOT, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 233) pxor x0, x3;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 234)
/*
 * S5(x0, x1, x2, x3, x4): S-box 5 as a fixed sequence of bitwise register
 * operations; every bit position is processed independently.
 *
 * x0..x3 are the input words; x4 is overwritten before it is read. All five
 * registers are modified. "pxor RNOT, x" complements x.
 *
 * The encryption routine follows S5(a, b, c, d, e) with LK(e, a, b, d, c),
 * i.e. it continues with (x4, x0, x1, x3) as the state and x2 as scratch.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 235) #define S5(x0, x1, x2, x3, x4) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 236) movdqa x1, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 237) por x0, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 238) pxor x1, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 239) pxor RNOT, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 240) pxor x0, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 241) pxor x2, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 242) pand x4, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 243) por x3, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 244) pxor x0, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 245) pand x3, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 246) pxor x3, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 247) pxor x2, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 248) pxor x1, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 249) pand x4, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 250) pxor x2, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 251) pand x0, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 252) pxor x2, x3;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 253)
/*
 * S6(x0, x1, x2, x3, x4): S-box 6 as a fixed sequence of bitwise register
 * operations; every bit position is processed independently.
 *
 * x0..x3 are the input words; x4 is overwritten before it is read. All five
 * registers are modified. "pxor RNOT, x" complements x.
 *
 * The encryption routine follows S6(a, b, c, d, e) with LK(c, e, b, d, a),
 * i.e. it continues with (x2, x4, x1, x3) as the state and x0 as scratch.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 254) #define S6(x0, x1, x2, x3, x4) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 255) movdqa x1, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 256) pxor x0, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 257) pxor x2, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 258) pxor x0, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 259) pand x3, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 260) por x3, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 261) pxor RNOT, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 262) pxor x1, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 263) pxor x2, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 264) pxor x4, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 265) pxor x0, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 266) pand x0, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 267) pxor x1, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 268) pxor x3, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 269) pand x1, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 270) pxor x0, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 271) pxor x2, x1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 272)
/*
 * S7(x0, x1, x2, x3, x4): S-box 7 as a fixed sequence of bitwise register
 * operations; every bit position is processed independently.
 *
 * x0..x3 are the input words; x4 is overwritten before it is read. All five
 * registers are modified. "pxor RNOT, x" complements x.
 *
 * The encryption routine follows S7(a, b, c, d, e) with LK(e, c, d, a, b),
 * i.e. it continues with (x4, x2, x3, x0) as the state and x1 as scratch.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 273) #define S7(x0, x1, x2, x3, x4) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 274) pxor RNOT, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 275) movdqa x1, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 276) pxor RNOT, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 277) pand x2, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 278) pxor x3, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 279) por x4, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 280) pxor x2, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 281) pxor x3, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 282) pxor x0, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 283) por x1, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 284) pand x0, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 285) pxor x4, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 286) pxor x3, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 287) pand x0, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 288) pxor x1, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 289) pxor x4, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 290) pxor x1, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 291) por x0, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 292) pxor x1, x4;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 293)
/*
 * SI0(x0, x1, x2, x3, x4): presumably the inverse of S-box 0 (going by the
 * name; no user is visible in this part of the file), written as a fixed
 * sequence of bitwise register operations so that every bit position is
 * processed independently.
 *
 * x0..x3 are the input words; x4 is overwritten before it is read. All five
 * registers are modified. "pxor RNOT, x" complements x.
 *
 * NOTE(review): which four registers carry the result afterwards has to be
 * taken from the argument order at the call sites.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 294) #define SI0(x0, x1, x2, x3, x4) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 295) movdqa x3, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 296) pxor x0, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 297) por x1, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 298) pxor x1, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 299) pxor RNOT, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 300) pxor x3, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 301) pxor x0, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 302) pand x1, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 303) pxor x2, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 304) pand x3, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 305) pxor x4, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 306) pxor x3, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 307) pxor x3, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 308) pand x0, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 309) pxor x0, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 310) pxor x2, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 311) pxor x3, x4;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 312)
/*
 * SI1(x0, x1, x2, x3, x4): presumably the inverse of S-box 1 (going by the
 * name; no user is visible in this part of the file), written as a fixed
 * sequence of bitwise register operations so that every bit position is
 * processed independently.
 *
 * x0..x3 are the input words; x4 is overwritten before it is read. All five
 * registers are modified. "pxor RNOT, x" complements x.
 *
 * NOTE(review): which four registers carry the result afterwards has to be
 * taken from the argument order at the call sites.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 313) #define SI1(x0, x1, x2, x3, x4) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 314) pxor x3, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 315) movdqa x0, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 316) pxor x2, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 317) pxor RNOT, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 318) por x1, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 319) pxor x3, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 320) pand x1, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 321) pxor x2, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 322) pand x4, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 323) pxor x1, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 324) por x3, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 325) pxor x0, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 326) pxor x0, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 327) por x4, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 328) pxor x4, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 329) pxor x0, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 330) pxor x1, x4;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 331)
/*
 * SI2(x0, x1, x2, x3, x4): presumably the inverse of S-box 2 (going by the
 * name; no user is visible in this part of the file), written as a fixed
 * sequence of bitwise register operations so that every bit position is
 * processed independently.
 *
 * x0..x3 are the input words; x4 is overwritten before it is read. All five
 * registers are modified. "pxor RNOT, x" complements x.
 *
 * NOTE(review): which four registers carry the result afterwards has to be
 * taken from the argument order at the call sites.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 332) #define SI2(x0, x1, x2, x3, x4) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 333) pxor x1, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 334) movdqa x3, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 335) pxor RNOT, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 336) por x2, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 337) pxor x4, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 338) pxor x0, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 339) pxor x1, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 340) por x2, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 341) pxor x0, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 342) pxor x4, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 343) por x3, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 344) pxor x3, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 345) pxor x2, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 346) pand x1, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 347) pxor x3, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 348) pxor x4, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 349) pxor x0, x4;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 350)
/*
 * SI3(x0, x1, x2, x3, x4): presumably the inverse of S-box 3 (going by the
 * name; no user is visible in this part of the file), written as a fixed
 * sequence of bitwise register operations so that every bit position is
 * processed independently.
 *
 * x0..x3 are the input words; x4 is overwritten before it is read. All five
 * registers are modified. RNOT is not used by this macro.
 *
 * NOTE(review): which four registers carry the result afterwards has to be
 * taken from the argument order at the call sites.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 351) #define SI3(x0, x1, x2, x3, x4) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 352) pxor x1, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 353) movdqa x1, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 354) pand x2, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 355) pxor x0, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 356) por x4, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 357) pxor x3, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 358) pxor x3, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 359) por x1, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 360) pxor x2, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 361) pxor x3, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 362) pxor x2, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 363) pxor x3, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 364) pand x1, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 365) pxor x0, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 366) pand x2, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 367) pxor x3, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 368) pxor x0, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 369) pxor x1, x0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 370)
/*
 * SI4(x0, x1, x2, x3, x4): presumably the inverse of S-box 4 (going by the
 * name; no user is visible in this part of the file), written as a fixed
 * sequence of bitwise register operations so that every bit position is
 * processed independently.
 *
 * x0..x3 are the input words; x4 is overwritten before it is read. All five
 * registers are modified. "pxor RNOT, x" complements x.
 *
 * NOTE(review): which four registers carry the result afterwards has to be
 * taken from the argument order at the call sites.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 371) #define SI4(x0, x1, x2, x3, x4) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 372) pxor x3, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 373) movdqa x0, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 374) pand x1, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 375) pxor x2, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 376) por x3, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 377) pxor RNOT, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 378) pxor x0, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 379) pxor x2, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 380) pand x4, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 381) pxor x0, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 382) por x4, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 383) pxor x3, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 384) pand x2, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 385) pxor x3, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 386) pxor x1, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 387) pand x0, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 388) pxor x1, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 389) pxor x3, x0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 390)
/*
 * SI5(x0, x1, x2, x3, x4): presumably the inverse of S-box 5 (going by the
 * name; no user is visible in this part of the file), written as a fixed
 * sequence of bitwise register operations so that every bit position is
 * processed independently.
 *
 * x0..x3 are the input words; x4 is overwritten before it is read. All five
 * registers are modified. "pxor RNOT, x" complements x.
 *
 * NOTE(review): which four registers carry the result afterwards has to be
 * taken from the argument order at the call sites.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 391) #define SI5(x0, x1, x2, x3, x4) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 392) movdqa x1, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 393) por x2, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 394) pxor x4, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 395) pxor x3, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 396) pand x4, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 397) pxor x3, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 398) por x0, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 399) pxor RNOT, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 400) pxor x2, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 401) por x0, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 402) pxor x1, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 403) pxor x4, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 404) pand x0, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 405) pxor x1, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 406) pxor x3, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 407) pand x2, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 408) pxor x3, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 409) pxor x2, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 410) pxor x4, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 411) pxor x3, x4;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 412)
/*
 * SI6(x0, x1, x2, x3, x4): presumably the inverse of S-box 6 (going by the
 * name; no user is visible in this part of the file), written as a fixed
 * sequence of bitwise register operations so that every bit position is
 * processed independently.
 *
 * x0..x3 are the input words; x4 is overwritten before it is read. All five
 * registers are modified. "pxor RNOT, x" complements x.
 *
 * NOTE(review): which four registers carry the result afterwards has to be
 * taken from the argument order at the call sites.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 413) #define SI6(x0, x1, x2, x3, x4) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 414) pxor x2, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 415) movdqa x0, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 416) pand x3, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 417) pxor x3, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 418) pxor x2, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 419) pxor x1, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 420) por x4, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 421) pxor x3, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 422) pand x0, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 423) pxor RNOT, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 424) pxor x1, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 425) pand x2, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 426) pxor x0, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 427) pxor x4, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 428) pxor x2, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 429) pxor x1, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 430) pxor x0, x2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 431)
/*
 * SI7(x0, x1, x2, x3, x4): presumably the inverse of S-box 7 (going by the
 * name; no user is visible in this part of the file), written as a fixed
 * sequence of bitwise register operations so that every bit position is
 * processed independently.
 *
 * x0..x3 are the input words; x4 is overwritten before it is read. All five
 * registers are modified. "pxor RNOT, x" complements x.
 *
 * NOTE(review): which four registers carry the result afterwards has to be
 * taken from the argument order at the call sites.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 432) #define SI7(x0, x1, x2, x3, x4) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 433) movdqa x3, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 434) pand x0, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 435) pxor x2, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 436) por x4, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 437) pxor x1, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 438) pxor RNOT, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 439) por x3, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 440) pxor x0, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 441) pand x2, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 442) pxor x1, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 443) pand x2, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 444) pxor x2, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 445) pxor x3, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 446) pand x3, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 447) por x0, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 448) pxor x4, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 449) pxor x4, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 450) pand x0, x4; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 451) pxor x2, x4;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 452)
/*
 * transpose_4x4(x0, x1, x2, x3, t0, t1, t2): in-place transpose of the 4x4
 * matrix of 32-bit words whose rows are x0..x3.
 *
 * With x0 = [a0 a1 a2 a3], x1 = [b0 ..], x2 = [c0 ..], x3 = [d0 ..] on entry,
 * the result is x0 = [a0 b0 c0 d0], x1 = [a1 b1 c1 d1], x2 = [a2 b2 c2 d2],
 * x3 = [a3 b3 c3 d3].
 *
 * Step 1 interleaves dwords of row pairs (punpckl/hdq); step 2 combines the
 * 64-bit halves (punpckl/hqdq).
 *
 * Clobbers t1 and t2. t0 is accepted but not used by this macro.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 453) #define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 454) movdqa x0, t2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 455) punpckldq x1, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 456) punpckhdq x1, t2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 457) movdqa x2, t1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 458) punpckhdq x3, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 459) punpckldq x3, t1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 460) movdqa x0, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 461) punpcklqdq t1, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 462) punpckhqdq t1, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 463) movdqa t2, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 464) punpcklqdq x2, t2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 465) punpckhqdq x2, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 466) movdqa t2, x2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 467)
/*
 * read_blocks(in, x0, x1, x2, x3, t0, t1, t2): load four consecutive 16-byte
 * blocks from the address in register `in` and transpose them.
 *
 * The loads use movdqu, so `in` does not need to be 16-byte aligned. After
 * the transpose, x<k> holds 32-bit word k of each of the four blocks (one
 * block per dword lane).
 *
 * Clobbers t1 and t2 (through transpose_4x4); t0 is only passed along.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 468) #define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 469) movdqu (0*4*4)(in), x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 470) movdqu (1*4*4)(in), x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 471) movdqu (2*4*4)(in), x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 472) movdqu (3*4*4)(in), x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 473) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 474) transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 475)
/*
 * write_blocks(out, x0, x1, x2, x3, t0, t1, t2): transpose x0..x3 back into
 * one-block-per-register layout and store the four 16-byte blocks to the
 * address in register `out`.
 *
 * The stores use movdqu, so `out` does not need to be 16-byte aligned.
 * x0..x3 are left holding the transposed (stored) values.
 *
 * Clobbers t1 and t2 (through transpose_4x4); t0 is only passed along.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 476) #define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 477) transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 478) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 479) movdqu x0, (0*4*4)(out); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 480) movdqu x1, (1*4*4)(out); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 481) movdqu x2, (2*4*4)(out); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 482) movdqu x3, (3*4*4)(out);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 483)
/*
 * xor_blocks(out, x0, x1, x2, x3, t0, t1, t2): like write_blocks, but XOR the
 * result into the data already present at `out` instead of overwriting it.
 *
 * After the transpose, each 16-byte block is handled as: load the current
 * contents of out into t0, XOR it into x<k>, store x<k> back to the same
 * location. Loads and stores use movdqu (no alignment requirement).
 *
 * Clobbers t0, t1 and t2; x0..x3 end up holding the stored values.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 484) #define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 485) transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 486) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 487) movdqu (0*4*4)(out), t0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 488) pxor t0, x0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 489) movdqu x0, (0*4*4)(out); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 490) movdqu (1*4*4)(out), t0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 491) pxor t0, x1; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 492) movdqu x1, (1*4*4)(out); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 493) movdqu (2*4*4)(out), t0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 494) pxor t0, x2; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 495) movdqu x2, (2*4*4)(out); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 496) movdqu (3*4*4)(out), t0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 497) pxor t0, x3; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 498) movdqu x3, (3*4*4)(out);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 499)
/*
 * __serpent_enc_blk_4way
 *
 * Reads its arguments from the stack at the arg_* offsets defined at the top
 * of this file, loads the input from src into RA..RD with read_blocks, runs
 * the macro sequence K(0), then S0..S7 four times interleaved with
 * LK(1)..LK(31), then K(32), and finally either stores the result to dst
 * (write_blocks) or XORs it into dst (xor_blocks), selected by the low byte
 * of the xor argument.
 *
 * Registers written directly by this routine: %eax (src pointer, later the
 * dst pointer), CTX, RNOT and EFLAGS (cmpb). RA..RE and RT0/RT1 are passed
 * to the macros as working registers.
 *
 * NOTE(review): K, LK, S<n>, read_blocks and write_blocks are defined
 * outside this excerpt. Presumably K mixes in round key <n> from CTX and LK
 * applies the linear transformation followed by the key mix -- confirm
 * against the macro definitions.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 500) SYM_FUNC_START(__serpent_enc_blk_4way)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 501) /* input:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 502) * arg_ctx(%esp): ctx, CTX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 503) * arg_dst(%esp): dst
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 504) * arg_src(%esp): src
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 505) * arg_xor(%esp): bool, if true: xor output
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 506) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 507)
/* Comparing a register with itself sets every bit: RNOT = all ones. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 508) pcmpeqd RNOT, RNOT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 509)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 510) movl arg_ctx(%esp), CTX;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 511)
/* %eax = src; read_blocks fills RA..RD using RT0, RT1 and RE as temporaries. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 512) movl arg_src(%esp), %eax;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 513) read_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 514)
/*
 * Round sequence, key indices 0..32. Every S<n> is invoked with exactly the
 * register list of the LK (or initial K) that precedes it, and that list
 * changes from round to round -- presumably tracking which registers the
 * previous S-box macro left its outputs in (verify against the S<n>
 * definitions). Do not reorder or "normalise" the register arguments.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 515) K(RA, RB, RC, RD, RE, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 516) S0(RA, RB, RC, RD, RE); LK(RC, RB, RD, RA, RE, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 517) S1(RC, RB, RD, RA, RE); LK(RE, RD, RA, RC, RB, 2);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 518) S2(RE, RD, RA, RC, RB); LK(RB, RD, RE, RC, RA, 3);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 519) S3(RB, RD, RE, RC, RA); LK(RC, RA, RD, RB, RE, 4);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 520) S4(RC, RA, RD, RB, RE); LK(RA, RD, RB, RE, RC, 5);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 521) S5(RA, RD, RB, RE, RC); LK(RC, RA, RD, RE, RB, 6);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 522) S6(RC, RA, RD, RE, RB); LK(RD, RB, RA, RE, RC, 7);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 523) S7(RD, RB, RA, RE, RC); LK(RC, RA, RE, RD, RB, 8);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 524) S0(RC, RA, RE, RD, RB); LK(RE, RA, RD, RC, RB, 9);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 525) S1(RE, RA, RD, RC, RB); LK(RB, RD, RC, RE, RA, 10);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 526) S2(RB, RD, RC, RE, RA); LK(RA, RD, RB, RE, RC, 11);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 527) S3(RA, RD, RB, RE, RC); LK(RE, RC, RD, RA, RB, 12);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 528) S4(RE, RC, RD, RA, RB); LK(RC, RD, RA, RB, RE, 13);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 529) S5(RC, RD, RA, RB, RE); LK(RE, RC, RD, RB, RA, 14);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 530) S6(RE, RC, RD, RB, RA); LK(RD, RA, RC, RB, RE, 15);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 531) S7(RD, RA, RC, RB, RE); LK(RE, RC, RB, RD, RA, 16);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 532) S0(RE, RC, RB, RD, RA); LK(RB, RC, RD, RE, RA, 17);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 533) S1(RB, RC, RD, RE, RA); LK(RA, RD, RE, RB, RC, 18);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 534) S2(RA, RD, RE, RB, RC); LK(RC, RD, RA, RB, RE, 19);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 535) S3(RC, RD, RA, RB, RE); LK(RB, RE, RD, RC, RA, 20);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 536) S4(RB, RE, RD, RC, RA); LK(RE, RD, RC, RA, RB, 21);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 537) S5(RE, RD, RC, RA, RB); LK(RB, RE, RD, RA, RC, 22);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 538) S6(RB, RE, RD, RA, RC); LK(RD, RC, RE, RA, RB, 23);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 539) S7(RD, RC, RE, RA, RB); LK(RB, RE, RA, RD, RC, 24);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 540) S0(RB, RE, RA, RD, RC); LK(RA, RE, RD, RB, RC, 25);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 541) S1(RA, RE, RD, RB, RC); LK(RC, RD, RB, RA, RE, 26);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 542) S2(RC, RD, RB, RA, RE); LK(RE, RD, RC, RA, RB, 27);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 543) S3(RE, RD, RC, RA, RB); LK(RA, RB, RD, RE, RC, 28);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 544) S4(RA, RB, RD, RE, RC); LK(RB, RD, RE, RC, RA, 29);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 545) S5(RB, RD, RE, RC, RA); LK(RA, RB, RD, RC, RE, 30);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 546) S6(RA, RB, RD, RC, RE); LK(RD, RE, RB, RC, RA, 31);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 547) S7(RD, RE, RB, RC, RA); K(RA, RB, RC, RD, RE, 32);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 548)
/* %eax = dst for whichever output path is taken below. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 549) movl arg_dst(%esp), %eax;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 550)
/* Only the low byte of the xor argument is tested; non-zero selects the XOR path. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 551) cmpb $0, arg_xor(%esp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 552) jnz .L__enc_xor4;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 553)
/* xor argument == 0: store RA..RD to dst. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 554) write_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 555)
/*
 * NOTE(review): bare `ret`; newer kernel trees spell this RET (from
 * linux/linkage.h) for straight-line-speculation mitigation -- confirm which
 * form the target tree expects. The same applies to the second `ret` below.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 556) ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 557)
/* xor argument != 0: XOR RA..RD into the data already at dst. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 558) .L__enc_xor4:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 559) xor_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 560)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 561) ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 562) SYM_FUNC_END(__serpent_enc_blk_4way)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 563)
/*
 * serpent_dec_blk_4way
 *
 * Reads ctx, dst and src from the stack at the arg_* offsets defined at the
 * top of this file, loads the input from src into RA..RD with read_blocks,
 * runs the macro sequence K(32), then SI7..SI0 four times interleaved with
 * KL(31)..KL(1), then K(0), and stores the result to dst with write_blocks.
 * Unlike the encrypt routine there is no xor argument: the output is always
 * a plain store.
 *
 * Registers written directly by this routine: %eax (src pointer, later the
 * dst pointer), CTX and RNOT. RA..RE and RT0/RT1 are passed to the macros as
 * working registers.
 *
 * NOTE(review): K, KL, SI<n>, read_blocks and write_blocks are defined
 * outside this excerpt. Presumably SI<n> are the inverse S-boxes and KL is
 * the key mix followed by the inverse linear transformation -- confirm
 * against the macro definitions.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 564) SYM_FUNC_START(serpent_dec_blk_4way)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 565) /* input:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 566) * arg_ctx(%esp): ctx, CTX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 567) * arg_dst(%esp): dst
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 568) * arg_src(%esp): src
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 569) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 570)
/* Comparing a register with itself sets every bit: RNOT = all ones. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 571) pcmpeqd RNOT, RNOT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 572)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 573) movl arg_ctx(%esp), CTX;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 574)
/* %eax = src; read_blocks fills RA..RD using RT0, RT1 and RE as temporaries. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 575) movl arg_src(%esp), %eax;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 576) read_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 577)
/*
 * Round sequence, key indices 32 down to 0. Every SI<n> is invoked with
 * exactly the register list of the KL (or initial K) that precedes it, and
 * that list changes from round to round -- presumably tracking which
 * registers the previous inverse S-box macro left its outputs in (verify
 * against the SI<n> definitions). Do not reorder or "normalise" the
 * register arguments.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 578) K(RA, RB, RC, RD, RE, 32);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 579) SI7(RA, RB, RC, RD, RE); KL(RB, RD, RA, RE, RC, 31);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 580) SI6(RB, RD, RA, RE, RC); KL(RA, RC, RE, RB, RD, 30);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 581) SI5(RA, RC, RE, RB, RD); KL(RC, RD, RA, RE, RB, 29);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 582) SI4(RC, RD, RA, RE, RB); KL(RC, RA, RB, RE, RD, 28);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 583) SI3(RC, RA, RB, RE, RD); KL(RB, RC, RD, RE, RA, 27);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 584) SI2(RB, RC, RD, RE, RA); KL(RC, RA, RE, RD, RB, 26);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 585) SI1(RC, RA, RE, RD, RB); KL(RB, RA, RE, RD, RC, 25);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 586) SI0(RB, RA, RE, RD, RC); KL(RE, RC, RA, RB, RD, 24);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 587) SI7(RE, RC, RA, RB, RD); KL(RC, RB, RE, RD, RA, 23);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 588) SI6(RC, RB, RE, RD, RA); KL(RE, RA, RD, RC, RB, 22);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 589) SI5(RE, RA, RD, RC, RB); KL(RA, RB, RE, RD, RC, 21);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 590) SI4(RA, RB, RE, RD, RC); KL(RA, RE, RC, RD, RB, 20);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 591) SI3(RA, RE, RC, RD, RB); KL(RC, RA, RB, RD, RE, 19);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 592) SI2(RC, RA, RB, RD, RE); KL(RA, RE, RD, RB, RC, 18);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 593) SI1(RA, RE, RD, RB, RC); KL(RC, RE, RD, RB, RA, 17);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 594) SI0(RC, RE, RD, RB, RA); KL(RD, RA, RE, RC, RB, 16);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 595) SI7(RD, RA, RE, RC, RB); KL(RA, RC, RD, RB, RE, 15);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 596) SI6(RA, RC, RD, RB, RE); KL(RD, RE, RB, RA, RC, 14);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 597) SI5(RD, RE, RB, RA, RC); KL(RE, RC, RD, RB, RA, 13);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 598) SI4(RE, RC, RD, RB, RA); KL(RE, RD, RA, RB, RC, 12);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 599) SI3(RE, RD, RA, RB, RC); KL(RA, RE, RC, RB, RD, 11);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 600) SI2(RA, RE, RC, RB, RD); KL(RE, RD, RB, RC, RA, 10);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 601) SI1(RE, RD, RB, RC, RA); KL(RA, RD, RB, RC, RE, 9);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 602) SI0(RA, RD, RB, RC, RE); KL(RB, RE, RD, RA, RC, 8);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 603) SI7(RB, RE, RD, RA, RC); KL(RE, RA, RB, RC, RD, 7);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 604) SI6(RE, RA, RB, RC, RD); KL(RB, RD, RC, RE, RA, 6);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 605) SI5(RB, RD, RC, RE, RA); KL(RD, RA, RB, RC, RE, 5);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 606) SI4(RD, RA, RB, RC, RE); KL(RD, RB, RE, RC, RA, 4);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 607) SI3(RD, RB, RE, RC, RA); KL(RE, RD, RA, RC, RB, 3);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 608) SI2(RE, RD, RA, RC, RB); KL(RD, RB, RC, RA, RE, 2);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 609) SI1(RD, RB, RC, RA, RE); KL(RE, RB, RC, RA, RD, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 610) SI0(RE, RB, RC, RA, RD); K(RC, RD, RB, RE, RA, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 611)
/*
 * The final K leaves the four output words in RC, RD, RB, RE (in that
 * order), so write_blocks is given that order and RA serves as the third
 * temporary.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 612) movl arg_dst(%esp), %eax;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 613) write_blocks(%eax, RC, RD, RB, RE, RT0, RT1, RA);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 614)
/*
 * NOTE(review): bare `ret`; newer kernel trees spell this RET (from
 * linux/linkage.h) for straight-line-speculation mitigation -- confirm which
 * form the target tree expects.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 615) ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 616) SYM_FUNC_END(serpent_dec_blk_4way)