^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1) // SPDX-License-Identifier: GPL-2.0-only
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3) * Copyright (C) 2012 Intel Corporation
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4) * Copyright (C) 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7) #include <arm_neon.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9) #ifdef CONFIG_ARM
/*
 * Fallback for the 128-bit table lookup intrinsic on AArch32.
 *
 * AArch32 does not provide vqtbl1q_u8() natively because it does not
 * implement the underlying instruction; only the 64-bit wide vtbl.8
 * lookup is available there. The 16-byte result is therefore assembled
 * from two vtbl2_u8() lookups into the same table, one for the low half
 * of the index vector and one for the high half.
 *
 * a: 16-byte lookup table
 * b: 16 byte-sized indices into that table
 */
static uint8x16_t vqtbl1q_u8(uint8x16_t a, uint8x16_t b)
{
	/* Reinterpret the 128-bit table as the register pair vtbl2_u8() takes. */
	union {
		uint8x16_t val;
		uint8x8x2_t pair;
	} table = { .val = a };
	uint8x8_t lo = vtbl2_u8(table.pair, vget_low_u8(b));
	uint8x8_t hi = vtbl2_u8(table.pair, vget_high_u8(b));

	return vcombine_u8(lo, hi);
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 25) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 26)
/*
 * NEON inner loop of the two-data-block recovery.
 *
 * bytes: number of bytes to process in each of p, q, dp and dq
 * p:     input buffer, only read here; combined with dp
 * q:     input buffer, only read here; combined with dq
 * dp:    read as input, then overwritten with (db ^ px)
 * dq:    read as input, then overwritten with db
 * pbmul: 32-byte lookup table applied to px = p ^ dp
 * qmul:  32-byte lookup table applied to q ^ dq
 *
 * Conceptually, for every byte:
 *
 *	px = *p ^ *dp;
 *	qx = qmul[*q ^ *dq];
 *	*dq++ = db = pbmul[px] ^ qx;
 *	*dp++ = db ^ px;
 *	p++; q++;
 *
 * In this routine each table lookup is split in two: bytes 0..15 of the
 * table are indexed by the low nibble of the value and bytes 16..31 by
 * the high nibble, and the two results are XORed together.
 * NOTE(review): presumably these are the nibble-split GF(2^8)
 * multiplication tables prepared by the caller; confirm at the call site.
 *
 * Whole 16-byte groups go through the vector loop. A remainder shorter
 * than 16 bytes is finished by a scalar loop doing the same lookups, so
 * a count that is not a multiple of 16 no longer runs past the buffers,
 * and a zero or negative count does nothing.
 */
void __raid6_2data_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dp,
			      uint8_t *dq, const uint8_t *pbmul,
			      const uint8_t *qmul)
{
	uint8x16_t pm0 = vld1q_u8(pbmul);	/* low-nibble half of pbmul */
	uint8x16_t pm1 = vld1q_u8(pbmul + 16);	/* high-nibble half of pbmul */
	uint8x16_t qm0 = vld1q_u8(qmul);	/* low-nibble half of qmul */
	uint8x16_t qm1 = vld1q_u8(qmul + 16);	/* high-nibble half of qmul */
	uint8x16_t x0f = vdupq_n_u8(0x0f);

	while (bytes >= 16) {
		uint8x16_t vx, vy, px, qx, db;

		px = veorq_u8(vld1q_u8(p), vld1q_u8(dp));
		vx = veorq_u8(vld1q_u8(q), vld1q_u8(dq));

		/* qx = qmul lookup of (q ^ dq), one nibble at a time */
		vy = vshrq_n_u8(vx, 4);
		vx = vqtbl1q_u8(qm0, vandq_u8(vx, x0f));
		vy = vqtbl1q_u8(qm1, vy);
		qx = veorq_u8(vx, vy);

		/* db = pbmul lookup of px, combined with qx */
		vy = vshrq_n_u8(px, 4);
		vx = vqtbl1q_u8(pm0, vandq_u8(px, x0f));
		vy = vqtbl1q_u8(pm1, vy);
		vx = veorq_u8(vx, vy);
		db = veorq_u8(vx, qx);

		vst1q_u8(dq, db);
		vst1q_u8(dp, veorq_u8(db, px));

		bytes -= 16;
		p += 16;
		q += 16;
		dp += 16;
		dq += 16;
	}

	/* Scalar tail: same nibble lookups for the last (bytes % 16) bytes. */
	while (bytes > 0) {
		uint8_t px = *p ^ *dp;
		uint8_t qv = *q ^ *dq;
		uint8_t db = pbmul[px & 0x0f] ^ pbmul[16 + (px >> 4)] ^
			     qmul[qv & 0x0f] ^ qmul[16 + (qv >> 4)];

		*dq++ = db;
		*dp++ = db ^ px;
		p++;
		q++;
		bytes--;
	}
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 76)
/*
 * NEON inner loop of the data + P recovery.
 *
 * bytes: number of bytes to process in each of p, q and dq
 * p:     updated in place: every byte is XORed with the new dq byte
 * q:     input buffer, only read here; combined with dq
 * dq:    read as input, then overwritten with the looked-up value
 * qmul:  32-byte lookup table applied to q ^ dq
 *
 * Conceptually, for every byte:
 *
 *	*p++ ^= *dq = qmul[*q ^ *dq];
 *	q++; dq++;
 *
 * In this routine the table lookup is split in two: bytes 0..15 of qmul
 * are indexed by the low nibble of the value and bytes 16..31 by the
 * high nibble, and the two results are XORed together.
 * NOTE(review): presumably this is the nibble-split GF(2^8)
 * multiplication table prepared by the caller; confirm at the call site.
 *
 * Whole 16-byte groups go through the vector loop. A remainder shorter
 * than 16 bytes is finished by a scalar loop doing the same lookups, so
 * a count that is not a multiple of 16 no longer runs past the buffers,
 * and a zero or negative count does nothing.
 */
void __raid6_datap_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dq,
			      const uint8_t *qmul)
{
	uint8x16_t qm0 = vld1q_u8(qmul);	/* low-nibble half of qmul */
	uint8x16_t qm1 = vld1q_u8(qmul + 16);	/* high-nibble half of qmul */
	uint8x16_t x0f = vdupq_n_u8(0x0f);

	while (bytes >= 16) {
		uint8x16_t vx, vy;

		vx = veorq_u8(vld1q_u8(q), vld1q_u8(dq));

		/* qmul lookup of (q ^ dq), one nibble at a time */
		vy = vshrq_n_u8(vx, 4);
		vx = vqtbl1q_u8(qm0, vandq_u8(vx, x0f));
		vy = vqtbl1q_u8(qm1, vy);
		vx = veorq_u8(vx, vy);
		vy = veorq_u8(vx, vld1q_u8(p));

		vst1q_u8(dq, vx);
		vst1q_u8(p, vy);

		bytes -= 16;
		p += 16;
		q += 16;
		dq += 16;
	}

	/* Scalar tail: same nibble lookups for the last (bytes % 16) bytes. */
	while (bytes > 0) {
		uint8_t qv = *q ^ *dq;
		uint8_t db = qmul[qv & 0x0f] ^ qmul[16 + (qv >> 4)];

		*dq++ = db;
		*p++ ^= db;
		q++;
		bytes--;
	}
}