^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1) /* -----------------------------------------------------------------------
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3) * neon.uc - RAID-6 syndrome calculation using ARM NEON instructions
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5) * Copyright (C) 2012 Rob Herring
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6) * Copyright (C) 2015 Linaro Ltd. <ard.biesheuvel@linaro.org>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8) * Based on altivec.uc:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9) * Copyright 2002-2004 H. Peter Anvin - All Rights Reserved
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 10) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 11) * This program is free software; you can redistribute it and/or modify
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 12) * it under the terms of the GNU General Public License as published by
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 13) * the Free Software Foundation, Inc., 53 Temple Place Ste 330,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 14) * Boston MA 02111-1307, USA; either version 2 of the License, or
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 15) * (at your option) any later version; incorporated herein by reference.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 16) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 17) * ----------------------------------------------------------------------- */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 18)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 19) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 20) * neon$#.c
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 21) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 22) * $#-way unrolled NEON intrinsics math RAID-6 instruction set
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 23) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 24) * This file is postprocessed using unroll.awk
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 25) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 26)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 27) #include <arm_neon.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 28)
/*
 * unative_t is the vector type all helpers and both syndrome routines
 * in this file operate on (a NEON vector of unsigned bytes).
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 29) typedef uint8x16_t unative_t;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 30)
/*
 * NSIZE is the size in bytes of one unative_t.  The routines below use
 * it as the byte offset between consecutive vectors in a buffer.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 31) #define NSIZE sizeof(unative_t)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 32)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 33) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 34) * The SHLBYTE() operation shifts each byte left by 1, *not*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 35) * rolling over into the next byte
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 36) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 37) static inline unative_t SHLBYTE(unative_t v)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 38) {
/*
 * Lane-wise shift on unsigned bytes: the bit shifted out of the top of
 * each byte is dropped.  Callers recover it separately via MASK().
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 39) return vshlq_n_u8(v, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 40) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 41)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 42) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 43) * The MASK() operation returns 0xFF in any byte for which the high
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 44) * bit is 1, 0x00 for any byte for which the high bit is 0.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 45) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 46) static inline unative_t MASK(unative_t v)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 47) {
/*
 * The lanes are reinterpreted as signed bytes so that the right shift
 * by 7 replicates the top bit into every bit of the byte; the result
 * is then cast back to the unsigned vector type.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 48) return (unative_t)vshrq_n_s8((int8x16_t)v, 7);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 49) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 50)
/*
 * The PMUL() operation multiplies v and u byte by byte using the NEON
 * polynomial (p8) multiply.  Both operands are cast to poly8x16_t for
 * the intrinsic and the product is cast back to unative_t.
 *
 * NOTE(review): the callers in this file rely on the product being a
 * carry-less multiply truncated to 8 bits per lane -- confirm against
 * the NEON intrinsics reference for vmulq_p8.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 51) static inline unative_t PMUL(unative_t v, unative_t u)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 52) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 53) return (unative_t)vmulq_p8((poly8x16_t)v, (poly8x16_t)u);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 54) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 55)
/*
 * Compute the P (XOR parity) and Q (syndrome) buffers from scratch.
 *
 * disks: number of entries in ptrs.  Entries 0 .. disks-3 are read as
 *        data, entry disks-2 is written with P and entry disks-1 is
 *        written with Q.
 * bytes: number of bytes to process in every buffer.
 * ptrs:  array of buffer pointers, laid out as described above.
 *
 * Nothing is returned; the results are stored into the P and Q
 * buffers, whose previous contents are not read.
 *
 * NOTE(review): the lane suffix and the unroll count in this function
 * are template placeholders filled in by unroll.awk (see the file
 * header); presumably every line that carries the lane suffix is
 * emitted once per lane -- confirm against unroll.awk before splitting
 * or merging such lines.
 *
 * NOTE(review): assumes disks >= 3; a smaller value would index ptrs
 * with a negative subscript.  Verify against callers.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 56) void raid6_neon$#_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 57) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 58) uint8_t **dptr = (uint8_t **)ptrs;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 59) uint8_t *p, *q;
/*
 * NOTE(review): d is a signed int but is compared with the unsigned
 * long bytes in the outer loop; this is only safe while bytes fits in
 * an int.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 60) int d, z, z0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 61)
/* Working vectors: data, Q accumulator, P accumulator, two temporaries. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 62) register unative_t wd$$, wq$$, wp$$, w1$$, w2$$;
/* 0x1d in every byte: XORed into bytes whose top bit was shifted out. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 63) const unative_t x1d = vdupq_n_u8(0x1d);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 64)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 65) z0 = disks - 3; /* Highest data disk */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 66) p = dptr[z0+1]; /* XOR parity */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 67) q = dptr[z0+2]; /* RS syndrome */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 68)
/*
 * Each pass handles one vector per unrolled lane.  The loop always
 * works on whole vectors, so bytes is presumably a multiple of the
 * per-pass stride -- otherwise the last pass runs past bytes.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 69) for ( d = 0 ; d < bytes ; d += NSIZE*$# ) {
/* Seed both accumulators with the highest data disk. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 70) wq$$ = wp$$ = vld1q_u8(&dptr[z0][d+$$*NSIZE]);
/* Walk the remaining data disks from high to low. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 71) for ( z = z0-1 ; z >= 0 ; z-- ) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 72) wd$$ = vld1q_u8(&dptr[z][d+$$*NSIZE]);
/* P is the plain XOR of all data. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 73) wp$$ = veorq_u8(wp$$, wd$$);
/*
 * Q step: shift every Q byte left by one, XOR 0x1d into the bytes that
 * had their top bit set (a multiply by 2 modulo the polynomial 0x11d),
 * then XOR in the data of this disk.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 74) w2$$ = MASK(wq$$);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 75) w1$$ = SHLBYTE(wq$$);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 76)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 77) w2$$ = vandq_u8(w2$$, x1d);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 78) w1$$ = veorq_u8(w1$$, w2$$);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 79) wq$$ = veorq_u8(w1$$, wd$$);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 80) }
/* Store the finished P and Q vectors for this lane. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 81) vst1q_u8(&p[d+NSIZE*$$], wp$$);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 82) vst1q_u8(&q[d+NSIZE*$$], wq$$);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 83) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 84) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 85)
/*
 * Fold the contribution of data disks start .. stop into the existing
 * P and Q buffers.
 *
 * disks: number of entries in ptrs; P is entry disks-2 and Q is entry
 *        disks-1.
 * start: lowest data disk index that is read.
 * stop:  highest data disk index that is read.
 * bytes: number of bytes to process in every buffer.
 * ptrs:  array of buffer pointers.
 *
 * Nothing is returned.  P and Q are read, XORed with the newly computed
 * partial values and written back in place.  Data disks above stop are
 * not touched at all; data disks below start are not read, but the Q
 * accumulator is still multiplied by 2 once for each of them.
 *
 * NOTE(review): the lane suffix and the unroll count in this function
 * are template placeholders filled in by unroll.awk (see the file
 * header); presumably every line that carries the lane suffix is
 * emitted once per lane -- confirm against unroll.awk before splitting
 * or merging such lines.
 *
 * NOTE(review): assumes 0 <= start <= stop <= disks-3; none of this is
 * checked here.  Verify against callers.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 86) void raid6_neon$#_xor_syndrome_real(int disks, int start, int stop,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 87) unsigned long bytes, void **ptrs)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 88) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 89) uint8_t **dptr = (uint8_t **)ptrs;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 90) uint8_t *p, *q;
/*
 * NOTE(review): d is a signed int but is compared with the unsigned
 * long bytes in the outer loop; this is only safe while bytes fits in
 * an int.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 91) int d, z, z0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 92)
/* Working vectors: data, Q accumulator, P accumulator, two temporaries. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 93) register unative_t wd$$, wq$$, wp$$, w1$$, w2$$;
/* 0x1d in every byte: the reduction constant used by every multiply below. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 94) const unative_t x1d = vdupq_n_u8(0x1d);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 95)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 96) z0 = stop; /* P/Q right side optimization */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 97) p = dptr[disks-2]; /* XOR parity */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 98) q = dptr[disks-1]; /* RS syndrome */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 99)
/*
 * Each pass handles one vector per unrolled lane.  The loop always
 * works on whole vectors, so bytes is presumably a multiple of the
 * per-pass stride -- otherwise the last pass runs past bytes.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 100) for ( d = 0 ; d < bytes ; d += NSIZE*$# ) {
/*
 * Start Q from the highest disk in range and start P from the stored
 * parity XORed with that same disk.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 101) wq$$ = vld1q_u8(&dptr[z0][d+$$*NSIZE]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 102) wp$$ = veorq_u8(vld1q_u8(&p[d+$$*NSIZE]), wq$$);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 103)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 104) /* P/Q data pages */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 105) for ( z = z0-1 ; z >= start ; z-- ) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 106) wd$$ = vld1q_u8(&dptr[z][d+$$*NSIZE]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 107) wp$$ = veorq_u8(wp$$, wd$$);
/*
 * Q step: shift every Q byte left by one, XOR 0x1d into the bytes that
 * had their top bit set (a multiply by 2 modulo the polynomial 0x11d),
 * then XOR in the data of this disk.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 108) w2$$ = MASK(wq$$);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 109) w1$$ = SHLBYTE(wq$$);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 110)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 111) w2$$ = vandq_u8(w2$$, x1d);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 112) w1$$ = veorq_u8(w1$$, w2$$);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 113) wq$$ = veorq_u8(w1$$, wd$$);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 114) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 115) /* P/Q left side optimization */
/*
 * No data is loaded for disks below start, so only the repeated
 * multiply by 2 remains.  Four of them are done per iteration: w1 keeps
 * the low four bits moved up, w2 holds the four bits that were shifted
 * out, and PMUL() with 0x1d folds those back in.  The product fits in a
 * byte because w2 has at most 4 significant bits and 0x1d has 5.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 116) for ( z = start-1 ; z >= 3 ; z -= 4 ) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 117) w2$$ = vshrq_n_u8(wq$$, 4);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 118) w1$$ = vshlq_n_u8(wq$$, 4);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 119)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 120) w2$$ = PMUL(w2$$, x1d);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 121) wq$$ = veorq_u8(w1$$, w2$$);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 122) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 123)
/*
 * z + 1 multiplies by 2 are still owed after the loop above.  With a
 * non-negative start, z is in -1 .. 2 here; for z == -1 nothing is left
 * to do, which is why there is no matching case.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 124) switch (z) {
/* Three multiplies at once: same scheme as above with a 3-bit shift. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 125) case 2:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 126) w2$$ = vshrq_n_u8(wq$$, 5);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 127) w1$$ = vshlq_n_u8(wq$$, 3);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 128)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 129) w2$$ = PMUL(w2$$, x1d);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 130) wq$$ = veorq_u8(w1$$, w2$$);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 131) break;
/* Two multiplies at once: 2-bit shift. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 132) case 1:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 133) w2$$ = vshrq_n_u8(wq$$, 6);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 134) w1$$ = vshlq_n_u8(wq$$, 2);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 135)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 136) w2$$ = PMUL(w2$$, x1d);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 137) wq$$ = veorq_u8(w1$$, w2$$);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 138) break;
/* A single multiply: the same mask-and-shift step as in the data loop. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 139) case 0:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 140) w2$$ = MASK(wq$$);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 141) w1$$ = SHLBYTE(wq$$);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 142)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 143) w2$$ = vandq_u8(w2$$, x1d);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 144) wq$$ = veorq_u8(w1$$, w2$$);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 145) }
/* XOR the partial Q into the Q already stored in the buffer. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 146) w1$$ = vld1q_u8(&q[d+NSIZE*$$]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 147) wq$$ = veorq_u8(wq$$, w1$$);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 148)
/* Write the updated P and Q back in place. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 149) vst1q_u8(&p[d+NSIZE*$$], wp$$);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 150) vst1q_u8(&q[d+NSIZE*$$], wq$$);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 151) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 152) }