// SPDX-License-Identifier: GPL-2.0-only
/*
 * arch/arm64/lib/xor-neon.c
 *
 * Authors: Jackie Liu <liuyun01@kylinos.cn>
 * Copyright (C) 2018, Tianjin KYLIN Information Technology Co., Ltd.
 */

#include <linux/raid/xor.h>
#include <linux/module.h>
#include <asm/neon-intrinsics.h>

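/*
 * NEON-accelerated XOR routines for the generic xor_blocks() framework
 * (linux/raid/xor.h). Each xor_arm64_neon_<n> routine XORs <n>-1 source
 * buffers into p1, processing 64 bytes (four 128-bit quadwords held in
 * v0..v3) per loop iteration. The do/while loop body runs at least once,
 * so 'bytes' must be a non-zero multiple of 64; in practice the generic
 * XOR code calls these with page-sized buffers.
 */
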
void xor_arm64_neon_2(unsigned long bytes, unsigned long *p1,
	unsigned long *p2)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
	} while (--lines > 0);
}

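/*
 * The xor_arm64_neon_3/4/5 variants below follow the same pattern,
 * folding one further source buffer into v0..v3 before the store.
 */
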
void xor_arm64_neon_3(unsigned long bytes, unsigned long *p1,
	unsigned long *p2, unsigned long *p3)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

		/* p1 ^= p3 */
		v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
	} while (--lines > 0);
}

void xor_arm64_neon_4(unsigned long bytes, unsigned long *p1,
	unsigned long *p2, unsigned long *p3, unsigned long *p4)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

		/* p1 ^= p3 */
		v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));

		/* p1 ^= p4 */
		v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
	} while (--lines > 0);
}

void xor_arm64_neon_5(unsigned long bytes, unsigned long *p1,
	unsigned long *p2, unsigned long *p3,
	unsigned long *p4, unsigned long *p5)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;
	uint64_t *dp5 = (uint64_t *)p5;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

		/* p1 ^= p3 */
		v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));

		/* p1 ^= p4 */
		v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));

		/* p1 ^= p5 */
		v0 = veorq_u64(v0, vld1q_u64(dp5 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp5 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp5 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp5 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
		dp5 += 8;
	} while (--lines > 0);
}

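/*
 * Template describing these routines to the generic XOR code, which
 * benchmarks all registered templates at boot (calibrate_xor_blocks()
 * in crypto/xor.c) and picks the fastest one.
 */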
struct xor_block_template const xor_block_inner_neon = {
	.name	= "__inner_neon__",
	.do_2	= xor_arm64_neon_2,
	.do_3	= xor_arm64_neon_3,
	.do_4	= xor_arm64_neon_4,
	.do_5	= xor_arm64_neon_5,
};
EXPORT_SYMBOL(xor_block_inner_neon);

MODULE_AUTHOR("Jackie Liu <liuyun01@kylinos.cn>");
MODULE_DESCRIPTION("ARMv8 XOR Extensions");
MODULE_LICENSE("GPL");
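
/*
 * Note: these routines clobber NEON state, so they may only run while
 * the FP/SIMD unit is usable. A minimal sketch of a caller, modelled on
 * the wrappers in arch/arm64/include/asm/xor.h (exact signatures there
 * vary between kernel versions):
 *
 *	static void xor_neon_2(unsigned long bytes, unsigned long *p1,
 *			       unsigned long *p2)
 *	{
 *		kernel_neon_begin();
 *		xor_block_inner_neon.do_2(bytes, p1, p2);
 *		kernel_neon_end();
 *	}
 */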