/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on glibc cortex strings work originally authored by
 * Linaro. The original code can be found @
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>

/*
 * Fill in the buffer with character c (alignment handled by the hardware)
 *
 * Parameters:
 *	x0 - buf
 *	x1 - c
 *	x2 - n
 * Returns:
 *	x0 - buf
 */
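
/*
 * Roughly equivalent C-level logic, for orientation only (an illustrative
 * sketch, not part of the build):
 *
 *	u64 v = 0x0101010101010101UL * (u8)c;
 *	if (n <= 15) {
 *		store an 8/4/2/1-byte tail according to the bits of n;
 *	} else {
 *		store 16 bytes, then round dst up to a 16-byte boundary;
 *		if (v == 0 && n >= 128 && DC ZVA is usable)
 *			zero whole blocks with DC ZVA, then finish the tail;
 *		else
 *			store 64 bytes per loop iteration, then the tail;
 *	}
 *	return buf;
 */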

dstin		.req	x0
val		.req	w1
count		.req	x2
tmp1		.req	x3
tmp1w		.req	w3
tmp2		.req	x4
tmp2w		.req	w4
zva_len_x	.req	x5
zva_len		.req	w5
zva_bits_x	.req	x6

A_l		.req	x7
A_lw		.req	w7
dst		.req	x8
tmp3w		.req	w9
tmp3		.req	x9

SYM_FUNC_START_ALIAS(__memset)
SYM_FUNC_START_WEAK_PI(memset)
	mov	dst, dstin		/* Preserve return value. */
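	/* Broadcast the low byte of val into all 8 bytes of A_l. */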
	and	A_lw, val, #255
	orr	A_lw, A_lw, A_lw, lsl #8
	orr	A_lw, A_lw, A_lw, lsl #16
	orr	A_l, A_l, A_l, lsl #32

	cmp	count, #15
	b.hi	.Lover16_proc
	/* count is at most 15; all stores may be unaligned. */
	tbz	count, #3, 1f
	str	A_l, [dst], #8
1:
	tbz	count, #2, 2f
	str	A_lw, [dst], #4
2:
	tbz	count, #1, 3f
	strh	A_lw, [dst], #2
3:
	tbz	count, #0, 4f
	strb	A_lw, [dst]
4:
	ret

.Lover16_proc:
	/* Is the start address 16-byte aligned? */
	neg	tmp2, dst
	ands	tmp2, tmp2, #15
	b.eq	.Laligned
	/*
	 * The count is at least 16, so we can use stp to store the first 16
	 * bytes, then advance dst to the next 16-byte boundary.  The aligned
	 * stores that follow may overlap these bytes.
	 */
	stp	A_l, A_l, [dst]		/* Possibly unaligned store. */
	/* Make dst 16-byte aligned. */
	sub	count, count, tmp2
	add	dst, dst, tmp2

.Laligned:
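	/* A zero fill may be able to use the DC ZVA fast path below. */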
	cbz	A_l, .Lzero_mem

.Ltail_maybe_long:
	cmp	count, #64
	b.ge	.Lnot_short
.Ltail63:
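	/* Store 48, 32 or 16 bytes according to bits 5:4 of count. */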
	ands	tmp1, count, #0x30
	b.eq	3f
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	stp	A_l, A_l, [dst], #16
1:
	stp	A_l, A_l, [dst], #16
2:
	stp	A_l, A_l, [dst], #16
	/*
	 * Fewer than 16 bytes remain, so use stp to write the last 16 bytes
	 * back from the end of the buffer.  Some bytes are written twice and
	 * the access may be unaligned.
	 */
3:
	ands	count, count, #15
	cbz	count, 4f
	add	dst, dst, count
	stp	A_l, A_l, [dst, #-16]	/* Repeat some/all of last store. */
4:
	ret

	/*
	 * Critical loop.  Start at a new cache line boundary.  Assuming
	 * 64 bytes per line, this ensures the entire loop is in one line.
	 */
	.p2align	L1_CACHE_SHIFT
.Lnot_short:
	sub	dst, dst, #16		/* Pre-bias. */
	sub	count, count, #64
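	/* Unrolled loop: 64 bytes per iteration. */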
1:
	stp	A_l, A_l, [dst, #16]
	stp	A_l, A_l, [dst, #32]
	stp	A_l, A_l, [dst, #48]
	stp	A_l, A_l, [dst, #64]!
	subs	count, count, #64
	b.ge	1b
	tst	count, #0x3f
	add	dst, dst, #16
	b.ne	.Ltail63
.Lexitfunc:
	ret

	/*
	 * For zeroing memory, check to see if we can use the ZVA feature to
	 * zero entire 'cache' lines.
	 */
.Lzero_mem:
	cmp	count, #63
	b.le	.Ltail63
	/*
	 * For zeroing small amounts of memory, it's not worth setting up
	 * the line-clear code.
	 */
	cmp	count, #128
	b.lt	.Lnot_short		/* Fall through only when count >= 128. */

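	/*
	 * DCZID_EL0: bit 4 (DZP) set means DC ZVA is prohibited; bits 3:0
	 * give log2 of the block size in words, so the block size in bytes
	 * is 4 << DCZID_EL0[3:0].
	 */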
	mrs	tmp1, dczid_el0
	tbnz	tmp1, #4, .Lnot_short
	mov	tmp3w, #4
	and	zva_len, tmp1w, #15	/* Safety: other bits reserved. */
	lsl	zva_len, tmp3w, zva_len

	ands	tmp3w, zva_len, #63
	/*
	 * Ensure zva_len is a multiple of 64 (and therefore at least 64).
	 * Using DC ZVA is not worthwhile for block sizes below 64 bytes.
	 */
	b.ne	.Lnot_short
.Lzero_by_line:
	/*
	 * Compute how far we need to go to become suitably aligned.  We're
	 * already at quad-word alignment.
	 */
	cmp	count, zva_len_x
	b.lt	.Lnot_short		/* Not enough to reach alignment. */
	sub	zva_bits_x, zva_len_x, #1
	neg	tmp2, dst
	ands	tmp2, tmp2, zva_bits_x
	b.eq	2f			/* Already aligned. */
	/* Not aligned; check that there's enough to zero after alignment. */
	sub	tmp1, count, tmp2
	/*
	 * Guarantee that the length remaining after alignment is at least 64
	 * bytes and at least one ZVA block, so that neither the alignment
	 * loop below nor the DC ZVA loop runs past the end of the buffer.
	 */
	cmp	tmp1, #64
	ccmp	tmp1, zva_len_x, #8, ge	/* NZCV=0b1000 */
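	/* b.lt is taken if tmp1 < 64, or otherwise if tmp1 < zva_len_x. */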
	b.lt	.Lnot_short
	/*
	 * We know that there's at least 64 bytes to zero and that it's safe
	 * to overrun by 64 bytes.
	 */
	mov	count, tmp1
1:
	stp	A_l, A_l, [dst]
	stp	A_l, A_l, [dst, #16]
	stp	A_l, A_l, [dst, #32]
	subs	tmp2, tmp2, #64
	stp	A_l, A_l, [dst, #48]
	add	dst, dst, #64
	b.ge	1b
	/* We've overrun a bit, so adjust dst downwards. */
	add	dst, dst, tmp2
2:
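	/* Pre-bias count by one ZVA block for the loop termination test. */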
	sub	count, count, zva_len_x
3:
	dc	zva, dst
	add	dst, dst, zva_len_x
	subs	count, count, zva_len_x
	b.ge	3b
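	/* Handle the remaining sub-block tail, if any, with normal stores. */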
	ands	count, count, zva_bits_x
	b.ne	.Ltail_maybe_long
	ret
SYM_FUNC_END_PI(memset)
EXPORT_SYMBOL(memset)
SYM_FUNC_END_ALIAS(__memset)
EXPORT_SYMBOL(__memset)