/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on glibc cortex strings work originally authored by Linaro
 * and relicensed under GPLv2 for the Linux kernel. The original code can
 * be found @
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>

/*
 * Move a buffer from src to dest (alignment handled by the hardware).
 * If dest is below src, or the buffers do not overlap, tail-call
 * __memcpy; otherwise copy in reverse order.
 *
 * Parameters:
 *	x0 - dest
 *	x1 - src
 *	x2 - n
 * Returns:
 *	x0 - dest
 */
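/*
 * Rough C sketch of the logic implemented below (illustrative only;
 * __memcpy stands for the kernel's forward-copy routine that this
 * file tail-calls, and the backward loop is unrolled heavily in the
 * actual assembly):
 *
 *	void *memmove(void *dest, const void *src, size_t n)
 *	{
 *		const char *s = src;
 *		char *d = dest;
 *
 *		if (d < s || d >= s + n)
 *			return __memcpy(dest, src, n);	// forward copy is safe
 *		while (n--)				// overlap: copy backwards
 *			d[n] = s[n];
 *		return dest;
 *	}
 */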
dstin	.req	x0
src	.req	x1
count	.req	x2
tmp1	.req	x3
tmp1w	.req	w3
tmp2	.req	x4
tmp2w	.req	w4
tmp3	.req	x5
tmp3w	.req	w5
dst	.req	x6

A_l	.req	x7
A_h	.req	x8
B_l	.req	x9
B_h	.req	x10
C_l	.req	x11
C_h	.req	x12
D_l	.req	x13
D_h	.req	x14

SYM_FUNC_START_ALIAS(__memmove)
SYM_FUNC_START_WEAK_PI(memmove)
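	/*
	 * Dispatch: if dest is below src, a forward copy cannot clobber
	 * source bytes before they are read, so tail-call __memcpy.
	 * Likewise, if dest starts at or beyond src + count the buffers
	 * do not overlap at all.
	 */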
	cmp	dstin, src
	b.lo	__memcpy
	add	tmp1, src, count
	cmp	dstin, tmp1
	b.hs	__memcpy		/* No overlap.  */

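	/*
	 * Overlapping case: point dst and src one byte past the end of
	 * each buffer and copy backwards with pre-decrement addressing.
	 */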
	add	dst, dstin, count
	add	src, src, count
	cmp	count, #16
	b.lo	.Ltail15	/* probably unaligned accesses */

	ands	tmp2, src, #15	/* Bytes to reach alignment.  */
	b.eq	.LSrcAligned
	sub	count, count, tmp2
	/*
	 * Copy the misaligned tail bytes first so that src becomes
	 * 16-byte aligned. The cost of these few extra instructions is
	 * acceptable, and it ensures all subsequent accesses use
	 * aligned addresses.
	 */
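	/*
	 * Each tbz below tests one bit of tmp2 (= src & 15) and moves
	 * 1, 2, 4 or 8 bytes backwards accordingly.
	 */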
	tbz	tmp2, #0, 1f
	ldrb	tmp1w, [src, #-1]!
	strb	tmp1w, [dst, #-1]!
1:
	tbz	tmp2, #1, 2f
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
2:
	tbz	tmp2, #2, 3f
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
3:
	tbz	tmp2, #3, .LSrcAligned
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!

.LSrcAligned:
	cmp	count, #64
	b.ge	.Lcpy_over64

	/*
	 * Deal with small copies quickly by dropping straight into the
	 * exit block.
	 */
.Ltail63:
	/*
	 * Copy up to 48 bytes of data. At this point we only need the
	 * bottom 6 bits of count to be accurate.
	 */
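	/*
	 * count & 0x30 selects how many 16-byte chunks (0-3) remain;
	 * the compare/branch pair falls into the right number of
	 * ldp/stp pairs below to copy 48, 32 or 16 bytes.
	 */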
	ands	tmp1, count, #0x30
	b.eq	.Ltail15
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!
1:
	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!
2:
	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!

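	/*
	 * Copy the final 0-15 bytes: bits 3..0 of count select an 8-,
	 * 4-, 2- and/or 1-byte move, working backwards.
	 */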
.Ltail15:
	tbz	count, #3, 1f
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!
1:
	tbz	count, #2, 2f
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
2:
	tbz	count, #1, 3f
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
3:
	tbz	count, #0, .Lexitfunc
	ldrb	tmp1w, [src, #-1]
	strb	tmp1w, [dst, #-1]

.Lexitfunc:
	ret

.Lcpy_over64:
	subs	count, count, #128
	b.ge	.Lcpy_body_large
	/*
	 * Less than 128 bytes to copy, so handle 64 bytes here and then
	 * jump to the tail.
	 */
	ldp	A_l, A_h, [src, #-16]
	stp	A_l, A_h, [dst, #-16]
	ldp	B_l, B_h, [src, #-32]
	ldp	C_l, C_h, [src, #-48]
	stp	B_l, B_h, [dst, #-32]
	stp	C_l, C_h, [dst, #-48]
	ldp	D_l, D_h, [src, #-64]!
	stp	D_l, D_h, [dst, #-64]!

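	/*
	 * count was biased by -128 above; since 128 is a multiple of
	 * 64, its low 6 bits still hold the 0-63 byte remainder, which
	 * is all .Ltail63 needs.
	 */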
	tst	count, #0x3f
	b.ne	.Ltail63
	ret

	/*
	 * Critical loop. Start at a new cache line boundary. Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.
	 */
	.p2align	L1_CACHE_SHIFT
.Lcpy_body_large:
	/* Pre-load 64 bytes of data. */
	ldp	A_l, A_h, [src, #-16]
	ldp	B_l, B_h, [src, #-32]
	ldp	C_l, C_h, [src, #-48]
	ldp	D_l, D_h, [src, #-64]!
1:
	/*
	 * Interleave the loads of the next 64-byte block with the
	 * stores of the previously loaded 64 bytes.
	 */
	stp	A_l, A_h, [dst, #-16]
	ldp	A_l, A_h, [src, #-16]
	stp	B_l, B_h, [dst, #-32]
	ldp	B_l, B_h, [src, #-32]
	stp	C_l, C_h, [dst, #-48]
	ldp	C_l, C_h, [src, #-48]
	stp	D_l, D_h, [dst, #-64]!
	ldp	D_l, D_h, [src, #-64]!
	subs	count, count, #64
	b.ge	1b
	stp	A_l, A_h, [dst, #-16]
	stp	B_l, B_h, [dst, #-32]
	stp	C_l, C_h, [dst, #-48]
	stp	D_l, D_h, [dst, #-64]!

	tst	count, #0x3f
	b.ne	.Ltail63
	ret
SYM_FUNC_END_PI(memmove)
EXPORT_SYMBOL(memmove)
SYM_FUNC_END_ALIAS(__memmove)
EXPORT_SYMBOL(__memmove)