/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on the glibc cortex strings work originally authored by
 * Linaro, which can be found @
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 */


/*
 * Copy a buffer from src to dest (alignment handled by the hardware)
 *
 * Parameters:
 *	x0 - dest
 *	x1 - src
 *	x2 - n
 * Returns:
 *	x0 - dest
 */
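/*
 * Note: ldrb1/strb1, ldrh1/strh1, ldr1/str1 and ldp1/stp1 are not
 * architectural instructions. The file that includes this template is
 * expected to define them as macros, typically post-indexed loads/stores
 * plus whatever fault fixup the includer needs. A minimal sketch of one
 * such definition (an assumption for illustration, not necessarily the
 * includer's exact macro):
 *
 *	.macro ldp1 reg1, reg2, ptr, val
 *	ldp \reg1, \reg2, [\ptr], \val
 *	.endm
 */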
dstin	.req	x0
src	.req	x1
count	.req	x2
tmp1	.req	x3
tmp1w	.req	w3
tmp2	.req	x4
tmp2w	.req	w4
dst	.req	x6

A_l	.req	x7
A_h	.req	x8
B_l	.req	x9
B_h	.req	x10
C_l	.req	x11
C_h	.req	x12
D_l	.req	x13
D_h	.req	x14

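/*
 * dstin (x0) is deliberately left untouched so it can be returned as the
 * result; all copying advances the working pointers dst (x6) and src (x1)
 * instead.
 */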
	mov	dst, dstin
	cmp	count, #16
	/* When the copy length is less than 16, the accesses are not aligned. */
	b.lo	.Ltiny15

	neg	tmp2, src
	ands	tmp2, tmp2, #15	/* Bytes to reach alignment. */
	b.eq	.LSrcAligned
	sub	count, count, tmp2
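	/*
	 * A worked example of the computation above: neg gives -src, and
	 * (-src) & 15 equals (16 - (src & 15)) & 15, i.e. the distance to
	 * the next 16-byte boundary. For src = 0x1003, src & 15 = 3, so
	 * tmp2 = 13 bytes must be copied before src becomes 16-byte
	 * aligned.
	 */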
	/*
	 * Copy the leading data from src to dst in increasing address
	 * order. This way, the risk of overwriting source data is
	 * eliminated when the distance between src and dst is less than
	 * 16. The memory accesses here are aligned.
	 */
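	/*
	 * Each set bit of tmp2[3:0] selects one copy below, tested from
	 * bit 0 upwards so the copies proceed in increasing address order:
	 * e.g. tmp2 = 13 (0b1101) copies 1 + 4 + 8 bytes.
	 */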
	tbz	tmp2, #0, 1f
	ldrb1	tmp1w, src, #1
	strb1	tmp1w, dst, #1
1:
	tbz	tmp2, #1, 2f
	ldrh1	tmp1w, src, #2
	strh1	tmp1w, dst, #2
2:
	tbz	tmp2, #2, 3f
	ldr1	tmp1w, src, #4
	str1	tmp1w, dst, #4
3:
	tbz	tmp2, #3, .LSrcAligned
	ldr1	tmp1, src, #8
	str1	tmp1, dst, #8

.LSrcAligned:
	cmp	count, #64
	b.ge	.Lcpy_over64
	/*
	 * Deal with small copies quickly by dropping straight into the
	 * exit block.
	 */
.Ltail63:
	/*
	 * Copy up to 48 bytes of data. At this point we only need the
	 * bottom 6 bits of count to be accurate.
	 */
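	/*
	 * tmp1 = count & 0x30 is 0, 16, 32 or 48. Zero goes straight to
	 * .Ltiny15; 48 (gt) falls through all three ldp/stp pairs; 32 (eq)
	 * enters at 1:; 16 (lt) enters at 2:.
	 */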
	ands	tmp1, count, #0x30
	b.eq	.Ltiny15
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
1:
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
2:
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
.Ltiny15:
	/*
	 * Prefer to break one ldp/stp into several loads/stores that
	 * access memory in increasing address order, rather than loading
	 * 16 bytes from (src-16) to (dst-16) and winding src back to an
	 * aligned address, as the original cortex memcpy does. If the
	 * original scheme were kept, memmove would have to satisfy the
	 * precondition that src is at least 16 bytes above dst; otherwise
	 * some source data would be overwritten when memmove calls memcpy
	 * directly. To keep memmove simple and decouple memcpy from
	 * memmove, that scheme was withdrawn.
	 */
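	/*
	 * Each set bit of count[3:0] selects one copy below, largest
	 * first: e.g. a remaining count of 13 (0b1101) copies 8 + 4 + 1
	 * bytes, still in increasing address order.
	 */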
	tbz	count, #3, 1f
	ldr1	tmp1, src, #8
	str1	tmp1, dst, #8
1:
	tbz	count, #2, 2f
	ldr1	tmp1w, src, #4
	str1	tmp1w, dst, #4
2:
	tbz	count, #1, 3f
	ldrh1	tmp1w, src, #2
	strh1	tmp1w, dst, #2
3:
	tbz	count, #0, .Lexitfunc
	ldrb1	tmp1w, src, #1
	strb1	tmp1w, dst, #1

	b	.Lexitfunc

.Lcpy_over64:
	subs	count, count, #128
	b.ge	.Lcpy_body_large
	/*
	 * Less than 128 bytes to copy, so handle 64 here and then jump
	 * to the tail.
	 */
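	/*
	 * Note that the B and C loads below are issued before the B store;
	 * presumably a scheduling choice to overlap load latency with the
	 * stores rather than strictly alternating load/store pairs.
	 */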
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
	ldp1	B_l, B_h, src, #16
	ldp1	C_l, C_h, src, #16
	stp1	B_l, B_h, dst, #16
	stp1	C_l, C_h, dst, #16
	ldp1	D_l, D_h, src, #16
	stp1	D_l, D_h, dst, #16

	tst	count, #0x3f
	b.ne	.Ltail63
	b	.Lexitfunc

	/*
	 * Critical loop. Start at a new cache line boundary. Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.
	 */
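	/*
	 * L1_CACHE_SHIFT comes from asm/cache.h; with the typical arm64
	 * value of 6 (64-byte lines), .p2align pads to a 2^6 = 64-byte
	 * boundary.
	 */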
	.p2align	L1_CACHE_SHIFT
.Lcpy_body_large:
	/* Preload the first 64 bytes of data. */
	ldp1	A_l, A_h, src, #16
	ldp1	B_l, B_h, src, #16
	ldp1	C_l, C_h, src, #16
	ldp1	D_l, D_h, src, #16
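	/*
	 * count was already reduced by 128 at .Lcpy_over64: 64 for the
	 * block just preloaded and 64 for the final stores after the loop,
	 * so the loop below may keep iterating while count >= 0.
	 */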
1:
	/*
	 * Interleave the load of the next 64-byte data block with the
	 * store of the previously loaded 64 bytes.
	 */
	stp1	A_l, A_h, dst, #16
	ldp1	A_l, A_h, src, #16
	stp1	B_l, B_h, dst, #16
	ldp1	B_l, B_h, src, #16
	stp1	C_l, C_h, dst, #16
	ldp1	C_l, C_h, src, #16
	stp1	D_l, D_h, dst, #16
	ldp1	D_l, D_h, src, #16
	subs	count, count, #64
	b.ge	1b
	stp1	A_l, A_h, dst, #16
	stp1	B_l, B_h, dst, #16
	stp1	C_l, C_h, dst, #16
	stp1	D_l, D_h, dst, #16

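	/*
	 * Only multiples of 64 were subtracted from count, so its low six
	 * bits still hold the tail length (count itself may be negative
	 * here); .Ltail63 only relies on those bits.
	 */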
	tst	count, #0x3f
	b.ne	.Ltail63
.Lexitfunc: