^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1) /* SPDX-License-Identifier: GPL-2.0 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3) * "memcpy" implementation of SuperH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5) * Copyright (C) 1999 Niibe Yutaka
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6) * Copyright (c) 2002 STMicroelectronics Ltd
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7) * Modified from memcpy.S and micro-optimised for SH4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8) * Stuart Menefy (stuart.menefy@st.com)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 10) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 11) #include <linux/linkage.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 12)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 13) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 14) * void *memcpy(void *dst, const void *src, size_t n);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 15) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 16) * It is assumed that there is no overlap between src and dst.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 17) * If there is an overlap, then the results are undefined.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 18) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 19)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 20) !
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 21) ! GHIJ KLMN OPQR --> ...G HIJK LMNO PQR.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 22) !
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 23)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 24) ! Size is 16 or greater, and may have trailing bytes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 25)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 26) .balign 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 27) .Lcase1: ! reached from memcpy when bit 0 of (src - dst) is set and bit 1 is clear
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 28) ! Read a long word and write a long word at once
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 29) ! At the start of each iteration, r7 contains last long load
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 30) add #-1,r5 ! 79 EX ! r0 is long aligned on entry; this makes r0+r5 long aligned too
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 31) mov r4,r2 ! 5 MT (0 cycles latency)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 32)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 33) mov.l @(r0,r5),r7 ! 21 LS (2 cycles latency) ! prime r7 with the topmost source long
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 34) add #-4,r5 ! 50 EX ! loop loads fetch the long below the one held in r7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 35)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 36) add #7,r2 ! 79 EX ! r2 = dst + 7, limit for the long word loop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 37) !
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 38) #ifdef CONFIG_CPU_LITTLE_ENDIAN
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 39) ! 6 cycles, 4 bytes per iteration
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 40) 3: mov.l @(r0,r5),r1 ! 21 LS (latency=2) ! NMLK
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 41) mov r7, r3 ! 5 MT (latency=0) ! RQPO
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 42)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 43) cmp/hi r2,r0 ! 57 MT ! T = (r0 > dst + 7), consumed by bt/s below
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 44) shll16 r3 ! 103 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 45)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 46) mov r1,r6 ! 5 MT (latency=0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 47) shll8 r3 ! 102 EX ! Oxxx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 48)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 49) shlr8 r6 ! 106 EX ! xNML
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 50) mov r1, r7 ! 5 MT (latency=0) ! keep this load for the next iteration
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 51)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 52) or r6,r3 ! 82 EX ! ONML
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 53) bt/s 3b ! 109 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 54)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 55) mov.l r3,@-r0 ! 30 LS ! delay slot: store merged long, r0 -= 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 56) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 57) 3: mov.l @(r0,r5),r1 ! 21 LS (latency=2) ! KLMN
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 58) mov r7,r3 ! 5 MT (latency=0) ! OPQR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 59)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 60) cmp/hi r2,r0 ! 57 MT ! T = (r0 > dst + 7), consumed by bt/s below
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 61) shlr16 r3 ! 107 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 62)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 63) shlr8 r3 ! 106 EX ! xxxO
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 64) mov r1,r6 ! 5 MT (latency=0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 65)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 66) shll8 r6 ! 102 EX ! LMNx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 67) mov r1,r7 ! 5 MT (latency=0) ! keep this load for the next iteration
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 68)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 69) or r6,r3 ! 82 EX ! LMNO
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 70) bt/s 3b ! 109 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 71)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 72) mov.l r3,@-r0 ! 30 LS ! delay slot: store merged long, r0 -= 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 73) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 74) ! Finally, copy a byte at once, if necessary
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 75)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 76) add #4,r5 ! 50 EX ! undo the -4 only: r0+r5 is the source byte for the byte stored at r0-1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 77) cmp/eq r4,r0 ! 54 MT ! T = (r0 == dst): no trailing bytes left
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 78)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 79) add #-6,r2 ! 50 EX ! r2 = dst + 1, limit for the byte loop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 80) bt 9f ! 109 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 81)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 82) 8: cmp/hi r2,r0 ! 57 MT ! T = (r0 > dst + 1): another byte follows this one
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 83) mov.b @(r0,r5),r1 ! 20 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 84)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 85) bt/s 8b ! 109 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 86)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 87) mov.b r1,@-r0 ! 29 LS ! delay slot: store byte, r0 -= 1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 88)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 89) 9: rts
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 90) nop ! delay slot of rts
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 91)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 92)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 93) !
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 94) ! GHIJ KLMN OPQR --> .GHI JKLM NOPQ R...
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 95) !
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 96)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 97) ! Size is 16 or greater, and may have trailing bytes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 98)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 99) .balign 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 100) .Lcase3: ! reached from memcpy when bits 0 and 1 of (src - dst) are both set
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 101) ! Read a long word and write a long word at once
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 102) ! At the start of each iteration, r7 contains last long load
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 103) add #-3,r5 ! 79 EX ! r0 is long aligned on entry; this makes r0+r5 long aligned too
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 104) mov r4,r2 ! 5 MT (0 cycles latency)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 105)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 106) mov.l @(r0,r5),r7 ! 21 LS (2 cycles latency) ! prime r7 with the topmost source long
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 107) add #-4,r5 ! 50 EX ! loop loads fetch the long below the one held in r7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 108)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 109) add #7,r2 ! 79 EX ! r2 = dst + 7, limit for the long word loop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 110) !
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 111) #ifdef CONFIG_CPU_LITTLE_ENDIAN
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 112) ! 6 cycles, 4 bytes per iteration
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 113) 3: mov.l @(r0,r5),r1 ! 21 LS (latency=2) ! NMLK
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 114) mov r7, r3 ! 5 MT (latency=0) ! RQPO
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 115)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 116) cmp/hi r2,r0 ! 57 MT ! T = (r0 > dst + 7), consumed by bt/s below
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 117) shll8 r3 ! 102 EX ! QPOx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 118)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 119) mov r1,r6 ! 5 MT (latency=0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 120) shlr16 r6 ! 107 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 121)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 122) shlr8 r6 ! 106 EX ! xxxN
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 123) mov r1, r7 ! 5 MT (latency=0) ! keep this load for the next iteration
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 124)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 125) or r6,r3 ! 82 EX ! QPON
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 126) bt/s 3b ! 109 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 127)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 128) mov.l r3,@-r0 ! 30 LS ! delay slot: store merged long, r0 -= 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 129) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 130) 3: mov r7,r3 ! OPQR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 131) shlr8 r3 ! xOPQ
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 132) mov.l @(r0,r5),r7 ! KLMN
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 133) mov r7,r6 ! r6 = copy of the new load; r7 keeps it for the next iteration
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 134) shll16 r6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 135) shll8 r6 ! Nxxx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 136) or r6,r3 ! NOPQ
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 137) cmp/hi r2,r0 ! T = (r0 > dst + 7), consumed by bt/s below
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 138) bt/s 3b
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 139) mov.l r3,@-r0 ! delay slot: store merged long, r0 -= 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 140) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 141)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 142) ! Finally, copy a byte at once, if necessary
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 143)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 144) add #6,r5 ! 50 EX ! net -1 after the -3 and -4: r0+r5 is the source byte for the byte stored at r0-1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 145) cmp/eq r4,r0 ! 54 MT ! T = (r0 == dst): no trailing bytes left
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 146)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 147) add #-6,r2 ! 50 EX ! r2 = dst + 1, limit for the byte loop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 148) bt 9f ! 109 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 149)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 150) 8: cmp/hi r2,r0 ! 57 MT ! T = (r0 > dst + 1): another byte follows this one
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 151) mov.b @(r0,r5),r1 ! 20 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 152)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 153) bt/s 8b ! 109 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 154)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 155) mov.b r1,@-r0 ! 29 LS ! delay slot: store byte, r0 -= 1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 156)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 157) 9: rts
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 158) nop ! delay slot of rts
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 159)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 160) ENTRY(memcpy)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 161)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 162) ! Calculate the invariants which will be used in the remainder
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 163) ! of the code:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 164) !
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 165) ! r4 --> [ ... ] DST [ ... ] SRC
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 166) ! [ ... ] [ ... ]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 167) ! : :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 168) ! r0 --> [ ... ] r0+r5 --> [ ... ]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 169) !
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 170) !
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 171)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 172) ! Short circuit the common case of src, dst and len being 32 bit aligned
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 173) ! and test for zero length move
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 174)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 175) mov r6, r0 ! 5 MT (0 cycle latency)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 176) or r4, r0 ! 82 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 177)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 178) or r5, r0 ! 82 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 179) tst r6, r6 ! 86 MT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 180)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 181) bt/s 99f ! 111 BR (zero len)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 182) tst #3, r0 ! 87 MT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 183)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 184) mov r4, r0 ! 5 MT (0 cycle latency)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 185) add r6, r0 ! 49 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 186)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 187) mov #16, r1 ! 6 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 188) bt/s .Lcase00 ! 111 BR (aligned)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 189)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 190) sub r4, r5 ! 75 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 191)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 192) ! Arguments are not nicely long word aligned or zero len.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 193) ! Check for small copies, and if so do a simple byte at a time copy.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 194) !
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 195) ! Deciding on an exact value of 'small' is not easy, as the point at which
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 196) ! using the optimised routines become worthwhile varies (these are the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 197) ! cycle counts for different sizes using byte-at-a-time vs. optimised):
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 198) ! size byte-at-time long word byte
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 199) ! 16 42 39-40 46-50 50-55
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 200) ! 24 58 43-44 54-58 62-67
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 201) ! 36 82 49-50 66-70 80-85
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 202) ! However the penalty for getting it 'wrong' is much higher for long word
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 203) ! aligned data (and this is more common), so use a value of 16.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 204)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 205) cmp/gt r6,r1 ! 56 MT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 206)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 207) add #-1,r5 ! 50 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 208) bf/s 6f ! 108 BR (not small)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 209)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 210) mov r5, r3 ! 5 MT (latency=0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 211) shlr r6 ! 104 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 212)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 213) mov.b @(r0,r5),r1 ! 20 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 214) bf/s 4f ! 111 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 215)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 216) add #-1,r3 ! 50 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 217) tst r6, r6 ! 86 MT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 218)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 219) bt/s 98f ! 110 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 220) mov.b r1,@-r0 ! 29 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 221)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 222) ! 4 cycles, 2 bytes per iteration
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 223) 3: mov.b @(r0,r5),r1 ! 20 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 224)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 225) 4: mov.b @(r0,r3),r2 ! 20 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 226) dt r6 ! 67 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 227)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 228) mov.b r1,@-r0 ! 29 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 229) bf/s 3b ! 111 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 230)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 231) mov.b r2,@-r0 ! 29 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 232) 98:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 233) rts
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 234) nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 235)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 236) 99: rts
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 237) mov r4, r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 238)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 239) ! Size is not small, so it's worthwhile looking for optimisations.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 240) ! First align destination to a long word boundary.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 241) !
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 242) ! r5 = normal value -1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 243)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 244) 6: tst #3, r0 ! 87 MT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 245) mov #3, r3 ! 6 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 246)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 247) bt/s 2f ! 111 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 248) and r0,r3 ! 78 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 249)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 250) ! 3 cycles, 1 byte per iteration
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 251) 1: dt r3 ! 67 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 252) mov.b @(r0,r5),r1 ! 19 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 253)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 254) add #-1, r6 ! 79 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 255) bf/s 1b ! 109 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 256)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 257) mov.b r1,@-r0 ! 28 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 258)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 259) 2: add #1, r5 ! 79 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 260)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 261) ! Now select the appropriate bulk transfer code based on relative
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 262) ! alignment of src and dst.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 263)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 264) mov r0, r3 ! 5 MT (latency=0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 265)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 266) mov r5, r0 ! 5 MT (latency=0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 267) tst #1, r0 ! 87 MT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 268)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 269) bf/s 1f ! 111 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 270) mov #64, r7 ! 6 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 271)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 272) ! bit 0 clear
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 273)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 274) cmp/ge r7, r6 ! 55 MT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 275)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 276) bt/s 2f ! 111 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 277) tst #2, r0 ! 87 MT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 278)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 279) ! small
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 280) bt/s .Lcase0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 281) mov r3, r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 282)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 283) bra .Lcase2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 284) nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 285)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 286) ! big
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 287) 2: bt/s .Lcase0b
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 288) mov r3, r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 289)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 290) bra .Lcase2b
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 291) nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 292)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 293) ! bit 0 set
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 294) 1: tst #2, r0 ! 87 MT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 295)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 296) bt/s .Lcase1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 297) mov r3, r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 298)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 299) bra .Lcase3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 300) nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 301)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 302)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 303) !
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 304) ! GHIJ KLMN OPQR --> GHIJ KLMN OPQR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 305) !
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 306)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 307) ! src, dst and size are all long word aligned
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 308) ! size is non-zero
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 309)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 310) .balign 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 311) .Lcase00:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 312) mov #64, r1 ! 6 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 313) mov r5, r3 ! 5 MT (latency=0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 314)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 315) cmp/gt r6, r1 ! 56 MT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 316) add #-4, r5 ! 50 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 317)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 318) bf .Lcase00b ! 108 BR (big loop)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 319) shlr2 r6 ! 105 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 320)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 321) shlr r6 ! 104 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 322) mov.l @(r0, r5), r1 ! 21 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 323)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 324) bf/s 4f ! 111 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 325) add #-8, r3 ! 50 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 326)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 327) tst r6, r6 ! 86 MT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 328) bt/s 5f ! 110 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 329)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 330) mov.l r1,@-r0 ! 30 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 331)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 332) ! 4 cycles, 2 long words per iteration
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 333) 3: mov.l @(r0, r5), r1 ! 21 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 334)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 335) 4: mov.l @(r0, r3), r2 ! 21 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 336) dt r6 ! 67 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 337)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 338) mov.l r1, @-r0 ! 30 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 339) bf/s 3b ! 109 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 340)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 341) mov.l r2, @-r0 ! 30 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 342)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 343) 5: rts
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 344) nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 345)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 346)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 347) ! Size is 16 or greater and less than 64, but may have trailing bytes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 348)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 349) .balign 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 350) .Lcase0:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 351) add #-4, r5 ! 50 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 352) mov r4, r7 ! 5 MT (latency=0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 353)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 354) mov.l @(r0, r5), r1 ! 21 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 355) mov #4, r2 ! 6 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 356)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 357) add #11, r7 ! 50 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 358) tst r2, r6 ! 86 MT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 359)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 360) mov r5, r3 ! 5 MT (latency=0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 361) bt/s 4f ! 111 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 362)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 363) add #-4, r3 ! 50 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 364) mov.l r1,@-r0 ! 30 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 365)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 366) ! 4 cycles, 2 long words per iteration
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 367) 3: mov.l @(r0, r5), r1 ! 21 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 368)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 369) 4: mov.l @(r0, r3), r2 ! 21 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 370) cmp/hi r7, r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 371)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 372) mov.l r1, @-r0 ! 30 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 373) bt/s 3b ! 109 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 374)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 375) mov.l r2, @-r0 ! 30 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 376)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 377) ! Copy the final 0-3 bytes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 378)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 379) add #3,r5 ! 50 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 380)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 381) cmp/eq r0, r4 ! 54 MT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 382) add #-10, r7 ! 50 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 383)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 384) bt 9f ! 110 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 385)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 386) ! 3 cycles, 1 byte per iteration
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 387) 1: mov.b @(r0,r5),r1 ! 19 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 388) cmp/hi r7,r0 ! 57 MT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 389)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 390) bt/s 1b ! 111 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 391) mov.b r1,@-r0 ! 28 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 392)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 393) 9: rts
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 394) nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 395)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 396) ! Size is at least 64 bytes, so will be going round the big loop at least once.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 397) !
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 398) ! r2 = rounded up r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 399) ! r3 = rounded down r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 400)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 401) .balign 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 402) .Lcase0b:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 403) add #-4, r5 ! 50 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 404)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 405) .Lcase00b:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 406) mov r0, r3 ! 5 MT (latency=0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 407) mov #(~0x1f), r1 ! 6 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 408)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 409) and r1, r3 ! 78 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 410) mov r4, r2 ! 5 MT (latency=0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 411)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 412) cmp/eq r3, r0 ! 54 MT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 413) add #0x1f, r2 ! 50 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 414)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 415) bt/s 1f ! 110 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 416) and r1, r2 ! 78 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 417)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 418) ! copy initial words until cache line aligned
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 419)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 420) mov.l @(r0, r5), r1 ! 21 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 421) tst #4, r0 ! 87 MT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 422)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 423) mov r5, r6 ! 5 MT (latency=0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 424) add #-4, r6 ! 50 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 425)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 426) bt/s 4f ! 111 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 427) add #8, r3 ! 50 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 428)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 429) tst #0x18, r0 ! 87 MT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 430)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 431) bt/s 1f ! 109 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 432) mov.l r1,@-r0 ! 30 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 433)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 434) ! 4 cycles, 2 long words per iteration
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 435) 3: mov.l @(r0, r5), r1 ! 21 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 436)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 437) 4: mov.l @(r0, r6), r7 ! 21 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 438) cmp/eq r3, r0 ! 54 MT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 439)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 440) mov.l r1, @-r0 ! 30 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 441) bf/s 3b ! 109 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 442)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 443) mov.l r7, @-r0 ! 30 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 444)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 445) ! Copy the cache line aligned blocks
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 446) !
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 447) ! In use: r0, r2, r4, r5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 448) ! Scratch: r1, r3, r6, r7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 449) !
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 450) ! We could do this with the four scratch registers, but if src
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 451) ! and dest hit the same cache line, this will thrash, so make
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 452) ! use of additional registers.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 453) !
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 454) ! We also need r0 as a temporary (for movca), so 'undo' the invariant:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 455) ! r5: src (was r0+r5)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 456) ! r1: dest (was r0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 457) ! this can be reversed at the end, so we don't need to save any extra
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 458) ! state.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 459) !
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 460) 1: mov.l r8, @-r15 ! 30 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 461) add r0, r5 ! 49 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 462)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 463) mov.l r9, @-r15 ! 30 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 464) mov r0, r1 ! 5 MT (latency=0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 465)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 466) mov.l r10, @-r15 ! 30 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 467) add #-0x1c, r5 ! 50 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 468)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 469) mov.l r11, @-r15 ! 30 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 470)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 471) ! 16 cycles, 32 bytes per iteration
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 472) 2: mov.l @(0x00,r5),r0 ! 18 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 473) add #-0x20, r1 ! 50 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 474) mov.l @(0x04,r5),r3 ! 18 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 475) mov.l @(0x08,r5),r6 ! 18 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 476) mov.l @(0x0c,r5),r7 ! 18 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 477) mov.l @(0x10,r5),r8 ! 18 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 478) mov.l @(0x14,r5),r9 ! 18 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 479) mov.l @(0x18,r5),r10 ! 18 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 480) mov.l @(0x1c,r5),r11 ! 18 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 481) movca.l r0,@r1 ! 40 LS (latency=3-7)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 482) mov.l r3,@(0x04,r1) ! 33 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 483) mov.l r6,@(0x08,r1) ! 33 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 484) mov.l r7,@(0x0c,r1) ! 33 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 485)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 486) mov.l r8,@(0x10,r1) ! 33 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 487) add #-0x20, r5 ! 50 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 488)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 489) mov.l r9,@(0x14,r1) ! 33 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 490) cmp/eq r2,r1 ! 54 MT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 491)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 492) mov.l r10,@(0x18,r1) ! 33 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 493) bf/s 2b ! 109 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 494)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 495) mov.l r11,@(0x1c,r1) ! 33 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 496)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 497) mov r1, r0 ! 5 MT (latency=0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 498)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 499) mov.l @r15+, r11 ! 15 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 500) sub r1, r5 ! 75 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 501)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 502) mov.l @r15+, r10 ! 15 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 503) cmp/eq r4, r0 ! 54 MT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 504)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 505) bf/s 1f ! 109 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 506) mov.l @r15+, r9 ! 15 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 507)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 508) rts
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 509) 1: mov.l @r15+, r8 ! 15 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 510) sub r4, r1 ! 75 EX (len remaining)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 511)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 512) ! number of trailing bytes is non-zero
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 513) !
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 514) ! invariants restored (r5 already decremented by 4)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 515) ! also r1=num bytes remaining
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 516)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 517) mov #4, r2 ! 6 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 518) mov r4, r7 ! 5 MT (latency=0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 519)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 520) add #0x1c, r5 ! 50 EX (back to -4)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 521) cmp/hs r2, r1 ! 58 MT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 522)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 523) bf/s 5f ! 108 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 524) add #11, r7 ! 50 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 525)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 526) mov.l @(r0, r5), r6 ! 21 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 527) tst r2, r1 ! 86 MT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 528)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 529) mov r5, r3 ! 5 MT (latency=0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 530) bt/s 4f ! 111 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 531)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 532) add #-4, r3 ! 50 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 533) cmp/hs r2, r1 ! 58 MT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 534)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 535) bt/s 5f ! 111 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 536) mov.l r6,@-r0 ! 30 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 537)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 538) ! 4 cycles, 2 long words per iteration
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 539) 3: mov.l @(r0, r5), r6 ! 21 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 540)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 541) 4: mov.l @(r0, r3), r2 ! 21 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 542) cmp/hi r7, r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 543)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 544) mov.l r6, @-r0 ! 30 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 545) bt/s 3b ! 109 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 546)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 547) mov.l r2, @-r0 ! 30 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 548)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 549) ! Copy the final 0-3 bytes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 550)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 551) 5: cmp/eq r0, r4 ! 54 MT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 552) add #-10, r7 ! 50 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 553)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 554) bt 9f ! 110 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 555) add #3,r5 ! 50 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 556)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 557) ! 3 cycles, 1 byte per iteration
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 558) 1: mov.b @(r0,r5),r1 ! 19 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 559) cmp/hi r7,r0 ! 57 MT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 560)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 561) bt/s 1b ! 111 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 562) mov.b r1,@-r0 ! 28 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 563)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 564) 9: rts
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 565) nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 566)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 567) !
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 568) ! GHIJ KLMN OPQR --> ..GH IJKL MNOP QR..
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 569) !
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 570)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 571) .balign 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 572) .Lcase2:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 573) ! Size is 16 or greater and less than 64, but may have trailing bytes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 574) ! Copies downwards, 4 bytes per pass as two 16-bit words: r0 = dest cursor, r0+r5 = src cursor
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 575) 2: mov r5, r6 ! 5 MT (latency=0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 576) add #-2,r5 ! 50 EX (r0+r5 = word just below the src cursor)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 577)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 578) mov r4,r2 ! 5 MT (latency=0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 579) add #-4,r6 ! 50 EX (r0+r6 = the word below that one)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 580)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 581) add #7,r2 ! 50 EX (loop bound: r4+7)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 582) 3: mov.w @(r0,r5),r1 ! 20 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 583)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 584) mov.w @(r0,r6),r3 ! 20 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 585) cmp/hi r2,r0 ! 57 MT (T = r0 > r4+7, tested before r0 is decremented)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 586)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 587) mov.w r1,@-r0 ! 29 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 588) bt/s 3b ! 111 BR (repeat while at least 4 bytes will remain)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 589)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 590) mov.w r3,@-r0 ! 29 LS (delay slot: executed on every pass)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 591) ! On exit r0 <= r4+3; the tail at 10: copies whatever is left
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 592) bra 10f ! (r5 is still "invariant - 2", as 10: requires)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 593) nop ! (delay slot)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 594)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 595)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 596) .balign 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 597) .Lcase2b:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 598) ! Size is at least 64 bytes, so will be going round the big loop at least once.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 599) !
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 600) ! r2 = rounded up r4 (to a multiple of 32; end condition of the big loop)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 601) ! r3 = rounded down r0 (to a multiple of 32; end condition of the alignment loop)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 602)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 603) mov r0, r3 ! 5 MT (latency=0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 604) mov #(~0x1f), r1 ! 6 EX (mask that clears the low 5 bits)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 605)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 606) and r1, r3 ! 78 EX (r3 = r0 rounded down to 32 bytes)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 607) mov r4, r2 ! 5 MT (latency=0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 608)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 609) cmp/eq r3, r0 ! 54 MT (T = r0 is already 32-byte aligned)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 610) add #0x1f, r2 ! 50 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 611)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 612) add #-2, r5 ! 50 EX (r0+r5 = word just below the src cursor)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 613) bt/s 1f ! 110 BR (aligned: skip the alignment loop)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 614) and r1, r2 ! 78 EX (delay slot: r2 = r4 rounded up to 32 bytes)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 615)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 616) ! Copy a short word one at a time until we are cache line aligned
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 617) ! Normal values: r0, r2, r3, r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 618) ! Unused: r1, r6, r7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 619) ! Mod: r5 (=r5-2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 620) !
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 621) add #2, r3 ! 50 EX (r0 is compared before its pre-decrement by 2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 622)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 623) 2: mov.w @(r0,r5),r1 ! 20 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 624) cmp/eq r3,r0 ! 54 MT (T = this store lands on the 32-byte boundary)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 625)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 626) bf/s 2b ! 111 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 627)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 628) mov.w r1,@-r0 ! 29 LS (delay slot: executed on every pass)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 629)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 630) ! Copy the cache line aligned blocks
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 631) !
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 632) ! In use: r0, r2, r4, r5 (=r5-2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 633) ! Scratch: r1, r3, r6, r7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 634) !
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 635) ! We could do this with the four scratch registers, but if src
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 636) ! and dest hit the same cache line, this will thrash, so make
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 637) ! use of additional registers.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 638) !
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 639) ! We also need r0 as a temporary (for movca), so 'undo' the invariant:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 640) ! r5: src (was r0+r5)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 641) ! r1: dest (was r0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 642) ! this can be reversed at the end, so we don't need to save any extra
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 643) ! state.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 644) !
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 645) 1: mov.l r8, @-r15 ! 30 LS (push r8-r12; popped in reverse order after the loop)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 646) add r0, r5 ! 49 EX (r5 becomes an absolute address: src cursor - 2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 647)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 648) mov.l r9, @-r15 ! 30 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 649) mov r0, r1 ! 5 MT (latency=0) (r1 = dest cursor)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 650)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 651) mov.l r10, @-r15 ! 30 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 652) add #-0x1e, r5 ! 50 EX (r5 = src cursor - 0x20: bottom of the next 32-byte block)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 653)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 654) mov.l r11, @-r15 ! 30 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 655)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 656) mov.l r12, @-r15 ! 30 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 657)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 658) ! 17 cycles, 32 bytes per iteration
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 659) #ifdef CONFIG_CPU_LITTLE_ENDIAN
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 660) 2: mov.w @r5+, r0 ! 14 LS (latency=2) ..JI
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 661) add #-0x20, r1 ! 50 EX (step dest down one 32-byte line)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 662)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 663) mov.l @r5+, r3 ! 15 LS (latency=2) NMLK
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 664)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 665) mov.l @r5+, r6 ! 15 LS (latency=2) RQPO
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 666) shll16 r0 ! 103 EX JI..
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 667)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 668) mov.l @r5+, r7 ! 15 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 669) xtrct r3, r0 ! 48 EX LKJI
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 670)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 671) mov.l @r5+, r8 ! 15 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 672) xtrct r6, r3 ! 48 EX PONM
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 673)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 674) mov.l @r5+, r9 ! 15 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 675) xtrct r7, r6 ! 48 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 676)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 677) mov.l @r5+, r10 ! 15 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 678) xtrct r8, r7 ! 48 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 679)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 680) mov.l @r5+, r11 ! 15 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 681) xtrct r9, r8 ! 48 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 682)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 683) mov.w @r5+, r12 ! 15 LS (latency=2) (top 2 bytes of the block; xtrct uses only the low half)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 684) xtrct r10, r9 ! 48 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 685)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 686) movca.l r0,@r1 ! 40 LS (latency=3-7) (first store to the line; movca takes its data from r0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 687) xtrct r11, r10 ! 48 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 688)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 689) mov.l r3, @(0x04,r1) ! 33 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 690) xtrct r12, r11 ! 48 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 691)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 692) mov.l r6, @(0x08,r1) ! 33 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 693)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 694) mov.l r7, @(0x0c,r1) ! 33 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 695)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 696) mov.l r8, @(0x10,r1) ! 33 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 697) add #-0x40, r5 ! 50 EX (undo the 0x20 of post-increments, then step down 0x20)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 698)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 699) mov.l r9, @(0x14,r1) ! 33 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 700) cmp/eq r2,r1 ! 54 MT (T = dest has reached r4 rounded up)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 701)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 702) mov.l r10, @(0x18,r1) ! 33 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 703) bf/s 2b ! 109 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 704)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 705) mov.l r11, @(0x1c,r1) ! 33 LS (delay slot: executed on every pass)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 706) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 707) 2: mov.w @(0x1e,r5), r0 ! 17 LS (latency=2) (top 2 bytes of the 32-byte source block)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 708) add #-2, r5 ! 50 EX (the long loads below are relative to block start - 2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 709)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 710) mov.l @(0x1c,r5), r3 ! 18 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 711) add #-4, r1 ! 50 EX (r1 = top long of the dest line, written first by movca)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 712)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 713) mov.l @(0x18,r5), r6 ! 18 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 714) shll16 r0 ! 103 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 715)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 716) mov.l @(0x14,r5), r7 ! 18 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 717) xtrct r3, r0 ! 48 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 718)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 719) mov.l @(0x10,r5), r8 ! 18 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 720) xtrct r6, r3 ! 48 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 721)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 722) mov.l @(0x0c,r5), r9 ! 18 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 723) xtrct r7, r6 ! 48 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 724)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 725) mov.l @(0x08,r5), r10 ! 18 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 726) xtrct r8, r7 ! 48 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 727)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 728) mov.l @(0x04,r5), r11 ! 18 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 729) xtrct r9, r8 ! 48 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 730)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 731) mov.l @(0x00,r5), r12 ! 18 LS (latency=2) (long starting 2 bytes below the block; xtrct uses only its low half)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 732) xtrct r10, r9 ! 48 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 733)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 734) movca.l r0,@r1 ! 40 LS (latency=3-7) (first store to the line; movca takes its data from r0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 735) add #-0x1c, r1 ! 50 EX (r1 = base of the dest line; -4 and -0x1c give -0x20 per pass)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 736)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 737) mov.l r3, @(0x18,r1) ! 33 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 738) xtrct r11, r10 ! 48 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 739)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 740) mov.l r6, @(0x14,r1) ! 33 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 741) xtrct r12, r11 ! 48 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 742)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 743) mov.l r7, @(0x10,r1) ! 33 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 744)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 745) mov.l r8, @(0x0c,r1) ! 33 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 746) add #-0x1e, r5 ! 50 EX (with the -2 above: src steps down 0x20 per pass)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 747)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 748) mov.l r9, @(0x08,r1) ! 33 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 749) cmp/eq r2,r1 ! 54 MT (T = dest has reached r4 rounded up)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 750)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 751) mov.l r10, @(0x04,r1) ! 33 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 752) bf/s 2b ! 109 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 753)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 754) mov.l r11, @(0x00,r1) ! 33 LS (delay slot: executed on every pass)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 755) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 756)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 757) mov.l @r15+, r12 ! (pop r12..r8, reverse of the push order above)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 758) mov r1, r0 ! 5 MT (latency=0) (r0 = dest cursor again)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 759)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 760) mov.l @r15+, r11 ! 15 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 761) sub r1, r5 ! 75 EX (r5 back to an offset relative to the dest cursor)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 762)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 763) mov.l @r15+, r10 ! 15 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 764) cmp/eq r4, r0 ! 54 MT (T = nothing left to copy)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 765)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 766) bf/s 1f ! 109 BR (bytes remain: go finish them)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 767) mov.l @r15+, r9 ! 15 LS (delay slot: executed on both paths)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 768)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 769) rts ! (done, r0 == r4; the pop of r8 below is the delay slot)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 770) 1: mov.l @r15+, r8 ! 15 LS (both rts delay slot and bf/s target, so r8 is popped on either path)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 771)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 772) add #0x1e, r5 ! 50 EX (r5 = invariant - 2, as the tail at 10: requires)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 773)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 774) ! Finish off a short word at a time
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 775) ! r5 must be invariant - 2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 776) 10: mov r4,r2 ! 5 MT (latency=0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 777) add #1,r2 ! 50 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 778)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 779) cmp/hi r2, r0 ! 57 MT (T = r0 > r4+1: at least 2 bytes left)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 780) bf/s 1f ! 109 BR (fewer than 2 bytes: skip the word loop)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 781)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 782) add #2, r2 ! 50 EX (delay slot: loop bound becomes r4+3)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 783)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 784) 3: mov.w @(r0,r5),r1 ! 20 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 785) cmp/hi r2,r0 ! 57 MT (T = 2 or more bytes will remain after this store)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 786)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 787) bt/s 3b ! 109 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 788)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 789) mov.w r1,@-r0 ! 29 LS (delay slot: executed on every pass)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 790) 1:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 791)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 792) !
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 793) ! Finally, copy the last byte if necessary
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 794) cmp/eq r4,r0 ! 54 MT (T = nothing left to copy)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 795) bt/s 9b ! (done: return through the rts at 9: above)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 796) add #1,r5 ! (delay slot: r5 = invariant - 1, so r0+r5 is the byte below the src cursor)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 797) mov.b @(r0,r5),r1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 798) rts
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 799) mov.b r1,@-r0 ! (rts delay slot: stores the final byte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 800)