Orange Pi5 kernel

Deprecated Linux kernel 5.10.110 for OrangePi 5/5B/5+ boards

3 Commits   0 Branches   0 Tags
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   1) /* SPDX-License-Identifier: GPL-2.0 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   2) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   3)  * "memcpy" implementation of SuperH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   4)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   5)  * Copyright (C) 1999  Niibe Yutaka
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   6)  * Copyright (c) 2002  STMicroelectronics Ltd
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   7)  *   Modified from memcpy.S and micro-optimised for SH4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   8)  *   Stuart Menefy (stuart.menefy@st.com)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   9)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  10)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  11) #include <linux/linkage.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  12) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  13) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  14)  * void *memcpy(void *dst, const void *src, size_t n);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  15)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  16)  * It is assumed that there is no overlap between src and dst.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  17)  * If there is an overlap, then the results are undefined.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  18)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  19) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  20) 	!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  21) 	!	GHIJ KLMN OPQR -->  ...G HIJK LMNO PQR.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  22) 	!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  23) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  24) 	! Size is 16 or greater, and may have trailing bytes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  25) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  26) 	.balign	32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  27) .Lcase1:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  28) 	! Read a long word and write a long word at once, working down from r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  29) 	! At the start of each iteration, r7 contains last long load
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  30) 	add	#-1,r5		!  79 EX		! (src - dst) & 3 is 1 here, so r0+r5 becomes long word aligned
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  31) 	mov	r4,r2		!   5 MT (0 cycles latency)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  32) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  33) 	mov.l	@(r0,r5),r7	!  21 LS (2 cycles latency)	! prime r7 with the long word holding the last uncopied source byte
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  34) 	add	#-4,r5		!  50 EX		! r0+r5 now addresses the next lower source long word
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  35) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  36) 	add	#7,r2		!  79 EX		! r2 = dst + 7, limit for the long word loop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  37) 	! Each pass stores one long word at --r0 and repeats while the pre-store r0 > r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  38) #ifdef CONFIG_CPU_LITTLE_ENDIAN
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  39) 	! 6 cycles, 4 bytes per iteration
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  40) 3:	mov.l	@(r0,r5),r1	!  21 LS (latency=2)	! NMLK
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  41) 	mov	r7, r3		!   5 MT (latency=0)	! RQPO
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  42) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  43) 	cmp/hi	r2,r0		!  57 MT		! T = (r0 > r2), unsigned
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  44) 	shll16	r3		! 103 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  45) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  46) 	mov	r1,r6		!   5 MT (latency=0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  47) 	shll8	r3		! 102 EX		! Oxxx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  48) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  49) 	shlr8	r6		! 106 EX		! xNML
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  50) 	mov	r1, r7		!   5 MT (latency=0)	! keep this load for the next iteration
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  51) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  52) 	or	r6,r3		!  82 EX		! ONML
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  53) 	bt/s	3b		! 109 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  54) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  55) 	 mov.l	r3,@-r0		!  30 LS		! delay slot: executed whether or not the branch is taken
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  56) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  57) 3:	mov.l	@(r0,r5),r1	!  21 LS (latency=2)	! KLMN
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  58) 	mov	r7,r3		!   5 MT (latency=0)	! OPQR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  59) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  60) 	cmp/hi	r2,r0		!  57 MT		! T = (r0 > r2), unsigned
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  61) 	shlr16	r3		! 107 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  62) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  63) 	shlr8	r3		! 106 EX		! xxxO
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  64) 	mov	r1,r6		!   5 MT (latency=0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  65) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  66) 	shll8	r6		! 102 EX		! LMNx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  67) 	mov	r1,r7		!   5 MT (latency=0)	! keep this load for the next iteration
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  68) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  69) 	or	r6,r3		!  82 EX		! LMNO
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  70) 	bt/s	3b		! 109 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  71) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  72) 	 mov.l	r3,@-r0		!  30 LS		! delay slot: executed whether or not the branch is taken
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  73) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  74) 	! Finally, copy a byte at once, if necessary
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  75) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  76) 	add	#4,r5		!  50 EX		! r5 = src - dst - 1, so @(r0,r5) is the source byte for dst r0 - 1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  77) 	cmp/eq	r4,r0		!  54 MT		! T = 1 if r0 already reached dst
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  78) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  79) 	add	#-6,r2		!  50 EX		! r2 = dst + 1, limit for the byte loop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  80) 	bt	9f		! 109 BR		! no trailing bytes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  81) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  82) 8:	cmp/hi	r2,r0		!  57 MT		! T = (r0 > r2), unsigned
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  83) 	mov.b	@(r0,r5),r1	!  20 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  84) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  85) 	bt/s	8b		! 109 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  86) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  87) 	 mov.b	r1,@-r0		!  29 LS		! delay slot: executed whether or not the branch is taken
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  88) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  89) 9:	rts
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  90) 	 nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  91) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  92) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  93) 	!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  94) 	!	GHIJ KLMN OPQR -->  .GHI JKLM NOPQ R...
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  95) 	!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  96) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  97) 	! Size is 16 or greater, and may have trailing bytes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  98) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  99) 	.balign	32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 100) .Lcase3:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 101) 	! Read a long word and write a long word at once, working down from r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 102) 	! At the start of each iteration, r7 contains last long load
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 103) 	add	#-3,r5		! 79 EX		! (src - dst) & 3 is 3 here, so r0+r5 becomes long word aligned
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 104) 	mov	r4,r2		!  5 MT (0 cycles latency)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 105) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 106) 	mov.l	@(r0,r5),r7	! 21 LS (2 cycles latency)	! prime r7 with the long word holding the last uncopied source bytes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 107) 	add	#-4,r5		! 50 EX		! r0+r5 now addresses the next lower source long word
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 108) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 109) 	add	#7,r2		!  79 EX		! r2 = dst + 7, limit for the long word loop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 110) 	! Each pass stores one long word at --r0 and repeats while the pre-store r0 > r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 111) #ifdef CONFIG_CPU_LITTLE_ENDIAN
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 112) 	! 6 cycles, 4 bytes per iteration
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 113) 3:	mov.l	@(r0,r5),r1	!  21 LS (latency=2)	! NMLK
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 114) 	mov	r7, r3		!   5 MT (latency=0)	! RQPO
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 115) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 116) 	cmp/hi	r2,r0		!  57 MT		! T = (r0 > r2), unsigned
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 117) 	shll8	r3		! 102 EX		! QPOx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 118) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 119) 	mov	r1,r6		!   5 MT (latency=0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 120) 	shlr16	r6		! 107 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 121) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 122) 	shlr8	r6		! 106 EX		! xxxN
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 123) 	mov	r1, r7		!   5 MT (latency=0)	! keep this load for the next iteration
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 124) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 125) 	or	r6,r3		!  82 EX		! QPON
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 126) 	bt/s	3b		! 109 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 127) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 128) 	 mov.l	r3,@-r0		!  30 LS		! delay slot: executed whether or not the branch is taken
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 129) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 130) 3:	mov	r7,r3		! OPQR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 131) 	shlr8	r3		! xOPQ
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 132) 	mov.l	@(r0,r5),r7	! KLMN (also kept in r7 for the next iteration)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 133) 	mov	r7,r6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 134) 	shll16	r6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 135) 	shll8	r6		! Nxxx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 136) 	or	r6,r3		! NOPQ
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 137) 	cmp/hi	r2,r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 138) 	bt/s	3b
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 139) 	 mov.l	r3,@-r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 140) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 141) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 142) 	! Finally, copy a byte at once, if necessary
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 143) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 144) 	add	#6,r5		!  50 EX		! r5 = src - dst - 1, so @(r0,r5) is the source byte for dst r0 - 1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 145) 	cmp/eq	r4,r0		!  54 MT		! T = 1 if r0 already reached dst
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 146) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 147) 	add	#-6,r2		!  50 EX		! r2 = dst + 1, limit for the byte loop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 148) 	bt	9f		! 109 BR		! no trailing bytes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 149) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 150) 8:	cmp/hi	r2,r0		!  57 MT		! T = (r0 > r2), unsigned
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 151) 	mov.b	@(r0,r5),r1	!  20 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 152) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 153) 	bt/s	8b		! 109 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 154) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 155) 	 mov.b	r1,@-r0		!  29 LS		! delay slot: executed whether or not the branch is taken
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 156) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 157) 9:	rts
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 158) 	 nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 159) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 160) ENTRY(memcpy)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 161) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 162) 	! Calculate the invariants which will be used in the remainder
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 163) 	! of the code:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 164) 	!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 165) 	!      r4   -->  [ ...  ] DST             [ ...  ] SRC
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 166) 	!	         [ ...  ]                 [ ...  ]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 167) 	!	           :                        :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 168) 	!      r0   -->  [ ...  ]       r0+r5 --> [ ...  ]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 169) 	! r0 starts at dst + len and is pre-decremented by every store, so the copy runs downwards.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 170) 	! r5 holds src - dst (plus a small bias on some paths), so r0+r5 tracks the source.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 171) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 172) 	! Short circuit the common case of src, dst and len being 32 bit aligned
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 173) 	! and test for zero length move
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 174) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 175) 	mov	r6, r0		!   5 MT (0 cycle latency)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 176) 	or	r4, r0		!  82 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 177) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 178) 	or	r5, r0		!  82 EX		! r0 = len | dst | src
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 179) 	tst	r6, r6		!  86 MT		! T = 1 if len is zero
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 180) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 181) 	bt/s	99f		! 111 BR		(zero len)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 182) 	 tst	#3, r0		!  87 MT		! delay slot: T = 1 if src, dst and len are all long word aligned
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 183) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 184) 	mov	r4, r0		!   5 MT (0 cycle latency)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 185) 	add	r6, r0		!  49 EX		! r0 = dst + len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 186) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 187) 	mov	#16, r1		!   6 EX		! threshold for the small copy test below
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 188) 	bt/s	.Lcase00	! 111 BR		(aligned)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 189) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 190) 	 sub	r4, r5		!  75 EX		! delay slot: r5 = src - dst
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 191) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 192) 	! Arguments are not nicely long word aligned or zero len.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 193) 	! Check for small copies, and if so do a simple byte at a time copy.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 194) 	!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 195) 	! Deciding on an exact value of 'small' is not easy, as the point at which
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 196) 	! using the optimised routines become worthwhile varies (these are the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 197) 	! cycle counts for different sizes using byte-at-a-time vs. optimised):
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 198) 	!	size	byte-at-time	long	word	byte
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 199) 	!	16	42		39-40	46-50	50-55
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 200) 	!	24	58		43-44	54-58	62-67
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 201) 	!	36	82		49-50	66-70	80-85
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 202) 	! However the penalty for getting it 'wrong' is much higher for long word
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 203) 	! aligned data (and this is more common), so use a value of 16.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 204) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 205) 	cmp/gt	r6,r1		!  56 MT		! T = (16 > len), signed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 206) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 207) 	add	#-1,r5		!  50 EX		! r5 = src - dst - 1, matching the pre-decrement byte stores
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 208) 	bf/s	6f		! 108 BR		(not small)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 209) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 210) 	 mov	r5, r3		!   5 MT (latency=0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 211) 	shlr	r6		! 104 EX		! r6 = len / 2, T = 1 if len is odd
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 212) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 213) 	mov.b	@(r0,r5),r1	!  20 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 214) 	bf/s	4f		! 111 BR		! even length: go straight to the 2-byte loop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 215) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 216) 	 add	#-1,r3		!  50 EX		! delay slot: r3 = r5 - 1, offset for the second byte of each pair
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 217) 	tst	r6, r6		!  86 MT		! odd length: any byte pairs left after the single byte?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 218) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 219) 	bt/s	98f		! 110 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 220) 	 mov.b	r1,@-r0		!  29 LS		! delay slot: store the odd byte
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 221) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 222) 	! 4 cycles, 2 bytes per iteration
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 223) 3:	mov.b	@(r0,r5),r1	!  20 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 224) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 225) 4:	mov.b	@(r0,r3),r2	!  20 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 226) 	dt	r6		!  67 EX		! T = 1 when the pair count reaches zero
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 227) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 228) 	mov.b	r1,@-r0		!  29 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 229) 	bf/s	3b		! 111 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 230) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 231) 	 mov.b	r2,@-r0		!  29 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 232) 98:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 233) 	rts
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 234) 	 nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 235) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 236) 99:	rts
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 237) 	 mov	r4, r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 238) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 239) 	! Size is not small, so it is worthwhile looking for optimisations.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 240) 	! First align destination to a long word boundary.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 241) 	!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 242) 	! r5 = normal value -1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 243) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 244) 6:	tst	#3, r0		!  87 MT		! T = 1 if the destination end r0 is already long word aligned
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 245)         mov	#3, r3		!   6 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 246) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 247) 	bt/s	2f		! 111 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 248) 	 and	r0,r3		!  78 EX		! delay slot: r3 = r0 & 3, number of bytes to copy to align r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 249) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 250) 	! 3 cycles, 1 byte per iteration
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 251) 1:	dt	r3		!  67 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 252) 	mov.b	@(r0,r5),r1	!  19 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 253) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 254) 	add	#-1, r6		!  79 EX		! keep r6 = bytes still to copy
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 255) 	bf/s	1b		! 109 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 256) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 257) 	 mov.b	r1,@-r0		!  28 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 258) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 259) 2:	add	#1, r5		!  79 EX		! r5 = src - dst again
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 260) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 261) 	! Now select the appropriate bulk transfer code based on relative
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 262) 	! alignment of src and dst.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 263) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 264) 	mov	r0, r3		!   5 MT (latency=0)	! save r0 while the tst instructions below use it
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 265) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 266) 	mov	r5, r0		!   5 MT (latency=0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 267) 	tst	#1, r0		!  87 MT		! T = 1 if bit 0 of src - dst is clear
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 268) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 269) 	bf/s	1f		! 111 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 270) 	 mov	#64, r7		!   6 EX		! delay slot: size threshold for the big (b) variants
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 271) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 272) 	! bit 0 clear
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 273) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 274) 	cmp/ge	r7, r6		!  55 MT		! T = (r6 >= 64), signed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 275) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 276) 	bt/s	2f		! 111 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 277) 	 tst	#2, r0		!  87 MT		! delay slot: T = 1 if bit 1 of src - dst is clear
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 278) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 279) 	! small
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 280) 	bt/s	.Lcase0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 281) 	 mov	r3, r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 282) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 283) 	bra	.Lcase2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 284) 	 nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 285) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 286) 	! big
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 287) 2:	bt/s	.Lcase0b
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 288) 	 mov	r3, r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 289) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 290) 	bra	.Lcase2b
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 291) 	 nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 292) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 293) 	! bit 0 set
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 294) 1:	tst	#2, r0		! 87 MT		! T = 1 if bit 1 of src - dst is clear
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 295) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 296) 	bt/s	.Lcase1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 297) 	 mov	r3, r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 298) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 299) 	bra	.Lcase3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 300) 	 nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 301) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 302) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 303) 	!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 304) 	!	GHIJ KLMN OPQR -->  GHIJ KLMN OPQR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 305) 	!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 306) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 307) 	! src, dst and size are all long word aligned
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 308) 	! size is non-zero
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 309) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 310) 	.balign	32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 311) .Lcase00:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 312) 	mov	#64, r1		!   6 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 313) 	mov	r5, r3		!   5 MT (latency=0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 314) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 315) 	cmp/gt	r6, r1		!  56 MT		! T = (64 > len), signed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 316) 	add	#-4, r5		!  50 EX		! r5 = src - dst - 4, so @(r0,r5) is the source long word for dst r0 - 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 317) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 318) 	bf	.Lcase00b	! 108 BR		(big loop)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 319) 	shlr2	r6		! 105 EX		! r6 = len / 4 (long word count)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 320) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 321) 	shlr	r6		! 104 EX		! r6 = long word pairs, T = 1 if the long word count is odd
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 322) 	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 323) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 324) 	bf/s	4f		! 111 BR		! even count: go straight to the paired loop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 325) 	 add	#-8, r3		!  50 EX		! delay slot: r3 = src - dst - 8, offset for the second long word of a pair
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 326) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 327) 	tst	r6, r6		!  86 MT		! odd count: any pairs left after the single long word?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 328) 	bt/s	5f		! 110 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 329) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 330) 	 mov.l	r1,@-r0		!  30 LS		! delay slot: store the odd long word
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 331) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 332) 	! 4 cycles, 2 long words per iteration
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 333) 3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 334) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 335) 4:	mov.l	@(r0, r3), r2	!  21 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 336) 	dt	r6		!  67 EX		! T = 1 when the pair count reaches zero
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 337) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 338) 	mov.l	r1, @-r0	!  30 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 339) 	bf/s	3b		! 109 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 340) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 341) 	 mov.l	r2, @-r0	!  30 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 342) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 343) 5:	rts
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 344) 	 nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 345) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 346) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 347) 	! Size is 16 or greater and less than 64, but may have trailing bytes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 348) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 349) 	.balign	32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 350) .Lcase0:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 351) 	add	#-4, r5		!  50 EX		! r5 = src - dst - 4, so @(r0,r5) is the source long word for dst r0 - 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 352) 	mov	r4, r7		!   5 MT (latency=0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 353) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 354) 	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 355) 	mov	#4, r2		!   6 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 356) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 357) 	add	#11, r7		!  50 EX		! r7 = dst + 11, limit for the paired long word loop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 358) 	tst	r2, r6		!  86 MT		! T = 1 if bit 2 of the remaining length r6 is clear (even number of whole long words)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 359) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 360) 	mov	r5, r3		!   5 MT (latency=0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 361) 	bt/s	4f		! 111 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 362) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 363) 	 add	#-4, r3		!  50 EX		! delay slot: r3 = r5 - 4, offset for the second long word of a pair
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 364) 	mov.l	r1,@-r0		!  30 LS		! odd number of whole long words: store one before the paired loop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 365) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 366) 	! 4 cycles, 2 long words per iteration
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 367) 3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 368) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 369) 4:	mov.l	@(r0, r3), r2	!  21 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 370) 	cmp/hi	r7, r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 371) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 372) 	mov.l	r1, @-r0	!  30 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 373) 	bt/s	3b		! 109 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 374) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 375) 	 mov.l	r2, @-r0	!  30 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 376) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 377) 	! Copy the final 0-3 bytes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 378) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 379) 	add	#3,r5		!  50 EX		! r5 = src - dst - 1, so @(r0,r5) is the source byte for dst r0 - 1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 380) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 381) 	cmp/eq	r0, r4		!  54 MT		! T = 1 if r0 already reached dst
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 382) 	add	#-10, r7	!  50 EX		! r7 = dst + 1, limit for the byte loop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 383) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 384) 	bt	9f		! 110 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 385) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 386) 	! 3 cycles, 1 byte per iteration
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 387) 1:	mov.b	@(r0,r5),r1	!  19 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 388) 	cmp/hi	r7,r0		!  57 MT		! T = (r0 > r7), unsigned
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 389) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 390) 	bt/s	1b		! 111 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 391) 	 mov.b	r1,@-r0		!  28 LS		! delay slot: executed whether or not the branch is taken
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 392) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 393) 9:	rts
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 394) 	 nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 395) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 396) 	! Size is at least 64 bytes, so will be going round the big loop at least once.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 397) 	!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 398) 	!   r2 = rounded up r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 399) 	!   r3 = rounded down r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 400) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 401) 	.balign	32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 402) .Lcase0b:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 403) 	add	#-4, r5		!  50 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 404) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 405) .Lcase00b:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 406) 	mov	r0, r3		!   5 MT (latency=0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 407) 	mov	#(~0x1f), r1	!   6 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 408) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 409) 	and	r1, r3		!  78 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 410) 	mov	r4, r2		!   5 MT (latency=0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 411) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 412) 	cmp/eq	r3, r0		!  54 MT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 413) 	add	#0x1f, r2	!  50 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 414) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 415) 	bt/s	1f		! 110 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 416) 	 and	r1, r2		!  78 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 417) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 418) 	! copy initial words until cache line aligned
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 419) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 420) 	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 421) 	tst	#4, r0		!  87 MT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 422) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 423) 	mov	r5, r6		!   5 MT (latency=0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 424) 	add	#-4, r6		!  50 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 425) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 426) 	bt/s	4f		! 111 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 427) 	 add	#8, r3		!  50 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 428) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 429) 	tst	#0x18, r0	!  87 MT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 430) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 431) 	bt/s	1f		! 109 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 432) 	 mov.l	r1,@-r0		!  30 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 433) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 434) 	! 4 cycles, 2 long words per iteration
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 435) 3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 436) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 437) 4:	mov.l	@(r0, r6), r7	!  21 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 438) 	cmp/eq	r3, r0		!  54 MT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 439) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 440) 	mov.l	r1, @-r0	!  30 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 441) 	bf/s	3b		! 109 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 442) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 443) 	 mov.l	r7, @-r0	!  30 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 444) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 445) 	! Copy the cache line aligned blocks
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 446) 	!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 447) 	! In use: r0, r2, r4, r5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 448) 	! Scratch: r1, r3, r6, r7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 449) 	!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 450) 	! We could do this with the four scratch registers, but if src
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 451) 	! and dest hit the same cache line, this will thrash, so make
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 452) 	! use of additional registers.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 453) 	!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 454) 	! We also need r0 as a temporary (for movca), so 'undo' the invariant:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 455) 	!   r5:	 src (was r0+r5)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 456) 	!   r1:	 dest (was r0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 457) 	! this can be reversed at the end, so we don't need to save any extra
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 458) 	! state.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 459) 	!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 460) 1:	mov.l	r8, @-r15	!  30 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 461) 	add	r0, r5		!  49 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 462) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 463) 	mov.l	r9, @-r15	!  30 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 464) 	mov	r0, r1		!   5 MT (latency=0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 465) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 466) 	mov.l	r10, @-r15	!  30 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 467) 	add	#-0x1c, r5	!  50 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 468) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 469) 	mov.l	r11, @-r15	!  30 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 470) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 471) 	! 16 cycles, 32 bytes per iteration
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 472) 2:	mov.l	@(0x00,r5),r0	! 18 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 473) 	add	#-0x20, r1	! 50 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 474) 	mov.l	@(0x04,r5),r3	! 18 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 475) 	mov.l	@(0x08,r5),r6	! 18 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 476) 	mov.l	@(0x0c,r5),r7	! 18 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 477) 	mov.l	@(0x10,r5),r8	! 18 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 478) 	mov.l	@(0x14,r5),r9	! 18 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 479) 	mov.l	@(0x18,r5),r10	! 18 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 480) 	mov.l	@(0x1c,r5),r11	! 18 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 481) 	movca.l	r0,@r1		! 40 LS (latency=3-7)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 482) 	mov.l	r3,@(0x04,r1)	! 33 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 483) 	mov.l	r6,@(0x08,r1)	! 33 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 484) 	mov.l	r7,@(0x0c,r1)	! 33 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 485) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 486) 	mov.l	r8,@(0x10,r1)	! 33 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 487) 	add	#-0x20, r5	! 50 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 488) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 489) 	mov.l	r9,@(0x14,r1)	! 33 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 490) 	cmp/eq	r2,r1		! 54 MT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 491) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 492) 	mov.l	r10,@(0x18,r1)	!  33 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 493) 	bf/s	2b		! 109 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 494) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 495) 	 mov.l	r11,@(0x1c,r1)	!  33 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 496) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 497) 	mov	r1, r0		!   5 MT (latency=0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 498) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 499) 	mov.l	@r15+, r11	!  15 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 500) 	sub	r1, r5		!  75 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 501) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 502) 	mov.l	@r15+, r10	!  15 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 503) 	cmp/eq	r4, r0		!  54 MT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 504) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 505) 	bf/s	1f		! 109 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 506) 	 mov.l	 @r15+, r9	!  15 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 507) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 508) 	rts
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 509) 1:	 mov.l	@r15+, r8	!  15 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 510) 	sub	r4, r1		!  75 EX		(len remaining)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 511) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 512) 	! number of trailing bytes is non-zero
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 513) 	!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 514) 	! invariants restored (r5 already decremented by 4)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 515) 	! also r1=num bytes remaining
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 516) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 517) 	mov	#4, r2		!   6 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 518) 	mov	r4, r7		!   5 MT (latency=0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 519) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 520) 	add	#0x1c, r5	!  50 EX		(back to -4)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 521) 	cmp/hs	r2, r1		!  58 MT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 522) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 523) 	bf/s	5f		! 108 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 524) 	 add	 #11, r7	!  50 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 525) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 526) 	mov.l	@(r0, r5), r6	!  21 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 527) 	tst	r2, r1		!  86 MT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 528) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 529) 	mov	r5, r3		!   5 MT (latency=0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 530) 	bt/s	4f		! 111 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 531) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 532) 	 add	#-4, r3		!  50 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 533) 	cmp/hs	r2, r1		!  58 MT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 534) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 535) 	bt/s	5f		! 111 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 536) 	 mov.l	r6,@-r0		!  30 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 537) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 538) 	! 4 cycles, 2 long words per iteration
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 539) 3:	mov.l	@(r0, r5), r6	!  21 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 540) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 541) 4:	mov.l	@(r0, r3), r2	!  21 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 542) 	cmp/hi	r7, r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 543) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 544) 	mov.l	r6, @-r0	!  30 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 545) 	bt/s	3b		! 109 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 546) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 547) 	 mov.l	r2, @-r0	!  30 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 548) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 549) 	! Copy the final 0-3 bytes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 550) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 551) 5:	cmp/eq	r0, r4		!  54 MT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 552) 	add	#-10, r7	!  50 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 553) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 554) 	bt	9f		! 110 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 555) 	add	#3,r5		!  50 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 556) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 557) 	! 3 cycles, 1 byte per iteration
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 558) 1:	mov.b	@(r0,r5),r1	!  19 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 559) 	cmp/hi	r7,r0		!  57 MT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 560) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 561) 	bt/s	1b		! 111 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 562) 	 mov.b	r1,@-r0		!  28 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 563) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 564) 9:	rts
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 565) 	 nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 566) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 567) 	!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 568) 	!	GHIJ KLMN OPQR -->  ..GH IJKL MNOP QR..
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 569) 	!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 570) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 571) 	.balign	32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 572) .Lcase2:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 573) 	! Size is 16 or greater and less than 64, but may have trailing bytes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 574) 	! Copies downwards from r0 towards r4, two 16-bit moves (4 bytes) per pass; 10: finishes the last 0-3 bytes.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 575) 2:	mov	r5, r6		!   5 MT (latency=0)	r6 = second source offset
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 576) 	add	#-2,r5		!  50 EX		r5 = invariant - 2 (as 10: requires)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 577) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 578) 	mov	r4,r2		!   5 MT (latency=0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 579) 	add	#-4,r6		!  50 EX		r6 = invariant - 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 580) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 581) 	add	#7,r2		!  50 EX		r2 = r4 + 7 (loop limit)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 582) 3:	mov.w	@(r0,r5),r1	!  20 LS (latency=2)	word destined for r0-2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 583) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 584) 	mov.w	@(r0,r6),r3	!  20 LS (latency=2)	word destined for r0-4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 585) 	cmp/hi	r2,r0		!  57 MT		T = r0 > r4+7, tested before r0 is decremented
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 586) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 587) 	mov.w	r1,@-r0		!  29 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 588) 	bt/s	3b		! 111 BR		loop while at least 4 bytes will remain
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 589) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 590) 	 mov.w	r3,@-r0		!  29 LS		(delay slot, runs on every pass)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 591) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 592) 	bra	10f		! finish with the short-word/byte tail shared with .Lcase2b
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 593) 	 nop			! (delay slot)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 594) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 595) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 596) 	.balign	32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 597) .Lcase2b:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 598) 	! Size is at least 64 bytes, so will be going round the big loop at least once.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 599) 	!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 600) 	!   r2 = rounded up r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 601) 	!   r3 = rounded down r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 602) 	!   (both to a multiple of 32, using the ~0x1f mask held in r1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 603) 	mov	r0, r3		!   5 MT (latency=0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 604) 	mov	#(~0x1f), r1	!   6 EX		mask clearing the low 5 bits
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 605) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 606) 	and	r1, r3		!  78 EX		r3 = r0 rounded down
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 607) 	mov	r4, r2		!   5 MT (latency=0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 608) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 609) 	cmp/eq	r3, r0		!  54 MT		T = r0 is already a multiple of 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 610) 	add	#0x1f, r2	!  50 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 611) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 612) 	add	#-2, r5		!  50 EX		r5 = invariant - 2, for 16-bit copies
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 613) 	bt/s	1f		! 110 BR		aligned: go straight to the block loop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 614) 	 and	r1, r2		!  78 EX		(delay slot) r2 = r4 rounded up
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 615) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 616) 	! Copy a short word one at a time until we are cache line aligned
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 617) 	!   Normal values: r0, r2, r3, r4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 618) 	!   Unused: r1, r6, r7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 619) 	!   Mod: r5 (=r5-2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 620) 	!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 621) 	add	#2, r3		!  50 EX		stop value: r0 is compared before its pre-decrement
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 622) 	! NOTE(review): the loop exits only on exact equality, so r0 is assumed to be 2-byte aligned here - confirm against the dispatch code.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 623) 2:	mov.w	@(r0,r5),r1	!  20 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 624) 	cmp/eq	r3,r0		!  54 MT		T = this store will reach the 32-byte boundary
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 625) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 626) 	bf/s	2b		! 111 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 627) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 628) 	 mov.w	r1,@-r0		!  29 LS		(delay slot) r0 -= 2, then store
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 629) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 630) 	! Copy the cache line aligned blocks
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 631) 	!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 632) 	! In use: r0, r2, r4, r5 (=r5-2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 633) 	! Scratch: r1, r3, r6, r7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 634) 	!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 635) 	! We could do this with the four scratch registers, but if src
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 636) 	! and dest hit the same cache line, this will thrash, so make
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 637) 	! use of additional registers.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 638) 	!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 639) 	! We also need r0 as a temporary (for movca), so 'undo' the invariant:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 640) 	!   r5:	 src (was r0+r5)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 641) 	!   r1:	 dest (was r0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 642) 	! this can be reversed at the end, so we don't need to save any extra
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 643) 	! state.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 644) 	! r8-r12 are pushed here and popped again once the loop has finished.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 645) 1:	mov.l	r8, @-r15	!  30 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 646) 	add	r0, r5		!  49 EX		r5 becomes an absolute source address
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 647) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 648) 	mov.l	r9, @-r15	!  30 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 649) 	mov	r0, r1		!   5 MT (latency=0)	r1 = dest cursor
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 650) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 651) 	mov.l	r10, @-r15	!  30 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 652) 	add	#-0x1e, r5	!  50 EX		with the earlier -2: r5 = source of (r1 - 0x20)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 653) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 654) 	mov.l	r11, @-r15	!  30 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 655) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 656) 	mov.l	r12, @-r15	!  30 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 657) 	! Each pass moves one 32-byte block; r1 and r5 both end the pass 0x20 lower than they started.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 658) 	! 17 cycles, 32 bytes per iteration
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 659) #ifdef CONFIG_CPU_LITTLE_ENDIAN
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 660) 2:	mov.w	@r5+, r0	!  14 LS (latency=2)		..JI
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 661) 	add	#-0x20, r1	!  50 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 662) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 663) 	mov.l	@r5+, r3	!  15 LS (latency=2)		NMLK
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 664) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 665) 	mov.l	@r5+, r6	!  15 LS (latency=2)		RQPO
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 666) 	shll16	r0		! 103 EX			JI..
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 667) 	! xtrct Rm,Rn: Rn = (Rm << 16) | (Rn >> 16), i.e. low half of Rm over high half of Rn
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 668) 	mov.l	@r5+, r7	!  15 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 669) 	xtrct	r3, r0		!  48 EX			LKJI
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 670) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 671) 	mov.l	@r5+, r8	!  15 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 672) 	xtrct	r6, r3		!  48 EX			PONM
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 673) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 674) 	mov.l	@r5+, r9	!  15 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 675) 	xtrct	r7, r6		!  48 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 676) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 677) 	mov.l	@r5+, r10	!  15 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 678) 	xtrct	r8, r7		!  48 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 679) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 680) 	mov.l	@r5+, r11	!  15 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 681) 	xtrct	r9, r8		!  48 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 682) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 683) 	mov.w	@r5+, r12	!  15 LS (latency=2)		16-bit load: last 2 source bytes of the block
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 684) 	xtrct	r10, r9		!  48 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 685) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 686) 	movca.l	r0,@r1		!  40 LS (latency=3-7)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 687) 	xtrct	r11, r10	!  48 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 688) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 689) 	mov.l	r3, @(0x04,r1)	!  33 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 690) 	xtrct	r12, r11	!  48 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 691) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 692) 	mov.l	r6, @(0x08,r1)	!  33 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 693) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 694) 	mov.l	r7, @(0x0c,r1)	!  33 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 695) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 696) 	mov.l	r8, @(0x10,r1)	!  33 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 697) 	add	#-0x40, r5	!  50 EX		loads advanced r5 by 0x20; net -0x20 per pass
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 698) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 699) 	mov.l	r9, @(0x14,r1)	!  33 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 700) 	cmp/eq	r2,r1		!  54 MT		T = r1 reached rounded-up r4 (last block)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 701) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 702) 	mov.l	r10, @(0x18,r1)	!  33 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 703) 	bf/s	2b		! 109 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 704) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 705) 	 mov.l	r11, @(0x1c,r1)	!  33 LS		(delay slot)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 706) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 707) 2:	mov.w	@(0x1e,r5), r0	!  17 LS (latency=2)	last 2 source bytes of the block
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 708) 	add	#-2, r5		!  50 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 709) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 710) 	mov.l	@(0x1c,r5), r3	!  18 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 711) 	add	#-4, r1		!  50 EX		r1 = last long word of the dest block
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 712) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 713) 	mov.l	@(0x18,r5), r6	!  18 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 714) 	shll16	r0		! 103 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 715) 	! xtrct Rm,Rn: Rn = (Rm << 16) | (Rn >> 16), i.e. low half of Rm over high half of Rn
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 716) 	mov.l	@(0x14,r5), r7	!  18 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 717) 	xtrct	r3, r0		!  48 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 718) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 719) 	mov.l	@(0x10,r5), r8	!  18 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 720) 	xtrct	r6, r3		!  48 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 721) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 722) 	mov.l	@(0x0c,r5), r9	!  18 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 723) 	xtrct	r7, r6		!  48 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 724) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 725) 	mov.l	@(0x08,r5), r10	!  18 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 726) 	xtrct	r8, r7		!  48 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 727) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 728) 	mov.l	@(0x04,r5), r11	!  18 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 729) 	xtrct	r9, r8		!  48 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 730) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 731) 	mov.l   @(0x00,r5), r12 !  18 LS (latency=2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 732)     	xtrct	r10, r9		!  48 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 733) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 734) 	movca.l	r0,@r1		!  40 LS (latency=3-7)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 735) 	add	#-0x1c, r1	!  50 EX		r1 = start of the dest block (net -0x20)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 736) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 737) 	mov.l	r3, @(0x18,r1)	!  33 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 738) 	xtrct	r11, r10	!  48 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 739) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 740) 	mov.l	r6, @(0x14,r1)	!  33 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 741) 	xtrct	r12, r11	!  48 EX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 742) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 743) 	mov.l	r7, @(0x10,r1)	!  33 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 744) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 745) 	mov.l	r8, @(0x0c,r1)	!  33 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 746) 	add	#-0x1e, r5	!  50 EX		with the -2 above: net -0x20 per pass
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 747) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 748) 	mov.l	r9, @(0x08,r1)	!  33 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 749) 	cmp/eq	r2,r1		!  54 MT		T = r1 reached rounded-up r4 (last block)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 750) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 751) 	mov.l	r10, @(0x04,r1)	!  33 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 752) 	bf/s	2b		! 109 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 753) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 754) 	 mov.l	r11, @(0x00,r1)	!  33 LS		(delay slot)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 755) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 756) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 757) 	mov.l	@r15+, r12	! pop r8-r12 in reverse push order while restoring r0/r5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 758) 	mov	r1, r0		!   5 MT (latency=0)	r0 = dest cursor again
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 759) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 760) 	mov.l	@r15+, r11	!  15 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 761) 	sub	r1, r5		!  75 EX		r5 relative to r0 again (still 0x20 low)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 762) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 763) 	mov.l	@r15+, r10	!  15 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 764) 	cmp/eq	r4, r0		!  54 MT		T = nothing left to copy
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 765) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 766) 	bf/s	1f		! 109 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 767) 	 mov.l	 @r15+, r9	!  15 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 768) 	! The instruction at 1: is both the delay slot of this rts and the target of the bf/s above, so r8 is popped on either path.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 769) 	rts
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 770) 1:	 mov.l	@r15+, r8	!  15 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 771) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 772) 	add	#0x1e, r5	!  50 EX		r5 = invariant - 2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 773) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 774) 	! Finish off a short word at a time
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 775) 	! r5 must be invariant - 2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 776) 10:	mov	r4,r2		!   5 MT (latency=0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 777) 	add	#1,r2		!  50 EX		r2 = r4 + 1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 778) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 779) 	cmp/hi	r2, r0		!  57 MT		T = at least 2 bytes remain
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 780) 	bf/s	1f		! 109 BR		fewer than 2: skip the word loop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 781) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 782) 	 add	#2, r2		!  50 EX		(delay slot) r2 = r4 + 3 (loop limit)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 783) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 784) 3:	mov.w	@(r0,r5),r1	!  20 LS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 785) 	cmp/hi	r2,r0		!  57 MT		T = 2+ bytes will remain after this store
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 786) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 787) 	bt/s	3b		! 109 BR
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 788) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 789) 	 mov.w	r1,@-r0		!  29 LS		(delay slot) r0 -= 2, then store
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 790) 1:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 791) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 792) 	!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 793) 	! Finally, copy the last byte if necessary
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 794) 	cmp/eq	r4,r0		!  54 MT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 795) 	bt/s	9b		! r0 == r4: nothing left, leave through the earlier 9: rts
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 796) 	 add	#1,r5		! (delay slot) r5 = invariant - 1, for the byte load
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 797) 	mov.b	@(r0,r5),r1	! source byte destined for r0-1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 798) 	rts
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 799) 	 mov.b	r1,@-r0		! (delay slot) store the final byte
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 800)