Orange Pi5 kernel

Deprecated Linux kernel 5.10.110 for OrangePi 5/5B/5+ boards

3 Commits   0 Branches   0 Tags
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   1) /* SPDX-License-Identifier: GPL-2.0 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   2) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   3)  * Itanium 2-optimized version of memcpy and copy_user function
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   4)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   5)  * Inputs:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   6)  * 	in0:	destination address
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   7)  *	in1:	source address
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   8)  *	in2:	number of bytes to copy
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   9)  * Output:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  10)  *	for memcpy:    return dest
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  11)  * 	for copy_user: return 0 if success,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  12)  *		       or number of bytes NOT copied if error occurred.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  13)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  14)  * Copyright (C) 2002 Intel Corp.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  15)  * Copyright (C) 2002 Ken Chen <kenneth.w.chen@intel.com>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  16)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  17) #include <asm/asmmacro.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  18) #include <asm/page.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  19) #include <asm/export.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  20) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  21) #define EK(y...) EX(y)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  22) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  23) /* McKinley specific optimization */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  24) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  25) #define retval		r8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  26) #define saved_pfs	r31
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  27) #define saved_lc	r10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  28) #define saved_pr	r11
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  29) #define saved_in0	r14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  30) #define saved_in1	r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  31) #define saved_in2	r16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  32) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  33) #define src0		r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  34) #define src1		r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  35) #define dst0		r17
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  36) #define dst1		r18
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  37) #define cnt		r9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  38) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  39) /* r19-r30 are temp for each code section */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  40) #define PREFETCH_DIST	8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  41) #define src_pre_mem	r19
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  42) #define dst_pre_mem	r20
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  43) #define src_pre_l2	r21
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  44) #define dst_pre_l2	r22
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  45) #define t1		r23
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  46) #define t2		r24
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  47) #define t3		r25
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  48) #define t4		r26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  49) #define t5		t1	// alias!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  50) #define t6		t2	// alias!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  51) #define t7		t3	// alias!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  52) #define n8		r27
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  53) #define t9		t5	// alias!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  54) #define t10		t4	// alias!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  55) #define t11		t7	// alias!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  56) #define t12		t6	// alias!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  57) #define t14		t10	// alias!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  58) #define t13		r28
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  59) #define t15		r29
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  60) #define tmp		r30
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  61) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  62) /* defines for long_copy block */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  63) #define	A	0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  64) #define B	(PREFETCH_DIST)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  65) #define C	(B + PREFETCH_DIST)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  66) #define D	(C + 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  67) #define N	(D + 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  68) #define Nrot	((N + 7) & ~7)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  69) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  70) /* alias */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  71) #define in0		r32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  72) #define in1		r33
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  73) #define in2		r34
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  74) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  75) GLOBAL_ENTRY(memcpy)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  76) 	and	r28=0x7,in0	// r28 = dest misalignment within an 8-byte word
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  77) 	and	r29=0x7,in1	// r29 = src misalignment within an 8-byte word
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  78) 	mov	f6=f0		// f6=0 tags this entry as memcpy (__copy_user sets f6=f1); consumer not visible in this chunk — presumably the fault handlers; confirm
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  79) 	mov	retval=in0	// memcpy's return value is its dest argument (r8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  80) 	br.cond.sptk .common_code	// share the copy engine with __copy_user; r28/r29 are consumed there
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  81) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  82) END(memcpy)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  83) EXPORT_SYMBOL(memcpy)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  84) GLOBAL_ENTRY(__copy_user)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  85) 	.prologue
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  86) // check dest alignment
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  87) 	and	r28=0x7,in0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  88) 	and	r29=0x7,in1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  89) 	mov	f6=f1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  90) 	mov	saved_in0=in0	// save dest pointer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  91) 	mov	saved_in1=in1	// save src pointer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  92) 	mov	retval=r0	// initialize return value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  93) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  94) .common_code:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  95) 	cmp.gt	p15,p0=8,in2	// check for small size
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  96) 	cmp.ne	p13,p0=0,r28	// check dest alignment
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  97) 	cmp.ne	p14,p0=0,r29	// check src alignment
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  98) 	add	src0=0,in1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  99) 	sub	r30=8,r28	// for .align_dest
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 100) 	mov	saved_in2=in2	// save len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 101) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 102) 	add	dst0=0,in0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 103) 	add	dst1=1,in0	// dest odd index
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 104) 	cmp.le	p6,p0 = 1,r30	// for .align_dest
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 105) (p15)	br.cond.dpnt .memcpy_short
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 106) (p13)	br.cond.dpnt .align_dest
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 107) (p14)	br.cond.dpnt .unaligned_src
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 108) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 109) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 110) // both dest and src are aligned on 8-byte boundary
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 111) .aligned_src:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 112) 	.save ar.pfs, saved_pfs
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 113) 	alloc	saved_pfs=ar.pfs,3,Nrot-3,0,Nrot
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 114) 	.save pr, saved_pr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 115) 	mov	saved_pr=pr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 116) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 117) 	shr.u	cnt=in2,7	// this much cache line
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 118) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 119) 	cmp.lt	p6,p0=2*PREFETCH_DIST,cnt
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 120) 	cmp.lt	p7,p8=1,cnt
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 121) 	.save ar.lc, saved_lc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 122) 	mov	saved_lc=ar.lc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 123) 	.body
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 124) 	add	cnt=-1,cnt
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 125) 	add	src_pre_mem=0,in1	// prefetch src pointer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 126) 	add	dst_pre_mem=0,in0	// prefetch dest pointer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 127) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 128) (p7)	mov	ar.lc=cnt	// prefetch count
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 129) (p8)	mov	ar.lc=r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 130) (p6)	br.cond.dpnt .long_copy
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 131) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 132) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 133) .prefetch:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 134) 	lfetch.fault	  [src_pre_mem], 128
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 135) 	lfetch.fault.excl [dst_pre_mem], 128
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 136) 	br.cloop.dptk.few .prefetch
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 137) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 138) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 139) .medium_copy:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 140) 	and	tmp=31,in2	// copy length after iteration
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 141) 	shr.u	r29=in2,5	// number of 32-byte iteration
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 142) 	add	dst1=8,dst0	// 2nd dest pointer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 143) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 144) 	add	cnt=-1,r29	// ctop iteration adjustment
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 145) 	cmp.eq	p10,p0=r29,r0	// do we really need to loop?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 146) 	add	src1=8,src0	// 2nd src pointer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 147) 	cmp.le	p6,p0=8,tmp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 148) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 149) 	cmp.le	p7,p0=16,tmp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 150) 	mov	ar.lc=cnt	// loop setup
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 151) 	cmp.eq	p16,p17 = r0,r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 152) 	mov	ar.ec=2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 153) (p10)	br.dpnt.few .aligned_src_tail
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 154) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 155) 	TEXT_ALIGN(32)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 156) 1:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 157) EX(.ex_handler, (p16)	ld8	r34=[src0],16)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 158) EK(.ex_handler, (p16)	ld8	r38=[src1],16)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 159) EX(.ex_handler, (p17)	st8	[dst0]=r33,16)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 160) EK(.ex_handler, (p17)	st8	[dst1]=r37,16)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 161) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 162) EX(.ex_handler, (p16)	ld8	r32=[src0],16)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 163) EK(.ex_handler, (p16)	ld8	r36=[src1],16)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 164) EX(.ex_handler, (p16)	st8	[dst0]=r34,16)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 165) EK(.ex_handler, (p16)	st8	[dst1]=r38,16)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 166) 	br.ctop.dptk.few 1b
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 167) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 168) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 169) .aligned_src_tail:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 170) EX(.ex_handler, (p6)	ld8	t1=[src0])
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 171) 	mov	ar.lc=saved_lc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 172) 	mov	ar.pfs=saved_pfs
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 173) EX(.ex_hndlr_s, (p7)	ld8	t2=[src1],8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 174) 	cmp.le	p8,p0=24,tmp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 175) 	and	r21=-8,tmp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 176) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 177) EX(.ex_hndlr_s, (p8)	ld8	t3=[src1])
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 178) EX(.ex_handler, (p6)	st8	[dst0]=t1)	// store byte 1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 179) 	and	in2=7,tmp	// remaining length
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 180) EX(.ex_hndlr_d, (p7)	st8	[dst1]=t2,8)	// store byte 2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 181) 	add	src0=src0,r21	// setting up src pointer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 182) 	add	dst0=dst0,r21	// setting up dest pointer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 183) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 184) EX(.ex_handler, (p8)	st8	[dst1]=t3)	// store byte 3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 185) 	mov	pr=saved_pr,-1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 186) 	br.dptk.many .memcpy_short
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 187) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 188) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 189) /* code taken from copy_page_mck */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 190) .long_copy:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 191) 	.rotr v[2*PREFETCH_DIST]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 192) 	.rotp p[N]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 193) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 194) 	mov src_pre_mem = src0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 195) 	mov pr.rot = 0x10000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 196) 	mov ar.ec = 1				// special unrolled loop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 197) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 198) 	mov dst_pre_mem = dst0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 199) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 200) 	add src_pre_l2 = 8*8, src0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 201) 	add dst_pre_l2 = 8*8, dst0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 202) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 203) 	add src0 = 8, src_pre_mem		// first t1 src
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 204) 	mov ar.lc = 2*PREFETCH_DIST - 1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 205) 	shr.u cnt=in2,7				// number of lines
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 206) 	add src1 = 3*8, src_pre_mem		// first t3 src
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 207) 	add dst0 = 8, dst_pre_mem		// first t1 dst
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 208) 	add dst1 = 3*8, dst_pre_mem		// first t3 dst
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 209) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 210) 	and tmp=127,in2				// remaining bytes after this block
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 211) 	add cnt = -(2*PREFETCH_DIST) - 1, cnt
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 212) 	// same as .line_copy loop, but with all predicated-off instructions removed:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 213) .prefetch_loop:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 214) EX(.ex_hndlr_lcpy_1, (p[A])	ld8 v[A] = [src_pre_mem], 128)		// M0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 215) EK(.ex_hndlr_lcpy_1, (p[B])	st8 [dst_pre_mem] = v[B], 128)		// M2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 216) 	br.ctop.sptk .prefetch_loop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 217) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 218) 	cmp.eq p16, p0 = r0, r0			// reset p16 to 1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 219) 	mov ar.lc = cnt
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 220) 	mov ar.ec = N				// # of stages in pipeline
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 221) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 222) .line_copy:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 223) EX(.ex_handler,	(p[D])	ld8 t2 = [src0], 3*8)			// M0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 224) EK(.ex_handler,	(p[D])	ld8 t4 = [src1], 3*8)			// M1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 225) EX(.ex_handler_lcpy,	(p[B])	st8 [dst_pre_mem] = v[B], 128)		// M2 prefetch dst from memory
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 226) EK(.ex_handler_lcpy,	(p[D])	st8 [dst_pre_l2] = n8, 128)		// M3 prefetch dst from L2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 227) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 228) EX(.ex_handler_lcpy,	(p[A])	ld8 v[A] = [src_pre_mem], 128)		// M0 prefetch src from memory
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 229) EK(.ex_handler_lcpy,	(p[C])	ld8 n8 = [src_pre_l2], 128)		// M1 prefetch src from L2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 230) EX(.ex_handler,	(p[D])	st8 [dst0] =  t1, 8)			// M2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 231) EK(.ex_handler,	(p[D])	st8 [dst1] =  t3, 8)			// M3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 232) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 233) EX(.ex_handler,	(p[D])	ld8  t5 = [src0], 8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 234) EK(.ex_handler,	(p[D])	ld8  t7 = [src1], 3*8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 235) EX(.ex_handler,	(p[D])	st8 [dst0] =  t2, 3*8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 236) EK(.ex_handler,	(p[D])	st8 [dst1] =  t4, 3*8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 237) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 238) EX(.ex_handler,	(p[D])	ld8  t6 = [src0], 3*8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 239) EK(.ex_handler,	(p[D])	ld8 t10 = [src1], 8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 240) EX(.ex_handler,	(p[D])	st8 [dst0] =  t5, 8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 241) EK(.ex_handler,	(p[D])	st8 [dst1] =  t7, 3*8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 242) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 243) EX(.ex_handler,	(p[D])	ld8  t9 = [src0], 3*8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 244) EK(.ex_handler,	(p[D])	ld8 t11 = [src1], 3*8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 245) EX(.ex_handler,	(p[D])	st8 [dst0] =  t6, 3*8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 246) EK(.ex_handler,	(p[D])	st8 [dst1] = t10, 8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 247) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 248) EX(.ex_handler,	(p[D])	ld8 t12 = [src0], 8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 249) EK(.ex_handler,	(p[D])	ld8 t14 = [src1], 8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 250) EX(.ex_handler,	(p[D])	st8 [dst0] =  t9, 3*8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 251) EK(.ex_handler,	(p[D])	st8 [dst1] = t11, 3*8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 252) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 253) EX(.ex_handler,	(p[D])	ld8 t13 = [src0], 4*8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 254) EK(.ex_handler,	(p[D])	ld8 t15 = [src1], 4*8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 255) EX(.ex_handler,	(p[D])	st8 [dst0] = t12, 8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 256) EK(.ex_handler,	(p[D])	st8 [dst1] = t14, 8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 257) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 258) EX(.ex_handler,	(p[C])	ld8  t1 = [src0], 8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 259) EK(.ex_handler,	(p[C])	ld8  t3 = [src1], 8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 260) EX(.ex_handler,	(p[D])	st8 [dst0] = t13, 4*8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 261) EK(.ex_handler,	(p[D])	st8 [dst1] = t15, 4*8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 262) 	br.ctop.sptk .line_copy
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 263) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 264) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 265) 	add dst0=-8,dst0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 266) 	add src0=-8,src0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 267) 	mov in2=tmp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 268) 	.restore sp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 269) 	br.sptk.many .medium_copy
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 270) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 271) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 272) #define BLOCK_SIZE	128*32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 273) #define blocksize	r23
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 274) #define curlen		r24
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 275) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 276) // dest is on 8-byte boundary, src is not. We need to do
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 277) // ld8-ld8, shrp, then st8.  Max 8 byte copy per cycle.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 278) .unaligned_src:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 279) 	.prologue
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 280) 	.save ar.pfs, saved_pfs
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 281) 	alloc	saved_pfs=ar.pfs,3,5,0,8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 282) 	.save ar.lc, saved_lc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 283) 	mov	saved_lc=ar.lc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 284) 	.save pr, saved_pr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 285) 	mov	saved_pr=pr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 286) 	.body
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 287) .4k_block:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 288) 	mov	saved_in0=dst0	// need to save all input arguments
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 289) 	mov	saved_in2=in2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 290) 	mov	blocksize=BLOCK_SIZE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 291) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 292) 	cmp.lt	p6,p7=blocksize,in2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 293) 	mov	saved_in1=src0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 294) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 295) (p6)	mov	in2=blocksize
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 296) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 297) 	shr.u	r21=in2,7	// this much cache line
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 298) 	shr.u	r22=in2,4	// number of 16-byte iteration
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 299) 	and	curlen=15,in2	// copy length after iteration
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 300) 	and	r30=7,src0	// source alignment
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 301) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 302) 	cmp.lt	p7,p8=1,r21
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 303) 	add	cnt=-1,r21
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 304) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 305) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 306) 	add	src_pre_mem=0,src0	// prefetch src pointer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 307) 	add	dst_pre_mem=0,dst0	// prefetch dest pointer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 308) 	and	src0=-8,src0		// 1st src pointer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 309) (p7)	mov	ar.lc = cnt
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 310) (p8)	mov	ar.lc = r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 311) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 312) 	TEXT_ALIGN(32)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 313) 1:	lfetch.fault	  [src_pre_mem], 128
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 314) 	lfetch.fault.excl [dst_pre_mem], 128
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 315) 	br.cloop.dptk.few 1b
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 316) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 317) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 318) 	shladd	dst1=r22,3,dst0	// 2nd dest pointer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 319) 	shladd	src1=r22,3,src0	// 2nd src pointer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 320) 	cmp.eq	p8,p9=r22,r0	// do we really need to loop?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 321) 	cmp.le	p6,p7=8,curlen;	// have at least 8 byte remaining?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 322) 	add	cnt=-1,r22	// ctop iteration adjustment
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 323) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 324) EX(.ex_handler, (p9)	ld8	r33=[src0],8)	// loop primer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 325) EK(.ex_handler, (p9)	ld8	r37=[src1],8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 326) (p8)	br.dpnt.few .noloop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 327) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 328) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 329) // The jump address is calculated based on src alignment. The COPYU
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 330) // macro below needs to confine its size to a power of two, so an entry
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 331) // can be calculated using shl instead of an expensive multiply. The
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 332) // size is then hard coded by the following #define to match the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 333) // actual size.  This makes it somewhat tedious when the COPYU macro gets
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 334) // changed and this needs to be adjusted to match.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 335) #define LOOP_SIZE 6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 336) 1:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 337) 	mov	r29=ip		// jmp_table thread
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 338) 	mov	ar.lc=cnt
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 339) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 340) 	add	r29=.jump_table - 1b - (.jmp1-.jump_table), r29
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 341) 	shl	r28=r30, LOOP_SIZE	// jmp_table thread
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 342) 	mov	ar.ec=2		// loop setup
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 343) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 344) 	add	r29=r29,r28		// jmp_table thread
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 345) 	cmp.eq	p16,p17=r0,r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 346) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 347) 	mov	b6=r29			// jmp_table thread
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 348) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 349) 	br.cond.sptk.few b6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 350) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 351) // for 8-15 byte case
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 352) // We will skip the loop, but need to replicate the side effect
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 353) // that the loop produces.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 354) .noloop:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 355) EX(.ex_handler, (p6)	ld8	r37=[src1],8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 356) 	add	src0=8,src0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 357) (p6)	shl	r25=r30,3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 358) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 359) EX(.ex_handler, (p6)	ld8	r27=[src1])
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 360) (p6)	shr.u	r28=r37,r25
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 361) (p6)	sub	r26=64,r25
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 362) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 363) (p6)	shl	r27=r27,r26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 364) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 365) (p6)	or	r21=r28,r27
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 366) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 367) .unaligned_src_tail:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 368) /* check if we have more than blocksize to copy, if so go back */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 369) 	cmp.gt	p8,p0=saved_in2,blocksize
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 370) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 371) (p8)	add	dst0=saved_in0,blocksize
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 372) (p8)	add	src0=saved_in1,blocksize
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 373) (p8)	sub	in2=saved_in2,blocksize
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 374) (p8)	br.dpnt	.4k_block
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 375) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 376) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 377) /* we have up to 15 byte to copy in the tail.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 378)  * part of work is already done in the jump table code
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 379)  * we are at the following state.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 380)  * src side:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 381)  * 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 382)  *   xxxxxx xx                   <----- r21 has xxxxxxxx already
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 383)  * -------- -------- --------
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 384)  * 0        8        16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 385)  *          ^
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 386)  *          |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 387)  *          src1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 388)  * 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 389)  * dst
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 390)  * -------- -------- --------
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 391)  * ^
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 392)  * |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 393)  * dst1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 394)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 395) EX(.ex_handler, (p6)	st8	[dst1]=r21,8)	// more than 8 byte to copy
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 396) (p6)	add	curlen=-8,curlen	// update length
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 397) 	mov	ar.pfs=saved_pfs
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 398) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 399) 	mov	ar.lc=saved_lc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 400) 	mov	pr=saved_pr,-1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 401) 	mov	in2=curlen	// remaining length
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 402) 	mov	dst0=dst1	// dest pointer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 403) 	add	src0=src1,r30	// forward by src alignment
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 404) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 405) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 406) // 7 byte or smaller.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 407) .memcpy_short:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 408) 	cmp.le	p8,p9   = 1,in2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 409) 	cmp.le	p10,p11 = 2,in2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 410) 	cmp.le	p12,p13 = 3,in2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 411) 	cmp.le	p14,p15 = 4,in2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 412) 	add	src1=1,src0	// second src pointer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 413) 	add	dst1=1,dst0	// second dest pointer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 414) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 415) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 416) EX(.ex_handler_short, (p8)	ld1	t1=[src0],2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 417) EK(.ex_handler_short, (p10)	ld1	t2=[src1],2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 418) (p9)	br.ret.dpnt rp		// 0 byte copy
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 419) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 420) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 421) EX(.ex_handler_short, (p8)	st1	[dst0]=t1,2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 422) EK(.ex_handler_short, (p10)	st1	[dst1]=t2,2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 423) (p11)	br.ret.dpnt rp		// 1 byte copy
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 424) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 425) EX(.ex_handler_short, (p12)	ld1	t3=[src0],2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 426) EK(.ex_handler_short, (p14)	ld1	t4=[src1],2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 427) (p13)	br.ret.dpnt rp		// 2 byte copy
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 428) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 429) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 430) 	cmp.le	p6,p7   = 5,in2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 431) 	cmp.le	p8,p9   = 6,in2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 432) 	cmp.le	p10,p11 = 7,in2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 433) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 434) EX(.ex_handler_short, (p12)	st1	[dst0]=t3,2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 435) EK(.ex_handler_short, (p14)	st1	[dst1]=t4,2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 436) (p15)	br.ret.dpnt rp		// 3 byte copy
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 437) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 438) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 439) EX(.ex_handler_short, (p6)	ld1	t5=[src0],2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 440) EK(.ex_handler_short, (p8)	ld1	t6=[src1],2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 441) (p7)	br.ret.dpnt rp		// 4 byte copy
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 442) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 443) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 444) EX(.ex_handler_short, (p6)	st1	[dst0]=t5,2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 445) EK(.ex_handler_short, (p8)	st1	[dst1]=t6,2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 446) (p9)	br.ret.dptk rp		// 5 byte copy
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 447) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 448) EX(.ex_handler_short, (p10)	ld1	t7=[src0],2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 449) (p11)	br.ret.dptk rp		// 6 byte copy
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 450) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 451) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 452) EX(.ex_handler_short, (p10)	st1	[dst0]=t7,2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 453) 	br.ret.dptk rp		// done all cases
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 454) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 455) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 456) /* Align dest to nearest 8-byte boundary. We know we have at
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 457)  * least 7 bytes to copy, enough to crawl to 8-byte boundary.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 458)  * Actual number of byte to crawl depend on the dest alignment.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 459)  * 7 byte or less is taken care at .memcpy_short
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 460) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 461)  * src0 - source even index
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 462)  * src1 - source  odd index
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 463)  * dst0 - dest even index
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 464)  * dst1 - dest  odd index
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 465)  * r30  - distance to 8-byte boundary
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 466)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 467) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 468) .align_dest:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 469) 	add	src1=1,in1	// source odd index
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 470) 	cmp.le	p7,p0 = 2,r30	// for .align_dest
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 471) 	cmp.le	p8,p0 = 3,r30	// for .align_dest
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 472) EX(.ex_handler_short, (p6)	ld1	t1=[src0],2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 473) 	cmp.le	p9,p0 = 4,r30	// for .align_dest
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 474) 	cmp.le	p10,p0 = 5,r30
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 475) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 476) EX(.ex_handler_short, (p7)	ld1	t2=[src1],2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 477) EK(.ex_handler_short, (p8)	ld1	t3=[src0],2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 478) 	cmp.le	p11,p0 = 6,r30
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 479) EX(.ex_handler_short, (p6)	st1	[dst0] = t1,2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 480) 	cmp.le	p12,p0 = 7,r30
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 481) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 482) EX(.ex_handler_short, (p9)	ld1	t4=[src1],2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 483) EK(.ex_handler_short, (p10)	ld1	t5=[src0],2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 484) EX(.ex_handler_short, (p7)	st1	[dst1] = t2,2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 485) EK(.ex_handler_short, (p8)	st1	[dst0] = t3,2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 486) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 487) EX(.ex_handler_short, (p11)	ld1	t6=[src1],2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 488) EK(.ex_handler_short, (p12)	ld1	t7=[src0],2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 489) 	cmp.eq	p6,p7=r28,r29
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 490) EX(.ex_handler_short, (p9)	st1	[dst1] = t4,2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 491) EK(.ex_handler_short, (p10)	st1	[dst0] = t5,2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 492) 	sub	in2=in2,r30
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 493) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 494) EX(.ex_handler_short, (p11)	st1	[dst1] = t6,2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 495) EK(.ex_handler_short, (p12)	st1	[dst0] = t7)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 496) 	add	dst0=in0,r30	// setup arguments
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 497) 	add	src0=in1,r30
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 498) (p6)	br.cond.dptk .aligned_src
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 499) (p7)	br.cond.dpnt .unaligned_src
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 500) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 501) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 502) /* main loop body in jump table format */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 503) #define COPYU(shift)									\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 504) 1:											\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 505) EX(.ex_handler,  (p16)	ld8	r32=[src0],8);		/* 1 */				\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 506) EK(.ex_handler,  (p16)	ld8	r36=[src1],8);						\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 507) 		 (p17)	shrp	r35=r33,r34,shift;;	/* 1 */				\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 508) EX(.ex_handler,  (p6)	ld8	r22=[src1]);	/* common, prime for tail section */	\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 509) 		 nop.m	0;								\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 510) 		 (p16)	shrp	r38=r36,r37,shift;					\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 511) EX(.ex_handler,  (p17)	st8	[dst0]=r35,8);		/* 1 */				\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 512) EK(.ex_handler,  (p17)	st8	[dst1]=r39,8);						\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 513) 		 br.ctop.dptk.few 1b;;							\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 514) 		 (p7)	add	src1=-8,src1;	/* back out for <8 byte case */		\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 515) 		 shrp	r21=r22,r38,shift;	/* speculative work */			\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 516) 		 br.sptk.few .unaligned_src_tail /* branch out of jump table */		\
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 517) 		 ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 518) 	TEXT_ALIGN(32)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 519) .jump_table:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 520) 	COPYU(8)	// unaligned cases
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 521) .jmp1:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 522) 	COPYU(16)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 523) 	COPYU(24)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 524) 	COPYU(32)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 525) 	COPYU(40)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 526) 	COPYU(48)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 527) 	COPYU(56)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 528) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 529) #undef A
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 530) #undef B
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 531) #undef C
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 532) #undef D
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 533) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 534) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 535)  * Due to lack of local tag support in gcc 2.x assembler, it is not clear which
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 536)  * instruction failed in the bundle.  The exception algorithm is that we
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 537)  * first figure out the faulting address, then detect if there is any
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 538)  * progress made on the copy, if so, redo the copy from last known copied
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 539)  * location up to the faulting address (exclusive). In the copy_from_user
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 540)  * case, remaining byte in kernel buffer will be zeroed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 541)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 542)  * Take copy_from_user as an example, in the code there are multiple loads
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 543)  * in a bundle and those multiple loads could span over two pages, the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 544)  * faulting address is calculated as page_round_down(max(src0, src1)).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 545)  * This is based on knowledge that if we can access one byte in a page, we
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 546)  * can access any byte in that page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 547)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 548)  * predicate used in the exception handler:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 549)  * p6-p7: direction
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 550)  * p10-p11: src faulting addr calculation
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 551)  * p12-p13: dst faulting addr calculation
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 552)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 553) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 554) #define A	r19
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 555) #define B	r20
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 556) #define C	r21
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 557) #define D	r22
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 558) #define F	r28
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 559) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 560) #define saved_retval	loc0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 561) #define saved_rtlink	loc1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 562) #define saved_pfs_stack	loc2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 563) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 564) .ex_hndlr_s:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 565) 	add	src0=8,src0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 566) 	br.sptk .ex_handler
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 567) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 568) .ex_hndlr_d:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 569) 	add	dst0=8,dst0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 570) 	br.sptk .ex_handler
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 571) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 572) .ex_hndlr_lcpy_1:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 573) 	mov	src1=src_pre_mem
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 574) 	mov	dst1=dst_pre_mem
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 575) 	cmp.gtu	p10,p11=src_pre_mem,saved_in1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 576) 	cmp.gtu	p12,p13=dst_pre_mem,saved_in0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 577) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 578) (p10)	add	src0=8,saved_in1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 579) (p11)	mov	src0=saved_in1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 580) (p12)	add	dst0=8,saved_in0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 581) (p13)	mov	dst0=saved_in0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 582) 	br.sptk	.ex_handler
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 583) .ex_handler_lcpy:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 584) 	// in line_copy block, the preload addresses should always ahead
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 585) 	// of the other two src/dst pointers.  Furthermore, src1/dst1 should
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 586) 	// always ahead of src0/dst0.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 587) 	mov	src1=src_pre_mem
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 588) 	mov	dst1=dst_pre_mem
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 589) .ex_handler:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 590) 	mov	pr=saved_pr,-1		// first restore pr, lc, and pfs
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 591) 	mov	ar.lc=saved_lc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 592) 	mov	ar.pfs=saved_pfs
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 593) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 594) .ex_handler_short: // fault occurred in these sections didn't change pr, lc, pfs
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 595) 	cmp.ltu	p6,p7=saved_in0, saved_in1	// get the copy direction
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 596) 	cmp.ltu	p10,p11=src0,src1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 597) 	cmp.ltu	p12,p13=dst0,dst1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 598) 	fcmp.eq	p8,p0=f6,f0		// is it memcpy?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 599) 	mov	tmp = dst0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 600) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 601) (p11)	mov	src1 = src0		// pick the larger of the two
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 602) (p13)	mov	dst0 = dst1		// make dst0 the smaller one
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 603) (p13)	mov	dst1 = tmp		// and dst1 the larger one
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 604) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 605) (p6)	dep	F = r0,dst1,0,PAGE_SHIFT // usr dst round down to page boundary
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 606) (p7)	dep	F = r0,src1,0,PAGE_SHIFT // usr src round down to page boundary
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 607) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 608) (p6)	cmp.le	p14,p0=dst0,saved_in0	// no progress has been made on store
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 609) (p7)	cmp.le	p14,p0=src0,saved_in1	// no progress has been made on load
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 610) 	mov	retval=saved_in2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 611) (p8)	ld1	tmp=[src1]		// force an oops for memcpy call
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 612) (p8)	st1	[dst1]=r0		// force an oops for memcpy call
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 613) (p14)	br.ret.sptk.many rp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 614) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 615) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 616)  * The remaining byte to copy is calculated as:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 617)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 618)  * A =	(faulting_addr - orig_src)	-> len to faulting ld address
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 619)  *	or 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 620)  * 	(faulting_addr - orig_dst)	-> len to faulting st address
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 621)  * B =	(cur_dst - orig_dst)		-> len copied so far
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 622)  * C =	A - B				-> len need to be copied
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 623)  * D =	orig_len - A			-> len need to be left along
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 624)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 625) (p6)	sub	A = F, saved_in0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 626) (p7)	sub	A = F, saved_in1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 627) 	clrrrb
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 628) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 629) 	alloc	saved_pfs_stack=ar.pfs,3,3,3,0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 630) 	cmp.lt	p8,p0=A,r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 631) 	sub	B = dst0, saved_in0	// how many byte copied so far
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 632) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 633) (p8)	mov	A = 0;			// A shouldn't be negative, cap it
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 634) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 635) 	sub	C = A, B
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 636) 	sub	D = saved_in2, A
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 637) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 638) 	cmp.gt	p8,p0=C,r0		// more than 1 byte?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 639) 	mov	r8=0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 640) 	mov	saved_retval = D
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 641) 	mov	saved_rtlink = b0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 642) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 643) 	add	out0=saved_in0, B
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 644) 	add	out1=saved_in1, B
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 645) 	mov	out2=C
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 646) (p8)	br.call.sptk.few b0=__copy_user	// recursive call
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 647) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 648) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 649) 	add	saved_retval=saved_retval,r8	// above might return non-zero value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 650) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 651) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 652) 	mov	retval=saved_retval
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 653) 	mov	ar.pfs=saved_pfs_stack
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 654) 	mov	b0=saved_rtlink
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 655) 	br.ret.sptk.many rp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 656) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 657) /* end of McKinley specific optimization */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 658) END(__copy_user)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 659) EXPORT_SYMBOL(__copy_user)