/* SPDX-License-Identifier: GPL-2.0 */
/*
 * This routine clears to zero a linear memory buffer in user space.
 *
 * Inputs:
 *	in0:	address of buffer
 *	in1:	length of buffer in bytes
 * Outputs:
 *	r8:	number of bytes that didn't get cleared due to a fault
 *
 * Copyright (C) 1998, 1999, 2001 Hewlett-Packard Co
 *	Stephane Eranian <eranian@hpl.hp.com>
 */
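//
// Illustrative only (not part of the build): a C model of the contract
// stated above, with a hypothetical fault() predicate standing in for a
// page fault on the user buffer. clear_user_model and fault are invented
// names for this sketch:
//
//	unsigned long clear_user_model(char *buf, unsigned long len)
//	{
//		while (len) {
//			if (fault(buf))
//				break;		/* r8 <- bytes not cleared */
//			*buf++ = 0;
//			len--;
//		}
//		return len;			/* 0 on success */
//	}
//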

#include <asm/asmmacro.h>
#include <asm/export.h>

//
// arguments
//
#define buf		r32
#define len		r33

//
// local registers
//
#define cnt		r16
#define buf2		r17
#define saved_lc	r18
#define saved_pfs	r19
#define tmp		r20
#define len2		r21
#define len3		r22

//
// Theory of operations:
//	- We check whether or not the buffer is small, i.e., fewer than 17
//	  bytes, in which case we use the byte-by-byte loop.
//
//	- Otherwise we go progressively from a 1-byte store to an 8-byte store
//	  in the head part, the body is a 16-byte store loop, and we finish
//	  with the tail for the last (up to 15) bytes.
//	  The advantage of this breakdown is that the long-buffer path
//	  contains only 2 branches.
//
//	The reason for not using shifting & masking for both the head and the
//	tail is to stay semantically correct. This routine is not supposed
//	to write bytes outside of the buffer. While most of the time this would
//	be ok, we can't tolerate a mistake. A classical example is the case
//	of multithreaded code where the extra bytes touched are actually owned
//	by another thread running concurrently with ours. Another, less likely,
//	example is device drivers, where reading an I/O-mapped location may
//	have side effects (same thing for writing).
//
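//
// Illustrative only (not part of the build): a C sketch of the same
// head/body/tail decomposition, assuming a fault-free buffer (the real
// code below additionally threads every store through the exception
// table via the EX() macro). clear_sketch is an invented name:
//
//	static void clear_sketch(char *p, unsigned long len)
//	{
//		if (len <= 16) {		/* short case: byte-by-byte loop */
//			while (len--)
//				*p++ = 0;
//			return;
//		}
//		/* head: align p to 16 bytes, one store size at a time */
//		if ((unsigned long)p & 1) { *p = 0;          p += 1; len -= 1; }
//		if ((unsigned long)p & 2) { *(short *)p = 0; p += 2; len -= 2; }
//		if ((unsigned long)p & 4) { *(int *)p = 0;   p += 4; len -= 4; }
//		if ((unsigned long)p & 8) { *(long *)p = 0;  p += 8; len -= 8; }
//		/* body: 16 bytes per iteration through two pointers */
//		while (len >= 16) {
//			*(long *)p = 0;
//			*(long *)(p + 8) = 0;
//			p += 16; len -= 16;
//		}
//		/* tail: up to 15 bytes, one store per set bit of len */
//		if (len & 8) { *(long *)p = 0;  p += 8; }
//		if (len & 4) { *(int *)p = 0;   p += 4; }
//		if (len & 2) { *(short *)p = 0; p += 2; }
//		if (len & 1) { *p = 0; }
//	}
//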

GLOBAL_ENTRY(__do_clear_user)
	.prologue
	.save ar.pfs, saved_pfs
	alloc	saved_pfs=ar.pfs,2,0,0,0
	cmp.eq p6,p0=r0,len		// check for zero length
	.save ar.lc, saved_lc
	mov saved_lc=ar.lc		// preserve ar.lc (slow)
	.body
	;;				// avoid WAW on CFM
	adds tmp=-1,len			// br.cloop is repeat/until
	mov ret0=len			// return value is length at this point
(p6)	br.ret.spnt.many rp
	;;
	cmp.lt p6,p0=16,len		// if len > 16 then long memset
	mov ar.lc=tmp			// initialize lc for small count
(p6)	br.cond.dptk .long_do_clear
	;;				// WAR on ar.lc
	//
	// worst case 16 iterations, avg 8 iterations
	//
	// We could have played with the predicates to use the extra
	// M slot for 2 stores/iteration, but the cost of initializing
	// the various counters, compared to how long the loop is expected
	// to run on average, does not make this solution viable.
	//
1:
	EX( .Lexit1, st1 [buf]=r0,1 )
	adds len=-1,len			// countdown length using len
	br.cloop.dptk 1b
	;;				// avoid RAW on ar.lc
	//
	// .Lexit1: comes from the byte-by-byte loop
	//	    len contains bytes left
.Lexit1:
	mov ret0=len			// faster than using ar.lc
	mov ar.lc=saved_lc
	br.ret.sptk.many rp		// end of short clear_user


//
// At this point we know we have more than 16 bytes to clear,
// so we focus on alignment (no branches required).
//
// We count down the number of bytes left in len/len2 rather than in
// ret0 because the exception code changes the value of r8 (ret0).
//
.long_do_clear:
	tbit.nz p6,p0=buf,0		// odd alignment (for long_do_clear)
	;;
	EX( .Lexit3, (p6) st1 [buf]=r0,1 )	// 1-byte aligned
(p6)	adds len=-1,len;;		// sync because buf is modified
	tbit.nz p6,p0=buf,1
	;;
	EX( .Lexit3, (p6) st2 [buf]=r0,2 )	// 2-byte aligned
(p6)	adds len=-2,len;;
	tbit.nz p6,p0=buf,2
	;;
	EX( .Lexit3, (p6) st4 [buf]=r0,4 )	// 4-byte aligned
(p6)	adds len=-4,len;;
	tbit.nz p6,p0=buf,3
	;;
	EX( .Lexit3, (p6) st8 [buf]=r0,8 )	// 8-byte aligned
(p6)	adds len=-8,len;;
	shr.u cnt=len,4			// number of 128-bit (2x64bit) words
	;;
	cmp.eq p6,p0=r0,cnt
	adds tmp=-1,cnt
(p6)	br.cond.dpnt .dotail		// we have less than 16 bytes left
	;;
	adds buf2=8,buf			// setup second base pointer
	mov ar.lc=tmp
	;;

	//
	// 16 bytes/iteration core loop
	//
	// The second store can never generate a fault because
	// we come into the loop only when we are 16-byte aligned.
	// This means that if we cross a page then it will always be
	// in the first store and never in the second.
	//
	//
	// We need to keep track of the remaining length. A possible (optimistic)
	// way would be to use ar.lc and derive how many bytes were left by
	// doing: left = 16*ar.lc + 16. This would avoid the addition at
	// every iteration.
	// However, we need to keep the synchronization point. An M;;MB template
	// does not exist, so we can keep the addition at no extra cycle cost
	// (it uses a nop slot anyway). It also simplifies the (unlikely)
	// error recovery code.
	//
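	//
	// Illustrative only: the per-iteration bookkeeping the loop relies on,
	// in C form (store8 is an invented helper). len is updated after the
	// first store's synchronization point, so if that EX-protected store
	// faults, len still counts the untouched 16 bytes of this iteration:
	//
	//	for (i = 0; i < cnt; i++) {
	//		store8(buf);		/* may fault -> .Lexit3, len intact */
	//		store8(buf2);		/* cannot fault: same aligned 16B chunk */
	//		buf += 16; buf2 += 16;
	//		len -= 16;		/* kept in a spare slot, costs nothing */
	//	}
	//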

2:	EX(.Lexit3, st8 [buf]=r0,16 )
	;;				// needed to get len correct on error
	st8 [buf2]=r0,16
	adds len=-16,len
	br.cloop.dptk 2b
	;;
	mov ar.lc=saved_lc
	//
	// Tail correction based on len only.
	//
	// We alternate the use of len3/len2 to allow parallelism and correct
	// error handling. We also reuse p6/p7 to return the correct value.
	// The len2/len3 updates do not cost anything more compared to a
	// regular memset, as we had empty slots anyway.
	//
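	//
	// Illustrative only: the len2/len3 alternation below, in C form
	// (store8/store4/store2/store1 are invented helpers). Each store and
	// its length update use different registers, so when a store faults
	// the register selected by the live predicate (p6 or p7) at .Lexit2
	// still holds the count from before that store:
	//
	//	len2 = len3 = len;
	//	if (len & 8) { store8(buf); buf += 8; len3 = len2 - 8; }	/* p6 */
	//	if (len & 4) { store4(buf); buf += 4; len2 = len3 - 4; }	/* p7 */
	//	if (len & 2) { store2(buf); buf += 2; len3 = len2 - 2; }	/* p6 */
	//	if (len & 1) { store1(buf); }					/* p7 */
	//	return 0;
	//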
.dotail:
	mov len2=len			// for parallelization of error handling
	mov len3=len
	tbit.nz p6,p0=len,3
	;;
	EX( .Lexit2, (p6) st8 [buf]=r0,8 )	// at least 8 bytes
(p6)	adds len3=-8,len2
	tbit.nz p7,p6=len,2
	;;
	EX( .Lexit2, (p7) st4 [buf]=r0,4 )	// at least 4 bytes
(p7)	adds len2=-4,len3
	tbit.nz p6,p7=len,1
	;;
	EX( .Lexit2, (p6) st2 [buf]=r0,2 )	// at least 2 bytes
(p6)	adds len3=-2,len2
	tbit.nz p7,p6=len,0
	;;
	EX( .Lexit2, (p7) st1 [buf]=r0 )	// only 1 byte left
	mov ret0=r0			// success
	br.ret.sptk.many rp		// end of most likely path

	//
	// Outlined error handling code
	//

	//
	// .Lexit3: comes from the core loop; needs pr/lc restored
	//	    len contains bytes left
	//
	//
	// .Lexit2:
	//	if p6 -> coming from st8 or st2: len2 contains what's left
	//	if p7 -> coming from st4 or st1: len3 contains what's left
	// We must restore lc/pr even though they might not have been used.
.Lexit2:
	.pred.rel "mutex", p6, p7
(p6)	mov len=len2
(p7)	mov len=len3
	;;
	//
	// .Lexit3: also comes from the alignment head
	//	    len contains bytes left
	//
.Lexit3:
	mov ret0=len
	mov ar.lc=saved_lc
	br.ret.sptk.many rp
END(__do_clear_user)
EXPORT_SYMBOL(__do_clear_user)