Orange Pi5 kernel

Deprecated Linux kernel 5.10.110 for OrangePi 5/5B/5+ boards

3 Commits   0 Branches   0 Tags
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   1) /* SPDX-License-Identifier: GPL-2.0 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   2) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   3)  * arch/alpha/lib/ev6-memset.S
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   4)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   5)  * This is an efficient (and relatively small) implementation of the C library
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   6)  * "memset()" function for the 21264 implementation of Alpha.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   7)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   8)  * 21264 version  contributed by Rick Gorton <rick.gorton@alpha-processor.com>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   9)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  10)  * Much of the information about 21264 scheduling/coding comes from:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  11)  *	Compiler Writer's Guide for the Alpha 21264
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  12)  *	abbreviated as 'CWG' in other comments here
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  13)  *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  14)  * Scheduling notation:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  15)  *	E	- either cluster
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  16)  *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  17)  *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  18)  * The algorithm for the leading and trailing quadwords remains the same,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  19)  * however the loop has been unrolled to enable better memory throughput,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  20)  * and the code has been replicated for each of the entry points: __memset
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  21)  * and __memset16 to permit better scheduling to eliminate the stalling
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  22)  * encountered during the mask replication.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  23)  * A future enhancement might be to put in a byte store loop for really
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  24)  * small (say < 32 bytes) memset()s.  Whether or not that change would be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  25)  * a win in the kernel would depend upon the contextual usage.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  26)  * WARNING: Maintaining this is going to be more work than the above version,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  27)  * as fixes will need to be made in multiple places.  The performance gain
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  28)  * is worth it.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  29)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  30) #include <asm/export.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  31) 	.set noat
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  32) 	.set noreorder
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  33) .text
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  34) 	.globl memset
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  35) 	.globl __memset
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  36) 	.globl ___memset
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  37) 	.globl __memset16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  38) 	.globl __constant_c_memset
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  39) 
/*
 * ___memset - fill a memory region with a single byte value.
 *
 * Register usage, as established by the code below:
 *	$16	destination address (copied to $0 as the return value)
 *	$17	fill value; only the low byte is used, and it is replicated
 *		into all eight bytes of $17 before any store
 *	$18	byte count; a count <= 0 returns without writing
 *	$26	return address
 *	$1-$7	scratch
 *
 * Flow: when the destination and destination + count fall in the same
 * aligned quadword, the bytes are merged and stored at within_quad_b.
 * Otherwise a misaligned head is merged first, whole quadwords are then
 * stored (through the unrolled wh64 loop when at least 16 quadwords
 * remain at aligned_b), and finally 0..7 trailing bytes are merged at
 * no_quad_b.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  40) 	.ent ___memset
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  41) .align 5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  42) ___memset:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  43) 	.frame $30,0,$26,0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  44) 	.prologue 0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  45) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  46) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  47) 	 * Serious stalling happens.  The only way to mitigate this is to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  48) 	 * undertake a major re-write to interleave the constant materialization
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  49) 	 * with other parts of the fall-through code.  This is important, even
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  50) 	 * though it makes maintenance tougher.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  51) 	 * Do this later.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  52) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  53) 	and $17,255,$1		# E : 00000000000000ch
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  54) 	insbl $17,1,$2		# U : 000000000000ch00
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  55) 	bis $16,$16,$0		# E : return value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  56) 	ble $18,end_b		# U : count <= 0, nothing to write
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  57) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  58) 	addq $18,$16,$6		# E : $6 = dest + count (one past the last byte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  59) 	bis	$1,$2,$17	# E : 000000000000chch
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  60) 	insbl	$1,2,$3		# U : 0000000000ch0000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  61) 	insbl	$1,3,$4		# U : 00000000ch000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  62) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  63) 	or	$3,$4,$3	# E : 00000000chch0000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  64) 	inswl	$17,4,$5	# U : 0000chch00000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  65) 	xor	$16,$6,$1	# E : will complete write be within one quadword?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  66) 	inswl	$17,6,$2	# U : chch000000000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  67) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  68) 	or	$17,$3,$17	# E : 00000000chchchch
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  69) 	or	$2,$5,$2	# E : chchchch00000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  70) 	bic	$1,7,$1		# E : fit within a single quadword?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  71) 	and	$16,7,$3	# E : Target addr misalignment
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  72) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  73) 	or	$17,$2,$17	# E : chchchchchchchch
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  74) 	beq	$1,within_quad_b # U : yes, take the single-quad path
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  75) 	nop			# E :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  76) 	beq	$3,aligned_b	# U : target is 0mod8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  77) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  78) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  79) 	 * Target address is misaligned, and won't fit within a quadword
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  80) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  81) 	ldq_u $4,0($16)		# L : Fetch first partial
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  82) 	bis $16,$16,$5		# E : Save the address
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  83) 	insql $17,$16,$2	# U : Insert new bytes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  84) 	subq $3,8,$3		# E : $3 = -(bytes up to the next quad boundary)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  85) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  86) 	addq $18,$3,$18		# E : $18 is new count ($3 is negative)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  87) 	mskql $4,$16,$4		# U : clear relevant parts of the quad
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  88) 	subq $16,$3,$16		# E : $16 is new aligned destination
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  89) 	bis $2,$4,$1		# E : Final bytes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  90) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  91) 	nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  92) 	stq_u $1,0($5)		# L : Store result
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  93) 	nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  94) 	nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  95) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  96) .align 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  97) aligned_b:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  98) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  99) 	 * We are now guaranteed to be quad aligned, with at least
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 100) 	 * one partial quad to write.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 101) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 102) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 103) 	sra $18,3,$3		# U : Number of remaining quads to write
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 104) 	and $18,7,$18		# E : Number of trailing bytes to write
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 105) 	bis $16,$16,$5		# E : Save dest address
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 106) 	beq $3,no_quad_b	# U : tail stuff only
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 107) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 108) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 109) 	 * it's worth the effort to unroll this and use wh64 if possible
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 110) 	 * Lifted a bunch of code from clear_user.S
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 111) 	 * At this point, entry values are:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 112) 	 * $16	Current destination address
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 113) 	 * $5	A copy of $16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 114) 	 * $6	End address (original dest + count, one past the last byte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 115) 	 * $18	Number trailer bytes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 116) 	 * $3	Number quads to write
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 117) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 118) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 119) 	and	$16, 0x3f, $2	# E : Forward work (only useful for unrolled loop)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 120) 	subq	$3, 16, $4	# E : Only try to unroll if >= 128 bytes (16 quads)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 121) 	subq	$2, 0x40, $1	# E : bias counter (aligning stuff 0mod64)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 122) 	blt	$4, loop_b	# U : fewer than 16 quads, use the simple loop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 123) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 124) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 125) 	 * We know we've got at least 16 quads, minimum of one trip
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 126) 	 * through unrolled loop.  Do a quad at a time to get us 0mod64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 127) 	 * aligned.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 128) 	 */
	/*
	 * NOTE(review): $1 = ($16 & 0x3f) - 0x40 is always negative, so the
	 * beq below appears never to be taken and $alignmod64_b runs at least
	 * once; that first trip is also what initializes $4 for the wh64.
	 * Confirm before changing the branch.
	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 129) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 130) 	nop			# E :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 131) 	nop			# E :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 132) 	nop			# E :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 133) 	beq	$1, $bigalign_b	# U :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 134) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 135) $alignmod64_b:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 136) 	stq	$17, 0($5)	# L :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 137) 	subq	$3, 1, $3	# E : For consistency later
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 138) 	addq	$1, 8, $1	# E : Increment towards zero for alignment
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 139) 	addq	$5, 8, $4	# E : Initial wh64 address (filler instruction)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 140) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 141) 	nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 142) 	nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 143) 	addq	$5, 8, $5	# E : Inc address
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 144) 	blt	$1, $alignmod64_b # U : repeat until $5 is 0mod64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 145) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 146) $bigalign_b:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 147) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 148) 	 * $3 - number quads left to go
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 149) 	 * $5 - target address (aligned 0mod64)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 150) 	 * $17 - mask of stuff to store
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 151) 	 * Scratch registers available: $7, $2, $4, $1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 152) 	 * we know that we'll be taking a minimum of one trip through the loop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 153)  	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 154) 	 * Assumes the wh64 needs to be for 2 trips through the loop in the future
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 155) 	 * The wh64 is issued for the starting destination address for trip +2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 156) 	 * through the loop, and if there are less than two trips left, the target
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 157) 	 * address will be for the current trip.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 158) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 159) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 160) $do_wh64_b:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 161) 	wh64	($4)		# L1 : memory subsystem write hint
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 162) 	subq	$3, 24, $2	# E : For determining future wh64 addresses
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 163) 	stq	$17, 0($5)	# L :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 164) 	nop			# E :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 165) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 166) 	addq	$5, 128, $4	# E : speculative target of next wh64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 167) 	stq	$17, 8($5)	# L :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 168) 	stq	$17, 16($5)	# L :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 169) 	addq	$5, 64, $7	# E : Fallback address for wh64 (== next trip addr)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 170) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 171) 	stq	$17, 24($5)	# L :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 172) 	stq	$17, 32($5)	# L :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 173) 	cmovlt	$2, $7, $4	# E : Latency 2, extra mapping cycle
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 174) 	nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 175) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 176) 	stq	$17, 40($5)	# L :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 177) 	stq	$17, 48($5)	# L :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 178) 	subq	$3, 16, $2	# E : Repeat the loop at least once more?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 179) 	nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 180) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 181) 	stq	$17, 56($5)	# L :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 182) 	addq	$5, 64, $5	# E :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 183) 	subq	$3, 8, $3	# E :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 184) 	bge	$2, $do_wh64_b	# U : at least 8 quads remain after this trip
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 185) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 186) 	nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 187) 	nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 188) 	nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 189) 	beq	$3, no_quad_b	# U : Might have finished already
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 190) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 191) .align 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 192) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 193) 	 * Simple loop for trailing quadwords, or for small amounts
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 194) 	 * of data (where we can't use an unrolled loop and wh64)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 195) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 196) loop_b:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 197) 	stq $17,0($5)		# L :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 198) 	subq $3,1,$3		# E : Decrement number quads left
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 199) 	addq $5,8,$5		# E : Inc address
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 200) 	bne $3,loop_b		# U : more?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 201) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 202) no_quad_b:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 203) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 204) 	 * Write 0..7 trailing bytes.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 205) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 206) 	nop			# E :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 207) 	beq $18,end_b		# U : All done?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 208) 	ldq $7,0($5)		# L :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 209) 	mskqh $7,$6,$2		# U : Mask final quad
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 210) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 211) 	insqh $17,$6,$4		# U : New bits
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 212) 	bis $2,$4,$1		# E : Put it all together
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 213) 	stq $1,0($5)		# L : And back to memory
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 214) 	ret $31,($26),1		# L0 :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 215) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 216) within_quad_b:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 217) 	ldq_u $1,0($16)		# L :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 218) 	insql $17,$16,$2	# U : New bits
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 219) 	mskql $1,$16,$4		# U : Clear old
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 220) 	bis $2,$4,$2		# E : New result
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 221) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 222) 	mskql $2,$6,$4		# U : new result, limited to bytes below end ($6)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 223) 	mskqh $1,$6,$2		# U : original quad, bytes from end ($6) upward
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 224) 	bis $2,$4,$1		# E : merge the two halves
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 225) 	stq_u $1,0($16)		# L : store the merged quad
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 226) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 227) end_b:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 228) 	nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 229) 	nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 230) 	nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 231) 	ret $31,($26),1		# L0 :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 232) 	.end ___memset
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 233) 	EXPORT_SYMBOL(___memset)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 234) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 235) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 236) 	 * This is the original body of code, prior to replication and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 237) 	 * rescheduling.  Leave it here, as there may be calls to this
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 238) 	 * entry point.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 239) 	 */
/*
 * __constant_c_memset - fill a memory region from a quadword pattern.
 *
 * Register usage, as established by the code below:
 *	$16	destination address (copied to $0 as the return value)
 *	$17	64-bit pattern, stored as-is; this routine performs no byte
 *		replication on it
 *		(NOTE(review): callers presumably pass the fill byte already
 *		replicated into all eight byte lanes - confirm against callers)
 *	$18	byte count; a count <= 0 returns without writing
 *	$26	return address
 *	$1-$7	scratch
 *
 * Flow: when the destination and destination + count fall in the same
 * aligned quadword, the bytes are merged and stored at within_one_quad.
 * Otherwise a misaligned head is merged first, whole quadwords are then
 * stored (through the unrolled wh64 loop when at least 16 quadwords
 * remain at aligned), and finally 0..7 trailing bytes are merged at
 * no_quad.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 240) .align 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 241) 	.ent __constant_c_memset
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 242) __constant_c_memset:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 243) 	.frame $30,0,$26,0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 244) 	.prologue 0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 245) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 246) 	addq $18,$16,$6		# E : $6 = dest + count (one past the last byte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 247) 	bis $16,$16,$0		# E : return value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 248) 	xor $16,$6,$1		# E : will complete write be within one quadword?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 249) 	ble $18,end		# U : count <= 0, nothing to write
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 250) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 251) 	bic $1,7,$1		# E : fit within a single quadword
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 252) 	beq $1,within_one_quad	# U : yes, take the single-quad path
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 253) 	and $16,7,$3		# E : Target addr misalignment
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 254) 	beq $3,aligned		# U : target is 0mod8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 255) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 256) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 257) 	 * Target address is misaligned, and won't fit within a quadword
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 258) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 259) 	ldq_u $4,0($16)		# L : Fetch first partial
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 260) 	bis $16,$16,$5		# E : Save the address
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 261) 	insql $17,$16,$2	# U : Insert new bytes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 262) 	subq $3,8,$3		# E : $3 = -(bytes up to the next quad boundary)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 263) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 264) 	addq $18,$3,$18		# E : $18 is new count ($3 is negative)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 265) 	mskql $4,$16,$4		# U : clear relevant parts of the quad
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 266) 	subq $16,$3,$16		# E : $16 is new aligned destination
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 267) 	bis $2,$4,$1		# E : Final bytes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 268) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 269) 	nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 270) 	stq_u $1,0($5)		# L : Store result
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 271) 	nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 272) 	nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 273) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 274) .align 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 275) aligned:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 276) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 277) 	 * We are now guaranteed to be quad aligned, with at least
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 278) 	 * one partial quad to write.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 279) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 280) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 281) 	sra $18,3,$3		# U : Number of remaining quads to write
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 282) 	and $18,7,$18		# E : Number of trailing bytes to write
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 283) 	bis $16,$16,$5		# E : Save dest address
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 284) 	beq $3,no_quad		# U : tail stuff only
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 285) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 286) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 287) 	 * it's worth the effort to unroll this and use wh64 if possible
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 288) 	 * Lifted a bunch of code from clear_user.S
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 289) 	 * At this point, entry values are:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 290) 	 * $16	Current destination address
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 291) 	 * $5	A copy of $16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 292) 	 * $6	End address (original dest + count, one past the last byte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 293) 	 * $18	Number trailer bytes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 294) 	 * $3	Number quads to write
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 295) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 296) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 297) 	and	$16, 0x3f, $2	# E : Forward work (only useful for unrolled loop)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 298) 	subq	$3, 16, $4	# E : Only try to unroll if >= 128 bytes (16 quads)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 299) 	subq	$2, 0x40, $1	# E : bias counter (aligning stuff 0mod64)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 300) 	blt	$4, loop	# U : fewer than 16 quads, use the simple loop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 301) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 302) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 303) 	 * We know we've got at least 16 quads, minimum of one trip
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 304) 	 * through unrolled loop.  Do a quad at a time to get us 0mod64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 305) 	 * aligned.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 306) 	 */
	/*
	 * NOTE(review): $1 = ($16 & 0x3f) - 0x40 is always negative, so the
	 * beq below appears never to be taken and $alignmod64 runs at least
	 * once; that first trip is also what initializes $4 for the wh64.
	 * Confirm before changing the branch.
	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 307) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 308) 	nop			# E :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 309) 	nop			# E :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 310) 	nop			# E :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 311) 	beq	$1, $bigalign	# U :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 312) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 313) $alignmod64:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 314) 	stq	$17, 0($5)	# L :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 315) 	subq	$3, 1, $3	# E : For consistency later
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 316) 	addq	$1, 8, $1	# E : Increment towards zero for alignment
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 317) 	addq	$5, 8, $4	# E : Initial wh64 address (filler instruction)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 318) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 319) 	nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 320) 	nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 321) 	addq	$5, 8, $5	# E : Inc address
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 322) 	blt	$1, $alignmod64	# U : repeat until $5 is 0mod64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 323) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 324) $bigalign:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 325) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 326) 	 * $3 - number quads left to go
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 327) 	 * $5 - target address (aligned 0mod64)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 328) 	 * $17 - mask of stuff to store
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 329) 	 * Scratch registers available: $7, $2, $4, $1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 330) 	 * we know that we'll be taking a minimum of one trip through the loop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 331)  	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 332) 	 * Assumes the wh64 needs to be for 2 trips through the loop in the future
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 333) 	 * The wh64 is issued for the starting destination address for trip +2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 334) 	 * through the loop, and if there are less than two trips left, the target
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 335) 	 * address will be for the current trip.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 336) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 337) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 338) $do_wh64:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 339) 	wh64	($4)		# L1 : memory subsystem write hint
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 340) 	subq	$3, 24, $2	# E : For determining future wh64 addresses
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 341) 	stq	$17, 0($5)	# L :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 342) 	nop			# E :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 343) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 344) 	addq	$5, 128, $4	# E : speculative target of next wh64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 345) 	stq	$17, 8($5)	# L :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 346) 	stq	$17, 16($5)	# L :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 347) 	addq	$5, 64, $7	# E : Fallback address for wh64 (== next trip addr)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 348) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 349) 	stq	$17, 24($5)	# L :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 350) 	stq	$17, 32($5)	# L :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 351) 	cmovlt	$2, $7, $4	# E : Latency 2, extra mapping cycle
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 352) 	nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 353) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 354) 	stq	$17, 40($5)	# L :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 355) 	stq	$17, 48($5)	# L :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 356) 	subq	$3, 16, $2	# E : Repeat the loop at least once more?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 357) 	nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 358) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 359) 	stq	$17, 56($5)	# L :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 360) 	addq	$5, 64, $5	# E :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 361) 	subq	$3, 8, $3	# E :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 362) 	bge	$2, $do_wh64	# U : at least 8 quads remain after this trip
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 363) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 364) 	nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 365) 	nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 366) 	nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 367) 	beq	$3, no_quad	# U : Might have finished already
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 368) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 369) .align 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 370) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 371) 	 * Simple loop for trailing quadwords, or for small amounts
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 372) 	 * of data (where we can't use an unrolled loop and wh64)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 373) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 374) loop:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 375) 	stq $17,0($5)		# L :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 376) 	subq $3,1,$3		# E : Decrement number quads left
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 377) 	addq $5,8,$5		# E : Inc address
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 378) 	bne $3,loop		# U : more?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 379) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 380) no_quad:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 381) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 382) 	 * Write 0..7 trailing bytes.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 383) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 384) 	nop			# E :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 385) 	beq $18,end		# U : All done?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 386) 	ldq $7,0($5)		# L :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 387) 	mskqh $7,$6,$2		# U : Mask final quad
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 388) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 389) 	insqh $17,$6,$4		# U : New bits
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 390) 	bis $2,$4,$1		# E : Put it all together
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 391) 	stq $1,0($5)		# L : And back to memory
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 392) 	ret $31,($26),1		# L0 :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 393) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 394) within_one_quad:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 395) 	ldq_u $1,0($16)		# L :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 396) 	insql $17,$16,$2	# U : New bits
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 397) 	mskql $1,$16,$4		# U : Clear old
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 398) 	bis $2,$4,$2		# E : New result
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 399) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 400) 	mskql $2,$6,$4		# U : new result, limited to bytes below end ($6)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 401) 	mskqh $1,$6,$2		# U : original quad, bytes from end ($6) upward
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 402) 	bis $2,$4,$1		# E : merge the two halves
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 403) 	stq_u $1,0($16)		# L : store the merged quad
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 404) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 405) end:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 406) 	nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 407) 	nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 408) 	nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 409) 	ret $31,($26),1		# L0 :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 410) 	.end __constant_c_memset
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 411) 	EXPORT_SYMBOL(__constant_c_memset)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 412) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 413) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 414) 	 * This is a replica of the __constant_c_memset code, rescheduled
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 415) 	 * to mask stalls.  Note that entry point names also had to change.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 416) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 417) 	.align 5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 418) 	.ent __memset16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 419) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 420) __memset16:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 421) 	.frame $30,0,$26,0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 422) 	.prologue 0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 423) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 424) 	inswl $17,0,$5		# U : 000000000000c1c2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 425) 	inswl $17,2,$2		# U : 00000000c1c20000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 426) 	bis $16,$16,$0		# E : return value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 427) 	addq	$18,$16,$6	# E : max address to write to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 428) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 429) 	ble $18, end_w		# U : zero length requested?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 430) 	inswl	$17,4,$3	# U : 0000c1c200000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 431) 	inswl	$17,6,$4	# U : c1c2000000000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 432) 	xor	$16,$6,$1	# E : will complete write be within one quadword?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 433) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 434) 	or	$2,$5,$2	# E : 00000000c1c2c1c2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 435) 	or	$3,$4,$17	# E : c1c2c1c200000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 436) 	bic	$1,7,$1		# E : fit within a single quadword
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 437) 	and	$16,7,$3	# E : Target addr misalignment
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 438) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 439) 	or	$17,$2,$17	# E : c1c2c1c2c1c2c1c2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 440) 	beq $1,within_quad_w	# U :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 441) 	nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 442) 	beq $3,aligned_w	# U : target is 0mod8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 443) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 444) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 445) 	 * Target address is misaligned, and won't fit within a quadword
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 446) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 447) 	ldq_u $4,0($16)		# L : Fetch first partial
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 448) 	bis $16,$16,$5		# E : Save the address
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 449) 	insql $17,$16,$2	# U : Insert new bytes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 450) 	subq $3,8,$3		# E : Invert (for addressing uses)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 451) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 452) 	addq $18,$3,$18		# E : $18 is new count ($3 is negative)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 453) 	mskql $4,$16,$4		# U : clear relevant parts of the quad
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 454) 	subq $16,$3,$16		# E : $16 is new aligned destination
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 455) 	bis $2,$4,$1		# E : Final bytes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 456) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 457) 	nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 458) 	stq_u $1,0($5)		# L : Store result
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 459) 	nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 460) 	nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 461) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 462) .align 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 463) aligned_w:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 464) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 465) 	 * We are now guaranteed to be quad aligned, with at least
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 466) 	 * one partial quad to write.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 467) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 468) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 469) 	sra $18,3,$3		# U : Number of remaining quads to write
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 470) 	and $18,7,$18		# E : Number of trailing bytes to write
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 471) 	bis $16,$16,$5		# E : Save dest address
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 472) 	beq $3,no_quad_w	# U : tail stuff only
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 473) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 474) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 475) 	 * it's worth the effort to unroll this and use wh64 if possible
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 476) 	 * Lifted a bunch of code from clear_user.S
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 477) 	 * At this point, entry values are:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 478) 	 * $16	Current destination address
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 479) 	 * $5	A copy of $16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 480) 	 * $6	The max quadword address to write to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 481) 	 * $18	Number trailer bytes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 482) 	 * $3	Number quads to write
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 483) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 484) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 485) 	and	$16, 0x3f, $2	# E : Forward work (only useful for unrolled loop)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 486) 	subq	$3, 16, $4	# E : Only try to unroll if > 128 bytes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 487) 	subq	$2, 0x40, $1	# E : bias counter (aligning stuff 0mod64)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 488) 	blt	$4, loop_w	# U :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 489) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 490) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 491) 	 * We know we've got at least 16 quads, minimum of one trip
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 492) 	 * through unrolled loop.  Do a quad at a time to get us 0mod64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 493) 	 * aligned.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 494) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 495) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 496) 	nop			# E :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 497) 	nop			# E :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 498) 	nop			# E :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 499) 	beq	$1, $bigalign_w	# U :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 500) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 501) $alignmod64_w:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 502) 	stq	$17, 0($5)	# L :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 503) 	subq	$3, 1, $3	# E : For consistency later
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 504) 	addq	$1, 8, $1	# E : Increment towards zero for alignment
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 505) 	addq	$5, 8, $4	# E : Initial wh64 address (filler instruction)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 506) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 507) 	nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 508) 	nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 509) 	addq	$5, 8, $5	# E : Inc address
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 510) 	blt	$1, $alignmod64_w	# U :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 511) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 512) $bigalign_w:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 513) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 514) 	 * $3 - number quads left to go
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 515) 	 * $5 - target address (aligned 0mod64)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 516) 	 * $17 - mask of stuff to store
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 517) 	 * Scratch registers available: $7, $2, $4, $1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 518) 	 * we know that we'll be taking a minimum of one trip through
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 519)  	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 520) 	 * Assumes the wh64 needs to be for 2 trips through the loop in the future
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 521) 	 * The wh64 is issued on for the starting destination address for trip +2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 522) 	 * through the loop, and if there are less than two trips left, the target
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 523) 	 * address will be for the current trip.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 524) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 525) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 526) $do_wh64_w:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 527) 	wh64	($4)		# L1 : memory subsystem write hint
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 528) 	subq	$3, 24, $2	# E : For determining future wh64 addresses
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 529) 	stq	$17, 0($5)	# L :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 530) 	nop			# E :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 531) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 532) 	addq	$5, 128, $4	# E : speculative target of next wh64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 533) 	stq	$17, 8($5)	# L :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 534) 	stq	$17, 16($5)	# L :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 535) 	addq	$5, 64, $7	# E : Fallback address for wh64 (== next trip addr)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 536) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 537) 	stq	$17, 24($5)	# L :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 538) 	stq	$17, 32($5)	# L :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 539) 	cmovlt	$2, $7, $4	# E : Latency 2, extra mapping cycle
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 540) 	nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 541) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 542) 	stq	$17, 40($5)	# L :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 543) 	stq	$17, 48($5)	# L :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 544) 	subq	$3, 16, $2	# E : Repeat the loop at least once more?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 545) 	nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 546) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 547) 	stq	$17, 56($5)	# L :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 548) 	addq	$5, 64, $5	# E :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 549) 	subq	$3, 8, $3	# E :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 550) 	bge	$2, $do_wh64_w	# U :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 551) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 552) 	nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 553) 	nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 554) 	nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 555) 	beq	$3, no_quad_w	# U : Might have finished already
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 556) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 557) .align 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 558) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 559) 	 * Simple loop for trailing quadwords, or for small amounts
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 560) 	 * of data (where we can't use an unrolled loop and wh64)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 561) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 562) loop_w:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 563) 	stq $17,0($5)		# L :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 564) 	subq $3,1,$3		# E : Decrement number quads left
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 565) 	addq $5,8,$5		# E : Inc address
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 566) 	bne $3,loop_w		# U : more?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 567) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 568) no_quad_w:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 569) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 570) 	 * Write 0..7 trailing bytes.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 571) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 572) 	nop			# E :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 573) 	beq $18,end_w		# U : All done?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 574) 	ldq $7,0($5)		# L :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 575) 	mskqh $7,$6,$2		# U : Mask final quad
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 576) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 577) 	insqh $17,$6,$4		# U : New bits
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 578) 	bis $2,$4,$1		# E : Put it all together
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 579) 	stq $1,0($5)		# L : And back to memory
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 580) 	ret $31,($26),1		# L0 :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 581) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 582) within_quad_w:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 583) 	ldq_u $1,0($16)		# L :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 584) 	insql $17,$16,$2	# U : New bits
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 585) 	mskql $1,$16,$4		# U : Clear old
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 586) 	bis $2,$4,$2		# E : New result
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 587) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 588) 	mskql $2,$6,$4		# U :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 589) 	mskqh $1,$6,$2		# U :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 590) 	bis $2,$4,$1		# E :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 591) 	stq_u $1,0($16)		# L :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 592) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 593) end_w:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 594) 	nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 595) 	nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 596) 	nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 597) 	ret $31,($26),1		# L0 :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 598) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 599) 	.end __memset16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 600) 	EXPORT_SYMBOL(__memset16)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 601) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 602) memset = ___memset	# alias; ___memset is defined outside this excerpt
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 603) __memset = ___memset	# second name for the same ___memset symbol
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 604) 	EXPORT_SYMBOL(memset)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 605) 	EXPORT_SYMBOL(__memset)