/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev6-memset.S
 *
 * This is an efficient (and relatively small) implementation of the C library
 * "memset()" function for the 21264 implementation of Alpha.
 *
 * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 * The algorithm for the leading and trailing quadwords remains the same,
 * however the loop has been unrolled to enable better memory throughput,
 * and the code has been replicated for each of the entry points: __memset
 * and __memset16 to permit better scheduling to eliminate the stalling
 * encountered during the mask replication.
 * A future enhancement might be to put in a byte store loop for really
 * small (say < 32 bytes) memset()s.  Whether or not that change would be
 * a win in the kernel would depend upon the contextual usage.
 * WARNING: Maintaining this is going to be more work than the above version,
 * as fixes will need to be made in multiple places.  The performance gain
 * is worth it.
 */
#include <asm/export.h>
	.set noat
	.set noreorder
.text
	.globl memset
	.globl __memset
	.globl ___memset
	.globl __memset16
	.globl __constant_c_memset

	.ent ___memset
	.align 5
___memset:
	.frame $30,0,$26,0
	.prologue 0

	/*
	 * Serious stalling happens.  The only way to mitigate this is to
	 * undertake a major re-write to interleave the constant materialization
	 * with other parts of the fall-through code.  This is important, even
	 * though it makes maintenance tougher.
	 * Do this later.
	 */
	and	$17,255,$1	# E : 00000000000000ch
	insbl	$17,1,$2	# U : 000000000000ch00
	bis	$16,$16,$0	# E : return value
	ble	$18,end_b	# U : zero length requested?

	addq	$18,$16,$6	# E : max address to write to
	bis	$1,$2,$17	# E : 000000000000chch
	insbl	$1,2,$3		# U : 0000000000ch0000
	insbl	$1,3,$4		# U : 00000000ch000000

	or	$3,$4,$3	# E : 00000000chch0000
	inswl	$17,4,$5	# U : 0000chch00000000
	xor	$16,$6,$1	# E : will complete write be within one quadword?
	inswl	$17,6,$2	# U : chch000000000000

	or	$17,$3,$17	# E : 00000000chchchch
	or	$2,$5,$2	# E : chchchch00000000
	bic	$1,7,$1		# E : fit within a single quadword?
	and	$16,7,$3	# E : Target addr misalignment

	or	$17,$2,$17	# E : chchchchchchchch
	beq	$1,within_quad_b	# U :
	nop			# E :
	beq	$3,aligned_b	# U : target is 0mod8

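	/*
	 * The insbl/inswl/or sequence above replicates the low byte of $17
	 * across all eight bytes of a quadword.  A rough C equivalent,
	 * assuming a 64-bit unsigned long (sketch only, not kernel API):
	 *
	 *	unsigned long c64 = c & 0xff;	// 00000000000000ch
	 *	c64 |= c64 << 8;		// 000000000000chch
	 *	c64 |= c64 << 16;		// 00000000chchchch
	 *	c64 |= c64 << 32;		// chchchchchchchch
	 *
	 * i.e. c64 = (c & 0xff) * 0x0101010101010101UL.  The assembly builds
	 * the partial patterns with insert/or pairs so the work can spread
	 * across both issue clusters instead of a dependent shift/or chain.
	 */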
	/*
	 * Target address is misaligned, and won't fit within a quadword
	 */
	ldq_u	$4,0($16)	# L : Fetch first partial
	bis	$16,$16,$5	# E : Save the address
	insql	$17,$16,$2	# U : Insert new bytes
	subq	$3,8,$3		# E : Invert (for addressing uses)

	addq	$18,$3,$18	# E : $18 is new count ($3 is negative)
	mskql	$4,$16,$4	# U : clear relevant parts of the quad
	subq	$16,$3,$16	# E : $16 is new aligned destination
	bis	$2,$4,$1	# E : Final bytes

	nop
	stq_u	$1,0($5)	# L : Store result
	nop
	nop

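	/*
	 * The head fixup above, roughly in C (dst, count and c are
	 * illustrative names for $16, $18 and the fill byte).  The assembly
	 * avoids byte stores by doing one masked read-modify-write of the
	 * containing quadword:
	 *
	 *	unsigned int head = 8 - ((unsigned long)dst & 7);
	 *	unsigned int i;
	 *
	 *	for (i = 0; i < head; i++)	// fill up to the next
	 *		((char *)dst)[i] = c;	// quadword boundary
	 *	dst = (char *)dst + head;	// now 0mod8
	 *	count -= head;
	 */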
	.align 4
aligned_b:
	/*
	 * We are now guaranteed to be quad aligned, with at least
	 * one partial quad to write.
	 */

	sra	$18,3,$3	# U : Number of remaining quads to write
	and	$18,7,$18	# E : Number of trailing bytes to write
	bis	$16,$16,$5	# E : Save dest address
	beq	$3,no_quad_b	# U : tail stuff only

	/*
	 * it's worth the effort to unroll this and use wh64 if possible
	 * Lifted a bunch of code from clear_user.S
	 * At this point, entry values are:
	 *	$16	Current destination address
	 *	$5	A copy of $16
	 *	$6	The max quadword address to write to
	 *	$18	Number trailer bytes
	 *	$3	Number quads to write
	 */

	and	$16, 0x3f, $2	# E : Forward work (only useful for unrolled loop)
	subq	$3, 16, $4	# E : Only try to unroll if >= 128 bytes
	subq	$2, 0x40, $1	# E : bias counter (aligning stuff 0mod64)
	blt	$4, loop_b	# U :

	/*
	 * We know we've got at least 16 quads, minimum of one trip
	 * through unrolled loop.  Do a quad at a time to get us 0mod64
	 * aligned.
	 */

	nop			# E :
	nop			# E :
	nop			# E :
	beq	$1, $bigalign_b	# U :

$alignmod64_b:
	stq	$17, 0($5)	# L :
	subq	$3, 1, $3	# E : For consistency later
	addq	$1, 8, $1	# E : Increment towards zero for alignment
	addq	$5, 8, $4	# E : Initial wh64 address (filler instruction)

	nop
	nop
	addq	$5, 8, $5	# E : Inc address
	blt	$1, $alignmod64_b	# U :

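	/*
	 * The peeling loop above, roughly in C (the assembly counts with a
	 * pre-biased register, $1, rather than re-testing the address, but
	 * the effect is the same; c64 is the replicated pattern):
	 *
	 *	while ((unsigned long)dst & 63) {	// reach 0mod64
	 *		*(unsigned long *)dst = c64;
	 *		dst = (char *)dst + 8;
	 *		quads--;
	 *	}
	 */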
$bigalign_b:
	/*
	 * $3 - number quads left to go
	 * $5 - target address (aligned 0mod64)
	 * $17 - mask of stuff to store
	 * Scratch registers available: $7, $2, $4, $1
	 * we know that we'll be taking a minimum of one trip through the loop.
	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
	 * Assumes the wh64 needs to be for 2 trips through the loop in the future
	 * The wh64 is issued for the starting destination address for trip +2
	 * through the loop, and if there are fewer than two trips left, the target
	 * address will be for the current trip.
	 */

$do_wh64_b:
	wh64	($4)		# L1 : memory subsystem write hint
	subq	$3, 24, $2	# E : For determining future wh64 addresses
	stq	$17, 0($5)	# L :
	nop			# E :

	addq	$5, 128, $4	# E : speculative target of next wh64
	stq	$17, 8($5)	# L :
	stq	$17, 16($5)	# L :
	addq	$5, 64, $7	# E : Fallback address for wh64 (== next trip addr)

	stq	$17, 24($5)	# L :
	stq	$17, 32($5)	# L :
	cmovlt	$2, $7, $4	# E : Latency 2, extra mapping cycle
	nop

	stq	$17, 40($5)	# L :
	stq	$17, 48($5)	# L :
	subq	$3, 16, $2	# E : Repeat the loop at least once more?
	nop

	stq	$17, 56($5)	# L :
	addq	$5, 64, $5	# E :
	subq	$3, 8, $3	# E :
	bge	$2, $do_wh64_b	# U :

	nop
	nop
	nop
	beq	$3, no_quad_b	# U : Might have finished already

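	/*
	 * The unrolled loop above, roughly in C.  write_hint64() is an
	 * illustrative stand-in for the wh64 instruction, which tells the
	 * memory system that the whole 64-byte block will be written and
	 * therefore need not be fetched first:
	 *
	 *	do {
	 *		// hint the block two trips ahead, or the next block
	 *		// when fewer than two further full trips remain
	 *		write_hint64(quads >= 24 ? dst + 128 : dst + 64);
	 *		for (i = 0; i < 8; i++)
	 *			((unsigned long *)dst)[i] = c64;
	 *		dst = (char *)dst + 64;
	 *		quads -= 8;
	 *	} while (quads >= 8);
	 */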
	.align 4
	/*
	 * Simple loop for trailing quadwords, or for small amounts
	 * of data (where we can't use an unrolled loop and wh64)
	 */
loop_b:
	stq	$17,0($5)	# L :
	subq	$3,1,$3		# E : Decrement number quads left
	addq	$5,8,$5		# E : Inc address
	bne	$3,loop_b	# U : more?

no_quad_b:
	/*
	 * Write 0..7 trailing bytes.
	 */
	nop			# E :
	beq	$18,end_b	# U : All done?
	ldq	$7,0($5)	# L :
	mskqh	$7,$6,$2	# U : Mask final quad

	insqh	$17,$6,$4	# U : New bits
	bis	$2,$4,$1	# E : Put it all together
	stq	$1,0($5)	# L : And back to memory
	ret	$31,($26),1	# L0 :

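	/*
	 * The 0..7 trailing bytes above, roughly in C (tail stands for $18;
	 * dst is 8-byte aligned here).  The ldq/mskqh/insqh/stq group is a
	 * single masked read-modify-write of the final quadword:
	 *
	 *	unsigned int i;
	 *
	 *	for (i = 0; i < tail; i++)
	 *		((char *)dst)[i] = c;
	 */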
within_quad_b:
	ldq_u	$1,0($16)	# L :
	insql	$17,$16,$2	# U : New bits
	mskql	$1,$16,$4	# U : Clear old
	bis	$2,$4,$2	# E : New result

	mskql	$2,$6,$4	# U :
	mskqh	$1,$6,$2	# U :
	bis	$2,$4,$1	# E :
	stq_u	$1,0($16)	# L :

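	/*
	 * Everything fits in one quadword, so old data is kept on both sides
	 * of the store.  A rough C model of the double masking above (dst,
	 * count and c64 are illustrative names; count > 0 and the range does
	 * not reach the next quadword boundary on this path):
	 *
	 *	unsigned long *q = (unsigned long *)((unsigned long)dst & ~7UL);
	 *	unsigned int s = (unsigned long)dst & 7;
	 *	unsigned long lo = ~0UL << (8 * s);		// bytes [s..7]
	 *	unsigned long hi = ~0UL << (8 * (s + count));	// bytes past the end
	 *	unsigned long mask = lo & ~hi;			// bytes being set
	 *
	 *	*q = (*q & ~mask) | (c64 & mask);
	 */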
end_b:
	nop
	nop
	nop
	ret	$31,($26),1	# L0 :
	.end ___memset
	EXPORT_SYMBOL(___memset)

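/*
 * For reference, a compact C-level model of the flow implemented by
 * ___memset above (a functionally equivalent sketch only: it ignores the
 * 21264 scheduling, the wh64 hints and the quadword read-modify-writes
 * that replace the byte loops; the function name is illustrative):
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	void *ev6_memset_model(void *dst, int c, size_t count)
 *	{
 *		char *p = dst;
 *		uint64_t c64 = (uint64_t)(c & 0xff) * 0x0101010101010101ULL;
 *
 *		// leading partial quadword
 *		while (count && ((uintptr_t)p & 7)) {
 *			*p++ = c;
 *			count--;
 *		}
 *		// whole quadwords; the assembly unrolls this 8x with wh64
 *		// once at least 16 quadwords remain
 *		while (count >= 8) {
 *			*(uint64_t *)p = c64;
 *			p += 8;
 *			count -= 8;
 *		}
 *		// 0..7 trailing bytes
 *		while (count) {
 *			*p++ = c;
 *			count--;
 *		}
 *		return dst;
 *	}
 */
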
/*
 * This is the original body of code, prior to replication and
 * rescheduling.  Leave it here, as there may be calls to this
 * entry point.
 */
	.align 4
	.ent __constant_c_memset
__constant_c_memset:
	.frame $30,0,$26,0
	.prologue 0

	addq	$18,$16,$6	# E : max address to write to
	bis	$16,$16,$0	# E : return value
	xor	$16,$6,$1	# E : will complete write be within one quadword?
	ble	$18,end		# U : zero length requested?

	bic	$1,7,$1		# E : fit within a single quadword
	beq	$1,within_one_quad	# U :
	and	$16,7,$3	# E : Target addr misalignment
	beq	$3,aligned	# U : target is 0mod8

	/*
	 * Target address is misaligned, and won't fit within a quadword
	 */
	ldq_u	$4,0($16)	# L : Fetch first partial
	bis	$16,$16,$5	# E : Save the address
	insql	$17,$16,$2	# U : Insert new bytes
	subq	$3,8,$3		# E : Invert (for addressing uses)

	addq	$18,$3,$18	# E : $18 is new count ($3 is negative)
	mskql	$4,$16,$4	# U : clear relevant parts of the quad
	subq	$16,$3,$16	# E : $16 is new aligned destination
	bis	$2,$4,$1	# E : Final bytes

	nop
	stq_u	$1,0($5)	# L : Store result
	nop
	nop

	.align 4
aligned:
	/*
	 * We are now guaranteed to be quad aligned, with at least
	 * one partial quad to write.
	 */

	sra	$18,3,$3	# U : Number of remaining quads to write
	and	$18,7,$18	# E : Number of trailing bytes to write
	bis	$16,$16,$5	# E : Save dest address
	beq	$3,no_quad	# U : tail stuff only

	/*
	 * it's worth the effort to unroll this and use wh64 if possible
	 * Lifted a bunch of code from clear_user.S
	 * At this point, entry values are:
	 *	$16	Current destination address
	 *	$5	A copy of $16
	 *	$6	The max quadword address to write to
	 *	$18	Number trailer bytes
	 *	$3	Number quads to write
	 */

	and	$16, 0x3f, $2	# E : Forward work (only useful for unrolled loop)
	subq	$3, 16, $4	# E : Only try to unroll if >= 128 bytes
	subq	$2, 0x40, $1	# E : bias counter (aligning stuff 0mod64)
	blt	$4, loop	# U :

	/*
	 * We know we've got at least 16 quads, minimum of one trip
	 * through unrolled loop.  Do a quad at a time to get us 0mod64
	 * aligned.
	 */

	nop			# E :
	nop			# E :
	nop			# E :
	beq	$1, $bigalign	# U :

$alignmod64:
	stq	$17, 0($5)	# L :
	subq	$3, 1, $3	# E : For consistency later
	addq	$1, 8, $1	# E : Increment towards zero for alignment
	addq	$5, 8, $4	# E : Initial wh64 address (filler instruction)

	nop
	nop
	addq	$5, 8, $5	# E : Inc address
	blt	$1, $alignmod64	# U :

$bigalign:
	/*
	 * $3 - number quads left to go
	 * $5 - target address (aligned 0mod64)
	 * $17 - mask of stuff to store
	 * Scratch registers available: $7, $2, $4, $1
	 * we know that we'll be taking a minimum of one trip through the loop.
	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
	 * Assumes the wh64 needs to be for 2 trips through the loop in the future
	 * The wh64 is issued for the starting destination address for trip +2
	 * through the loop, and if there are fewer than two trips left, the target
	 * address will be for the current trip.
	 */

$do_wh64:
	wh64	($4)		# L1 : memory subsystem write hint
	subq	$3, 24, $2	# E : For determining future wh64 addresses
	stq	$17, 0($5)	# L :
	nop			# E :

	addq	$5, 128, $4	# E : speculative target of next wh64
	stq	$17, 8($5)	# L :
	stq	$17, 16($5)	# L :
	addq	$5, 64, $7	# E : Fallback address for wh64 (== next trip addr)

	stq	$17, 24($5)	# L :
	stq	$17, 32($5)	# L :
	cmovlt	$2, $7, $4	# E : Latency 2, extra mapping cycle
	nop

	stq	$17, 40($5)	# L :
	stq	$17, 48($5)	# L :
	subq	$3, 16, $2	# E : Repeat the loop at least once more?
	nop

	stq	$17, 56($5)	# L :
	addq	$5, 64, $5	# E :
	subq	$3, 8, $3	# E :
	bge	$2, $do_wh64	# U :

	nop
	nop
	nop
	beq	$3, no_quad	# U : Might have finished already

	.align 4
	/*
	 * Simple loop for trailing quadwords, or for small amounts
	 * of data (where we can't use an unrolled loop and wh64)
	 */
loop:
	stq	$17,0($5)	# L :
	subq	$3,1,$3		# E : Decrement number quads left
	addq	$5,8,$5		# E : Inc address
	bne	$3,loop		# U : more?

no_quad:
	/*
	 * Write 0..7 trailing bytes.
	 */
	nop			# E :
	beq	$18,end		# U : All done?
	ldq	$7,0($5)	# L :
	mskqh	$7,$6,$2	# U : Mask final quad

	insqh	$17,$6,$4	# U : New bits
	bis	$2,$4,$1	# E : Put it all together
	stq	$1,0($5)	# L : And back to memory
	ret	$31,($26),1	# L0 :

within_one_quad:
	ldq_u	$1,0($16)	# L :
	insql	$17,$16,$2	# U : New bits
	mskql	$1,$16,$4	# U : Clear old
	bis	$2,$4,$2	# E : New result

	mskql	$2,$6,$4	# U :
	mskqh	$1,$6,$2	# U :
	bis	$2,$4,$1	# E :
	stq_u	$1,0($16)	# L :

end:
	nop
	nop
	nop
	ret	$31,($26),1	# L0 :
	.end __constant_c_memset
	EXPORT_SYMBOL(__constant_c_memset)

/*
 * This is a replica of the __constant_c_memset code, rescheduled
 * to mask stalls.  Note that the entry point names also had to change.
 */
	.align 5
	.ent __memset16

__memset16:
	.frame $30,0,$26,0
	.prologue 0

	inswl	$17,0,$5	# U : 000000000000c1c2
	inswl	$17,2,$2	# U : 00000000c1c20000
	bis	$16,$16,$0	# E : return value
	addq	$18,$16,$6	# E : max address to write to

	ble	$18, end_w	# U : zero length requested?
	inswl	$17,4,$3	# U : 0000c1c200000000
	inswl	$17,6,$4	# U : c1c2000000000000
	xor	$16,$6,$1	# E : will complete write be within one quadword?

	or	$2,$5,$2	# E : 00000000c1c2c1c2
	or	$3,$4,$17	# E : c1c2c1c200000000
	bic	$1,7,$1		# E : fit within a single quadword
	and	$16,7,$3	# E : Target addr misalignment

	or	$17,$2,$17	# E : c1c2c1c2c1c2c1c2
	beq	$1,within_quad_w	# U :
	nop
	beq	$3,aligned_w	# U : target is 0mod8

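	/*
	 * The inswl/or sequence above replicates the low 16 bits of $17
	 * across the quadword.  Rough C equivalent, assuming a 64-bit
	 * unsigned long:
	 *
	 *	unsigned long c64 = c & 0xffff;	// 000000000000c1c2
	 *	c64 |= c64 << 16;		// 00000000c1c2c1c2
	 *	c64 |= c64 << 32;		// c1c2c1c2c1c2c1c2
	 *
	 * i.e. c64 = (c & 0xffff) * 0x0001000100010001UL.  From here on,
	 * __memset16 is the same quadword algorithm as ___memset above.
	 */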
	/*
	 * Target address is misaligned, and won't fit within a quadword
	 */
	ldq_u	$4,0($16)	# L : Fetch first partial
	bis	$16,$16,$5	# E : Save the address
	insql	$17,$16,$2	# U : Insert new bytes
	subq	$3,8,$3		# E : Invert (for addressing uses)

	addq	$18,$3,$18	# E : $18 is new count ($3 is negative)
	mskql	$4,$16,$4	# U : clear relevant parts of the quad
	subq	$16,$3,$16	# E : $16 is new aligned destination
	bis	$2,$4,$1	# E : Final bytes

	nop
	stq_u	$1,0($5)	# L : Store result
	nop
	nop

	.align 4
aligned_w:
	/*
	 * We are now guaranteed to be quad aligned, with at least
	 * one partial quad to write.
	 */

	sra	$18,3,$3	# U : Number of remaining quads to write
	and	$18,7,$18	# E : Number of trailing bytes to write
	bis	$16,$16,$5	# E : Save dest address
	beq	$3,no_quad_w	# U : tail stuff only

	/*
	 * it's worth the effort to unroll this and use wh64 if possible
	 * Lifted a bunch of code from clear_user.S
	 * At this point, entry values are:
	 *	$16	Current destination address
	 *	$5	A copy of $16
	 *	$6	The max quadword address to write to
	 *	$18	Number trailer bytes
	 *	$3	Number quads to write
	 */

	and	$16, 0x3f, $2	# E : Forward work (only useful for unrolled loop)
	subq	$3, 16, $4	# E : Only try to unroll if >= 128 bytes
	subq	$2, 0x40, $1	# E : bias counter (aligning stuff 0mod64)
	blt	$4, loop_w	# U :

	/*
	 * We know we've got at least 16 quads, minimum of one trip
	 * through unrolled loop.  Do a quad at a time to get us 0mod64
	 * aligned.
	 */

	nop			# E :
	nop			# E :
	nop			# E :
	beq	$1, $bigalign_w	# U :

$alignmod64_w:
	stq	$17, 0($5)	# L :
	subq	$3, 1, $3	# E : For consistency later
	addq	$1, 8, $1	# E : Increment towards zero for alignment
	addq	$5, 8, $4	# E : Initial wh64 address (filler instruction)

	nop
	nop
	addq	$5, 8, $5	# E : Inc address
	blt	$1, $alignmod64_w	# U :

$bigalign_w:
	/*
	 * $3 - number quads left to go
	 * $5 - target address (aligned 0mod64)
	 * $17 - mask of stuff to store
	 * Scratch registers available: $7, $2, $4, $1
	 * we know that we'll be taking a minimum of one trip through the loop.
	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
	 * Assumes the wh64 needs to be for 2 trips through the loop in the future
	 * The wh64 is issued for the starting destination address for trip +2
	 * through the loop, and if there are fewer than two trips left, the target
	 * address will be for the current trip.
	 */

$do_wh64_w:
	wh64	($4)		# L1 : memory subsystem write hint
	subq	$3, 24, $2	# E : For determining future wh64 addresses
	stq	$17, 0($5)	# L :
	nop			# E :

	addq	$5, 128, $4	# E : speculative target of next wh64
	stq	$17, 8($5)	# L :
	stq	$17, 16($5)	# L :
	addq	$5, 64, $7	# E : Fallback address for wh64 (== next trip addr)

	stq	$17, 24($5)	# L :
	stq	$17, 32($5)	# L :
	cmovlt	$2, $7, $4	# E : Latency 2, extra mapping cycle
	nop

	stq	$17, 40($5)	# L :
	stq	$17, 48($5)	# L :
	subq	$3, 16, $2	# E : Repeat the loop at least once more?
	nop

	stq	$17, 56($5)	# L :
	addq	$5, 64, $5	# E :
	subq	$3, 8, $3	# E :
	bge	$2, $do_wh64_w	# U :

	nop
	nop
	nop
	beq	$3, no_quad_w	# U : Might have finished already

	.align 4
	/*
	 * Simple loop for trailing quadwords, or for small amounts
	 * of data (where we can't use an unrolled loop and wh64)
	 */
loop_w:
	stq	$17,0($5)	# L :
	subq	$3,1,$3		# E : Decrement number quads left
	addq	$5,8,$5		# E : Inc address
	bne	$3,loop_w	# U : more?

no_quad_w:
	/*
	 * Write 0..7 trailing bytes.
	 */
	nop			# E :
	beq	$18,end_w	# U : All done?
	ldq	$7,0($5)	# L :
	mskqh	$7,$6,$2	# U : Mask final quad

	insqh	$17,$6,$4	# U : New bits
	bis	$2,$4,$1	# E : Put it all together
	stq	$1,0($5)	# L : And back to memory
	ret	$31,($26),1	# L0 :

within_quad_w:
	ldq_u	$1,0($16)	# L :
	insql	$17,$16,$2	# U : New bits
	mskql	$1,$16,$4	# U : Clear old
	bis	$2,$4,$2	# E : New result

	mskql	$2,$6,$4	# U :
	mskqh	$1,$6,$2	# U :
	bis	$2,$4,$1	# E :
	stq_u	$1,0($16)	# L :

end_w:
	nop
	nop
	nop
	ret	$31,($26),1	# L0 :

	.end __memset16
	EXPORT_SYMBOL(__memset16)

memset = ___memset
__memset = ___memset
EXPORT_SYMBOL(memset)
EXPORT_SYMBOL(__memset)
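
/*
 * C-level view of the entry points exported here (prototypes as assumed
 * from the way the registers are used above: $16 = destination, $17 =
 * fill value, $18 = length in bytes):
 *
 *	void *memset(void *s, int c, size_t n);		// alias of ___memset
 *	void *__memset(void *s, int c, size_t n);	// alias of ___memset
 *	void *__memset16(void *s, unsigned short c, size_t n);
 *	void *__constant_c_memset(void *s, unsigned long c64, size_t n);
 *
 * __constant_c_memset expects c64 to hold the fill byte already
 * replicated across all 64 bits; ___memset does that replication itself.
 */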