^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1) /* SPDX-License-Identifier: GPL-2.0 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3) * arch/alpha/lib/ev67-strrchr.S
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4) * 21264 version by Rick Gorton <rick.gorton@alpha-processor.com>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6) * Finds the last occurrence of a character in a 0-terminated
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7) * string. Optimized for the Alpha architecture:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9) * - memory accessed as aligned quadwords only
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 10) * - uses cmpbge to compare 8 bytes in parallel
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 11) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 12) * Much of the information about 21264 scheduling/coding comes from:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 13) * Compiler Writer's Guide for the Alpha 21264
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 14) * abbreviated as 'CWG' in other comments here
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 15) * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 16) * Scheduling notation:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 17) * E - either cluster
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 18) * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 19) * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 20) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 21)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 22) #include <asm/export.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 23) #include <asm/regdef.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 24)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 25) .set noreorder
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 26) .set noat
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 27)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 28) .align 4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 29) .ent strrchr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 30) .globl strrchr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 31) strrchr:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 32) .frame sp, 0, ra
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 33) .prologue 0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 34)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 35) and a1, 0xff, t2 # E : 00000000000000ch
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 36) insbl a1, 1, t4 # U : 000000000000ch00
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 37) insbl a1, 2, t5 # U : 0000000000ch0000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 38) ldq_u t0, 0(a0) # L : load aligned quadword holding the first byte, Latency=3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 39)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 40) mov zero, t6 # E : t6 is last match aligned addr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 41) or t2, t4, a1 # E : 000000000000chch
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 42) sll t5, 8, t3 # U : 00000000ch000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 43) mov zero, t8 # E : t8 is last match byte compare mask
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 44)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 45) andnot a0, 7, v0 # E : v0 = source addr rounded down to a quadword
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 46) or t5, t3, t3 # E : 00000000chch0000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 47) sll a1, 32, t2 # U : 0000chch00000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 48) sll a1, 48, t4 # U : chch000000000000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 49)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 50) or t4, a1, a1 # E : chch00000000chch
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 51) or t2, t3, t2 # E : 0000chchchch0000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 52) or a1, t2, a1 # E : chchchchchchchch
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 53) lda t5, -1 # E : t5 = all ones, start of garbage mask
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 54)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 55) cmpbge zero, t0, t1 # E : bits set iff byte == zero
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 56) mskqh t5, a0, t4 # E : zero the bytes that precede the string start
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 57) xor t0, a1, t2 # E : make bytes == c zero
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 58) cmpbge zero, t4, t4 # E : bits set iff byte is garbage
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 59)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 60) cmpbge zero, t2, t3 # E : bits set iff byte == c
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 61) andnot t1, t4, t1 # E : clear garbage from null test
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 62) andnot t3, t4, t3 # E : clear garbage from char test
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 63) bne t1, $eos # U : did we already hit the terminator?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 64)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 65) /* Character search main loop: one aligned quadword per iteration */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 66) $loop:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 67) ldq t0, 8(v0) # L : load next quadword
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 68) cmovne t3, v0, t6 # E : t6 = addr of previous quadword, if it had a match
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 69) nop # : Latency=2, extra map slot (keep nop with cmov)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 70) nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 71)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 72) cmovne t3, t3, t8 # E : t8 = its match mask; Latency=2, extra map slot
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 73) nop # : keep with cmovne
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 74) addq v0, 8, v0 # E : v0 = addr of the quadword just loaded
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 75) xor t0, a1, t2 # E : make bytes == c zero
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 76)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 77) cmpbge zero, t0, t1 # E : bits set iff byte == zero
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 78) cmpbge zero, t2, t3 # E : bits set iff byte == c
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 79) beq t1, $loop # U : if we have not seen a null, loop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 80) nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 81)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 82) /* Mask out character matches after terminator */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 83) $eos:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 84) negq t1, t4 # E : isolate first null byte match
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 85) and t1, t4, t4 # E : t4 = lowest set bit of the null mask
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 86) subq t4, 1, t5 # E : build a mask of the bytes up to...
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 87) or t4, t5, t4 # E : ... and including the null
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 88)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 89) and t3, t4, t3 # E : mask out char matches after null
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 90) cmovne t3, t3, t8 # E : save it, if match found Latency=2, extra map slot
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 91) nop # : Keep with cmovne
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 92) nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 93)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 94) cmovne t3, v0, t6 # E : save the quadword addr too, if match found
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 95) nop # : Keep with cmovne
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 96) /* Locate the address of the last matched character */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 97) ctlz t8, t2 # U0 : Latency=3 (0x40 for t8=0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 98) nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 99)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 100) cmoveq t8, 0x3f, t2 # E : Compensate for case when no match is seen
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 101) nop # E : hide the cmov latency (2) behind ctlz latency
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 102) lda t5, 0x3f($31) # E : t5 = 63
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 103) subq t5, t2, t5 # E : t5 = byte index of last match (0 if none)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 104)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 105) addq t6, t5, v0 # E : v0 = addr of last match, or 0 (NULL) if none
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 106) ret # L0 : Latency=3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 107) nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 108) nop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 109)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 110) .end strrchr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 111) EXPORT_SYMBOL(strrchr)