/* SPDX-License-Identifier: GPL-2.0 */
/*
 *
 * Optimized version of the standard strlen() function
 *
 *
 * Inputs:
 *	in0	address of string
 *
 * Outputs:
 *	ret0	the number of characters in the string (0 if empty string)
 *	does not count the \0
 *
 * Copyright (C) 1999, 2001 Hewlett-Packard Co
 *	Stephane Eranian <eranian@hpl.hp.com>
 *
 * 09/24/99 S.Eranian add speculation recovery code
 */

#include <asm/asmmacro.h>
#include <asm/export.h>

//
//
// This is an enhanced version of the basic strlen. It includes a combination
// of compute zero index (czx), parallel comparisons, speculative loads and
// loop unrolling using rotating registers.
//
// General Ideas about the algorithm:
//	The goal is to look at the string in chunks of 8 bytes.
//	So we need to do a few extra checks at the beginning because the
//	string may not be 8-byte aligned. In this case we load the 8-byte
//	quantity which includes the start of the string and mask the unused
//	bytes with 0xff to avoid confusing czx.
//	We use speculative loads and software pipelining to hide memory
//	latency and do read-ahead safely. This way we defer any exception.
//
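// As an illustrative aid only (not part of the build), the chunked scan can
// be sketched in C roughly as follows. strlen_by_words() is a hypothetical
// helper name, and the sketch assumes a little-endian layout, which is what
// the czx1.r/masking logic below relies on:
//
//	#include <stddef.h>
//	#include <stdint.h>
//
//	static size_t strlen_by_words(const char *s)
//	{
//		uintptr_t addr = (uintptr_t)s;
//		const uint64_t *p = (const uint64_t *)(addr & ~(uintptr_t)7);
//		unsigned int skip = addr & 7;	/* bytes before the string start */
//		uint64_t w = *p++;
//
//		/* force the unused leading bytes to 0xff, like "or v[1]=v[1],mask" */
//		if (skip)
//			w |= (1ULL << (skip * 8)) - 1;
//
//		for (;;) {
//			unsigned int i;
//
//			/* emulate czx1.r: index of first zero byte, 8 if none */
//			for (i = 0; i < 8; i++)
//				if (((w >> (i * 8)) & 0xff) == 0)
//					return (const char *)(p - 1) + i - s;
//			w = *p++;
//		}
//	}
//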
// Because we don't want the kernel to be relying on particular
// settings of the DCR register, we provide recovery code in case
// speculation fails. The recovery code is going to "redo" the work using
// only normal loads. If we still get a fault then we generate a
// kernel panic. Otherwise we return the strlen as usual.
//
// Speculation may fail, for instance, because the DCR.dm bit is set.
// In this case TLB misses are deferred, i.e., a NaT bit will be set if
// the translation is not present. The normal load, on the other hand,
// will cause the translation to be inserted if the mapping exists.
//
// It should be noted that we execute recovery code only when we need
// to use the data that has been speculatively loaded: we don't execute
// recovery code on pure read-ahead data.
//
// Remarks:
//	- the cmp r0,r0 is used as a fast way to initialize a predicate
//	  register to 1. This is required to make sure that the parallel
//	  compare comes out right.
//
//	- we don't use the epilogue counter to exit the loop, but we need to
//	  set it to zero beforehand.
//
//	- after the loop we must test for NaT values because neither the
//	  czx nor the cmp instruction raises a NaT consumption fault. We must
//	  be careful not to look for a NaT we don't care about: for instance,
//	  we don't need to look at a NaT in val2 if the zero byte was already
//	  in val1. (A C-style sketch of this exit/NaT-check logic follows
//	  these remarks.)
//
//	- Clearly performance tuning is required.
//
//
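// As a non-authoritative sketch of the loop exit and post-loop checks: NaT
// bits have no direct C equivalent, so the hypothetical val1_valid/val2_valid
// flags below stand in for "the speculative load did not defer", and the
// names classify/USE_VAL1/USE_VAL2/RECOVER are made up for illustration.
//
//	enum after_loop { USE_VAL1, USE_VAL2, RECOVER };
//
//	static enum after_loop classify(unsigned int val1,
//					int val1_valid, int val2_valid)
//	{
//		if (!val1_valid)
//			return RECOVER;		/* val1 is NaT: redo the work */
//		if (val1 != 8)
//			return USE_VAL1;	/* zero byte is in the first word */
//		if (!val2_valid)
//			return RECOVER;		/* we need val2 but it is NaT */
//		return USE_VAL2;		/* zero byte is in the second word */
//	}
//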
#define saved_pfs	r11
#define tmp		r10
#define base		r16
#define orig		r17
#define saved_pr	r18
#define src		r19
#define mask		r20
#define val		r21
#define val1		r22
#define val2		r23

GLOBAL_ENTRY(strlen)
	.prologue
	.save ar.pfs, saved_pfs
	alloc saved_pfs=ar.pfs,11,0,0,8	// rotating must be multiple of 8

	.rotr v[2], w[2]	// declares our 4 aliases

	extr.u tmp=in0,0,3	// tmp=least significant 3 bits
	mov orig=in0		// keep track of initial byte address
	dep src=0,in0,0,3	// src=8-byte-aligned in0 address
	.save pr, saved_pr
	mov saved_pr=pr		// preserve predicates (rotation)
	;;

	.body

	ld8 v[1]=[src],8	// must not speculate: can fail here
	shl tmp=tmp,3		// multiply by 8 bits/byte
	mov mask=-1		// our mask
	;;
	ld8.s w[1]=[src],8	// speculatively load next
	cmp.eq p6,p0=r0,r0	// sets p6 to true for cmp.and
	sub tmp=64,tmp		// how many bits to shift our mask to the right
	;;
	shr.u mask=mask,tmp	// keep 1s only over the unused leading bytes of v[1]
	mov ar.ec=r0		// clear epilogue counter (saved in ar.pfs)
	;;
	add base=-16,src	// keep track of aligned base
	or v[1]=v[1],mask	// now we have a safe initial byte pattern
	;;
1:
	ld8.s v[0]=[src],8	// speculatively load next
	czx1.r val1=v[1]	// search 0 byte from right
	czx1.r val2=w[1]	// search 0 byte from right in following 8 bytes
	;;
	ld8.s w[0]=[src],8	// speculatively load next to next
	cmp.eq.and p6,p0=8,val1	// p6 = p6 and val1==8
	cmp.eq.and p6,p0=8,val2	// p6 = p6 and val2==8
(p6)	br.wtop.dptk 1b		// loop until p6 == 0
	;;
	//
	// We must try the recovery code iff
	// val1_is_nat || (val1==8 && val2_is_nat)
	//
	// XXX Fixme
	//	- there must be a better way of doing the test
	//
	cmp.eq  p8,p9=8,val1	// p8 == 1 iff val1==8, i.e., the zero byte was in val2 (disambiguate)
	tnat.nz p6,p7=val1	// test NaT on val1
(p6)	br.cond.spnt .recover	// jump to recovery if val1 is NaT
	;;
	//
	// if we come here, p7 is true, i.e., initialized for the parallel compare
	//
	cmp.eq.and  p7,p0=8,val1	// val1==8?
	tnat.nz.and p7,p0=val2		// test NaT on val2
(p7)	br.cond.spnt .recover		// jump to recovery if val2 is NaT
	;;
(p8)	mov val1=val2			// the other test got us out of the loop
(p8)	adds src=-16,src		// correct position when 3 ahead
(p9)	adds src=-24,src		// correct position when 4 ahead
	;;
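	//
	// Worked example (not from the original comments, just the arithmetic):
	// for an 8-byte-aligned "abc", the zero byte is at index val1=3 of the
	// first word, so p9 corrects src from orig+32 back to orig+8; then
	// ret0 = (orig+8) - orig = 8, tmp = 8 - 3 = 5, and ret0 - tmp = 3,
	// which is indeed strlen("abc").
	//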
	sub ret0=src,orig	// distance from the original byte address
	sub tmp=8,val1		// which byte in word
	mov pr=saved_pr,0xffffffffffff0000
	;;
	sub ret0=ret0,tmp	// adjust: ret0 = offset of the zero byte = length
	mov ar.pfs=saved_pfs	// because of ar.ec, restore no matter what
	br.ret.sptk.many rp	// end of normal execution

	//
	// Outlined recovery code when speculation failed
	//
	// This time we don't use speculation and rely on the normal exception
	// mechanism. That's why this loop is not as good as the previous one:
	// read-ahead is not possible.
	//
	// IMPORTANT:
	// Please note that in the case of strlen() as opposed to strlen_user()
	// we don't use the exception mechanism, as this function is not
	// supposed to fail. If it does fail it means we have a bug and the
	// code will cause a kernel fault.
	//
	// XXX Fixme
	//	- today we restart from the beginning of the string instead
	//	  of trying to continue where we left off.
	//
.recover:
	ld8 val=[base],8	// will fail if unrecoverable fault
	;;
	or val=val,mask		// remask first bytes
	cmp.eq p0,p6=r0,r0	// nullify first ld8 in loop
	;;
	//
	// ar.ec is still zero here
	//
2:
(p6)	ld8 val=[base],8	// will fail if unrecoverable fault
	;;
	czx1.r val1=val		// search 0 byte from right
	;;
	cmp.eq p6,p0=8,val1	// val1==8 ?
(p6)	br.wtop.dptk 2b		// loop until p6 == 0
	;;			// (avoid WAW on p63)
	sub ret0=base,orig	// distance from the original byte address
	sub tmp=8,val1
	mov pr=saved_pr,0xffffffffffff0000
	;;
	sub ret0=ret0,tmp	// adjust: ret0 = offset of the zero byte = length
	mov ar.pfs=saved_pfs	// because of ar.ec, restore no matter what
	br.ret.sptk.many rp	// end of successful recovery code
END(strlen)
EXPORT_SYMBOL(strlen)