/* SPDX-License-Identifier: GPL-2.0 */
/*
 * This routine clears to zero a linear memory buffer in user space.
 *
 * Inputs:
 *	in0:	address of buffer
 *	in1:	length of buffer in bytes
 * Outputs:
 *	r8:	number of bytes that didn't get cleared due to a fault
 *
 * Copyright (C) 1998, 1999, 2001 Hewlett-Packard Co
 *	Stephane Eranian <eranian@hpl.hp.com>
 */
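//
// Illustrative only (not part of the build): a C model of the contract
// stated above, with a hypothetical fault() predicate standing in for a
// page fault on the user buffer. clear_user_model and fault are invented
// names for this sketch:
//
//	unsigned long clear_user_model(char *buf, unsigned long len)
//	{
//		while (len) {
//			if (fault(buf))
//				break;		/* r8 <- bytes not cleared */
//			*buf++ = 0;
//			len--;
//		}
//		return len;			/* 0 on success */
//	}
//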

#include <asm/asmmacro.h>
#include <asm/export.h>

//
// arguments
//
#define buf		r32
#define len		r33

//
// local registers
//
#define cnt		r16
#define buf2		r17
#define saved_lc	r18
#define saved_pfs	r19
#define tmp		r20
#define len2		r21
#define len3		r22

//
// Theory of operations:
//	- We check whether or not the buffer is small, i.e., fewer than 17
//	  bytes, in which case we use the byte-by-byte loop.
//
//	- Otherwise we go progressively from a 1-byte store to an 8-byte store
//	  in the head part, the body is a 16-byte store loop, and we finish
//	  with the tail for the last (up to 15) bytes.
//	  The advantage of this breakdown is that the long-buffer path
//	  contains only 2 branches.
//
//	The reason for not using shifting & masking for both the head and the
//	tail is to stay semantically correct. This routine is not supposed
//	to write bytes outside of the buffer. While most of the time this would
//	be ok, we can't tolerate a mistake. A classical example is the case
//	of multithreaded code where the extra bytes touched are actually owned
//	by another thread running concurrently with ours. Another, less likely,
//	example is device drivers, where reading an I/O-mapped location may
//	have side effects (same thing for writing).
//
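//
// Illustrative only (not part of the build): a C sketch of the same
// head/body/tail decomposition, assuming a fault-free buffer (the real
// code below additionally threads every store through the exception
// table via the EX() macro). clear_sketch is an invented name:
//
//	static void clear_sketch(char *p, unsigned long len)
//	{
//		if (len <= 16) {		/* short case: byte-by-byte loop */
//			while (len--)
//				*p++ = 0;
//			return;
//		}
//		/* head: align p to 16 bytes, one store size at a time */
//		if ((unsigned long)p & 1) { *p = 0;          p += 1; len -= 1; }
//		if ((unsigned long)p & 2) { *(short *)p = 0; p += 2; len -= 2; }
//		if ((unsigned long)p & 4) { *(int *)p = 0;   p += 4; len -= 4; }
//		if ((unsigned long)p & 8) { *(long *)p = 0;  p += 8; len -= 8; }
//		/* body: 16 bytes per iteration through two pointers */
//		while (len >= 16) {
//			*(long *)p = 0;
//			*(long *)(p + 8) = 0;
//			p += 16; len -= 16;
//		}
//		/* tail: up to 15 bytes, one store per set bit of len */
//		if (len & 8) { *(long *)p = 0;  p += 8; }
//		if (len & 4) { *(int *)p = 0;   p += 4; }
//		if (len & 2) { *(short *)p = 0; p += 2; }
//		if (len & 1) { *p = 0; }
//	}
//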

GLOBAL_ENTRY(__do_clear_user)
	.prologue
	.save ar.pfs, saved_pfs
	alloc	saved_pfs=ar.pfs,2,0,0,0
	cmp.eq p6,p0=r0,len		// check for zero length
	.save ar.lc, saved_lc
	mov saved_lc=ar.lc		// preserve ar.lc (slow)
	.body
	;;				// avoid WAW on CFM
	adds tmp=-1,len			// br.cloop is repeat/until
	mov ret0=len			// return value is length at this point
(p6)	br.ret.spnt.many rp
	;;
	cmp.lt p6,p0=16,len		// if len > 16 then long memset
	mov ar.lc=tmp			// initialize lc for small count
(p6)	br.cond.dptk .long_do_clear
	;;				// WAR on ar.lc
	//
	// worst case 16 iterations, avg 8 iterations
	//
	// We could have played with the predicates to use the extra
	// M slot for 2 stores/iteration, but the cost of initializing
	// the various counters, compared to how long the loop is expected
	// to run on average, does not make this solution viable.
	//
1:
	EX( .Lexit1, st1 [buf]=r0,1 )
	adds len=-1,len			// countdown length using len
	br.cloop.dptk 1b
	;;				// avoid RAW on ar.lc
	//
	// .Lexit1: comes from the byte-by-byte loop
	//	    len contains bytes left
.Lexit1:
	mov ret0=len			// faster than using ar.lc
	mov ar.lc=saved_lc
	br.ret.sptk.many rp		// end of short clear_user


//
// At this point we know we have more than 16 bytes to clear,
// so we focus on alignment (no branches required).
//
// We count down the number of bytes left in len/len2 rather than in
// ret0 because the exception code changes the value of r8 (ret0).
//
.long_do_clear:
	tbit.nz p6,p0=buf,0		// odd alignment (for long_do_clear)
	;;
	EX( .Lexit3, (p6) st1 [buf]=r0,1 )	// 1-byte aligned
(p6)	adds len=-1,len;;		// sync because buf is modified
	tbit.nz p6,p0=buf,1
	;;
	EX( .Lexit3, (p6) st2 [buf]=r0,2 )	// 2-byte aligned
(p6)	adds len=-2,len;;
	tbit.nz p6,p0=buf,2
	;;
	EX( .Lexit3, (p6) st4 [buf]=r0,4 )	// 4-byte aligned
(p6)	adds len=-4,len;;
	tbit.nz p6,p0=buf,3
	;;
	EX( .Lexit3, (p6) st8 [buf]=r0,8 )	// 8-byte aligned
(p6)	adds len=-8,len;;
	shr.u cnt=len,4			// number of 128-bit (2x64bit) words
	;;
	cmp.eq p6,p0=r0,cnt
	adds tmp=-1,cnt
(p6)	br.cond.dpnt .dotail		// we have less than 16 bytes left
	;;
	adds buf2=8,buf			// setup second base pointer
	mov ar.lc=tmp
	;;

	//
	// 16 bytes/iteration core loop
	//
	// The second store can never generate a fault because
	// we come into the loop only when we are 16-byte aligned.
	// This means that if we cross a page then it will always be
	// in the first store and never in the second.
	//
	//
	// We need to keep track of the remaining length. A possible (optimistic)
	// way would be to use ar.lc and derive how many bytes were left by
	// doing: left = 16*ar.lc + 16. This would avoid the addition at
	// every iteration.
	// However, we need to keep the synchronization point. An M;;MB template
	// does not exist, so we can keep the addition at no extra cycle cost
	// (it uses a nop slot anyway). It also simplifies the (unlikely)
	// error recovery code.
	//
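	//
	// Illustrative only: the per-iteration bookkeeping the loop relies on,
	// in C form (store8 is an invented helper). len is updated after the
	// first store's synchronization point, so if that EX-protected store
	// faults, len still counts the untouched 16 bytes of this iteration:
	//
	//	for (i = 0; i < cnt; i++) {
	//		store8(buf);		/* may fault -> .Lexit3, len intact */
	//		store8(buf2);		/* cannot fault: same aligned 16B chunk */
	//		buf += 16; buf2 += 16;
	//		len -= 16;		/* kept in a spare slot, costs nothing */
	//	}
	//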

2:	EX(.Lexit3, st8 [buf]=r0,16 )
	;;				// needed to get len correct on error
	st8 [buf2]=r0,16
	adds len=-16,len
	br.cloop.dptk 2b
	;;
	mov ar.lc=saved_lc
	//
	// Tail correction based on len only.
	//
	// We alternate the use of len3/len2 to allow parallelism and correct
	// error handling. We also reuse p6/p7 to return the correct value.
	// The len2/len3 updates do not cost anything more compared to a
	// regular memset, as we had empty slots anyway.
	//
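	//
	// Illustrative only: the len2/len3 alternation below, in C form
	// (store8/store4/store2/store1 are invented helpers). Each store and
	// its length update use different registers, so when a store faults
	// the register selected by the live predicate (p6 or p7) at .Lexit2
	// still holds the count from before that store:
	//
	//	len2 = len3 = len;
	//	if (len & 8) { store8(buf); buf += 8; len3 = len2 - 8; }	/* p6 */
	//	if (len & 4) { store4(buf); buf += 4; len2 = len3 - 4; }	/* p7 */
	//	if (len & 2) { store2(buf); buf += 2; len3 = len2 - 2; }	/* p6 */
	//	if (len & 1) { store1(buf); }					/* p7 */
	//	return 0;
	//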
.dotail:
	mov len2=len			// for parallelization of error handling
	mov len3=len
	tbit.nz p6,p0=len,3
	;;
	EX( .Lexit2, (p6) st8 [buf]=r0,8 )	// at least 8 bytes
(p6)	adds len3=-8,len2
	tbit.nz p7,p6=len,2
	;;
	EX( .Lexit2, (p7) st4 [buf]=r0,4 )	// at least 4 bytes
(p7)	adds len2=-4,len3
	tbit.nz p6,p7=len,1
	;;
	EX( .Lexit2, (p6) st2 [buf]=r0,2 )	// at least 2 bytes
(p6)	adds len3=-2,len2
	tbit.nz p7,p6=len,0
	;;
	EX( .Lexit2, (p7) st1 [buf]=r0 )	// only 1 byte left
	mov ret0=r0			// success
	br.ret.sptk.many rp		// end of most likely path

	//
	// Outlined error handling code
	//

	//
	// .Lexit3: comes from the core loop; needs pr/lc restored
	//	    len contains bytes left
	//
	//
	// .Lexit2:
	//	if p6 -> coming from st8 or st2: len2 contains what's left
	//	if p7 -> coming from st4 or st1: len3 contains what's left
	// We must restore lc/pr even though they might not have been used.
.Lexit2:
	.pred.rel "mutex", p6, p7
(p6)	mov len=len2
(p7)	mov len=len3
	;;
	//
	// .Lexit3: also comes from the alignment head
	//	    len contains bytes left
	//
.Lexit3:
	mov ret0=len
	mov ar.lc=saved_lc
	br.ret.sptk.many rp
END(__do_clear_user)
EXPORT_SYMBOL(__do_clear_user)