/* SPDX-License-Identifier: GPL-2.0 */
/*
 *
 * Optimized version of the copy_user() routine.
 * It is used to copy data across the kernel/user boundary.
 *
 * The source and destination are always on opposite sides of
 * the boundary. When reading from user space we must catch
 * faults on loads. When writing to user space we must catch
 * errors on stores. Note that because of the nature of the copy
 * we don't need to worry about overlapping regions.
 *
 *
 * Inputs:
 *	in0	address of destination buffer
 *	in1	address of source buffer
 *	in2	number of bytes to copy
 *
 * Outputs:
 *	ret0	0 in case of success. The number of bytes NOT copied in
 *		case of error.
 *
 * Copyright (C) 2000-2001 Hewlett-Packard Co
 *	Stephane Eranian <eranian@hpl.hp.com>
 *
 * Fixme:
 *	- handle the case where we have more than 16 bytes and the
 *	  alignments are different.
 *	- more benchmarking
 *	- fix extraneous stop bit introduced by the EX() macro.
 */
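//
// For orientation only, a rough C-level sketch of the contract implemented
// below. "copy_until_fault" and "faulted_on_a_load" are hypothetical names
// standing in for the pipelined loops and their EX() fault handlers, and
// the zero-filling on a read fault is only approximate:
//
//	unsigned long __copy_user(void *dst, const void *src, unsigned long len)
//	{
//		unsigned long done = copy_until_fault(dst, src, len);
//
//		if (done != len && faulted_on_a_load)	   /* read from user	*/
//			memset(dst + done, 0, len - done); /* zero the rest	*/
//		return len - done;			   /* 0 on success	*/
//	}
//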

#include <asm/asmmacro.h>
#include <asm/export.h>

//
// Tunable parameters
//
#define COPY_BREAK	16	// we do byte copy below (must be >=16)
#define PIPE_DEPTH	21	// pipe depth

#define EPI		p[PIPE_DEPTH-1]

//
// arguments
//
#define dst		in0
#define src		in1
#define len		in2

//
// local registers
//
#define t1		r2	// rshift in bytes
#define t2		r3	// lshift in bytes
#define rshift		r14	// right shift in bits
#define lshift		r15	// left shift in bits
#define word1		r16
#define word2		r17
#define cnt		r18
#define len2		r19
#define saved_lc	r20
#define saved_pr	r21
#define tmp		r22
#define val		r23
#define src1		r24
#define dst1		r25
#define src2		r26
#define dst2		r27
#define len1		r28
#define enddst		r29
#define endsrc		r30
#define saved_pfs	r31

GLOBAL_ENTRY(__copy_user)
	.prologue
	.save ar.pfs, saved_pfs
	alloc saved_pfs=ar.pfs,3,((2*PIPE_DEPTH+7)&~7),0,((2*PIPE_DEPTH+7)&~7)

	.rotr val1[PIPE_DEPTH],val2[PIPE_DEPTH]
	.rotp p[PIPE_DEPTH]

	adds len2=-1,len	// br.ctop is repeat/until
	mov ret0=r0

	;;			// RAW of cfm when len=0
	cmp.eq p8,p0=r0,len	// check for zero length
	.save ar.lc, saved_lc
	mov saved_lc=ar.lc	// preserve ar.lc (slow)
(p8)	br.ret.spnt.many rp	// empty memcpy()
	;;
	add enddst=dst,len	// first byte after end of destination
	add endsrc=src,len	// first byte after end of source
	.save pr, saved_pr
	mov saved_pr=pr		// preserve predicates

	.body

	mov dst1=dst		// copy because of rotation
	mov ar.ec=PIPE_DEPTH
	mov pr.rot=1<<16	// p16=true, all others are false

	mov src1=src		// copy because of rotation
	mov ar.lc=len2		// initialize lc for small count
	cmp.lt p10,p7=COPY_BREAK,len	// if len > COPY_BREAK then long copy

	xor tmp=src,dst		// prepare same-alignment test
(p10)	br.cond.dptk .long_copy_user
	;;			// RAW pr.rot/p16 ?
//
// Now we do the byte by byte loop with software pipeline
//
// p7 is necessarily false by now
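//
// (Sketch) Ignoring the pipelining, the loop below is just a byte copy:
//
//	while (len--)			/* ar.lc was set to len-1 because	*/
//		*dst1++ = *src1++;	/* br.ctop is repeat/until		*/
//
// The asm version overlaps PIPE_DEPTH iterations, so the ld1 of one
// iteration runs in parallel with the st1 of an earlier one.
//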
1:
	EX(.failure_in_pipe1,(p16) ld1 val1[0]=[src1],1)
	EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
	br.ctop.dptk.few 1b
	;;
	mov ar.lc=saved_lc
	mov pr=saved_pr,0xffffffffffff0000
	mov ar.pfs=saved_pfs	// restore ar.ec
	br.ret.sptk.many rp	// end of short memcpy

//
// Not 8-byte aligned
//
.diff_align_copy_user:
	// At this point we know we have more than 16 bytes to copy
	// and also that src and dest do _not_ have the same alignment.
	and src2=0x7,src1	// src offset
	and dst2=0x7,dst1	// dst offset
	;;
	// The basic idea is that we copy byte-by-byte at the head so
	// that we can reach 8-byte alignment for both src1 and dst1.
	// Then copy the body using a software pipelined 8-byte copy,
	// shifting the two back-to-back words right and left, then copy
	// the tail by copying byte-by-byte.
	//
	// Fault handling. If the byte-by-byte copy at the head fails on a
	// load, then restart and finish the pipeline by copying zeros
	// to dst1. Then copy zeros for the rest of dst1.
	// If the 8-byte software pipeline fails on a load, do the same as
	// failure_in3 does. If the byte-by-byte copy at the tail fails, it
	// is handled simply by failure_in_pipe1.
	//
	// In case p14 the source has more bytes left in its first word
	// (the shifted part), whereas in case p15 we also need to copy
	// some bytes from the 2nd word of the source, which holds the
	// tail of the 1st word of the destination.
	//
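	//
	// (Sketch) The three phases above, in rough C terms. "head",
	// "body8" and "tail" are just names for the byte counts of each
	// phase, and "merge" stands for the shrp step described further
	// down; none of these are real helpers:
	//
	//	copy_bytes(dst1, src1, head);	/* until dst1 is 8-byte aligned	*/
	//	for (i = 0; i < body8; i++)	/* 8 bytes per iteration	*/
	//		dst64[i] = merge(src64[i], src64[i+1], rshift);
	//	copy_bytes(dst1, src1, tail);	/* leftover bytes		*/
	//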

	//
	// Optimization. If dst1 is 8-byte aligned (quite common), we don't
	// need to copy a head to dst1 before starting the 8-byte copy
	// software pipeline. We know src1 is not 8-byte aligned in this case.
	//
	cmp.eq p14,p15=r0,dst2
(p15)	br.cond.spnt 1f
	;;
	sub t1=8,src2
	mov t2=src2
	;;
	shl rshift=t2,3
	sub len1=len,t1		// set len1
	;;
	sub lshift=64,rshift
	;;
	br.cond.spnt .word_copy_user
	;;
1:
	cmp.leu p14,p15=src2,dst2
	sub t1=dst2,src2
	;;
	.pred.rel "mutex", p14, p15
(p14)	sub word1=8,src2	// (8 - src offset)
(p15)	sub t1=r0,t1		// absolute value
(p15)	sub word1=8,dst2	// (8 - dst offset)
	;;
	// For the case p14, we don't need to copy the shifted part to
	// the 1st word of the destination.
	sub t2=8,t1
(p14)	sub word1=word1,t1
	;;
	sub len1=len,word1	// resulting len
(p15)	shl rshift=t1,3		// in bits
(p14)	shl rshift=t2,3
	;;
(p14)	sub len1=len1,t1
	adds cnt=-1,word1
	;;
	sub lshift=64,rshift
	mov ar.ec=PIPE_DEPTH
	mov pr.rot=1<<16	// p16=true, all others are false
	mov ar.lc=cnt
	;;
2:
	EX(.failure_in_pipe2,(p16) ld1 val1[0]=[src1],1)
	EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
	br.ctop.dptk.few 2b
	;;
	clrrrb
	;;
.word_copy_user:
	cmp.gtu p9,p0=16,len1
(p9)	br.cond.spnt 4f		// if (16 > len1) skip 8-byte copy
	;;
	shr.u cnt=len1,3	// number of 64-bit words
	;;
	adds cnt=-1,cnt
	;;
	.pred.rel "mutex", p14, p15
(p14)	sub src1=src1,t2
(p15)	sub src1=src1,t1
	//
	// Now both src1 and dst1 point to an 8-byte aligned address, and
	// we have more than 8 bytes to copy.
	//
	mov ar.lc=cnt
	mov ar.ec=PIPE_DEPTH
	mov pr.rot=1<<16	// p16=true, all others are false
	;;
3:
	//
	// The pipeline consists of 3 stages:
	// 1 (p16):	Load a word from src1
	// 2 (EPI_1):	Shift right pair, saving to tmp
	// 3 (EPI):	Store tmp to dst1
	//
	// To make it simple, use at least 2 (p16) loops to set up val1[n]
	// because we need 2 back-to-back val1[] to get tmp.
	// Note that this implies EPI_1 must be p18 or greater.
	//
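	//
	// (Sketch) What one body iteration computes, in C terms. "older"
	// and "newer" are just names for the two back-to-back source
	// words sitting in the rotating registers (lshift == 64 - rshift):
	//
	//	u64 older = val1[PIPE_DEPTH-1];		/* loaded first	*/
	//	u64 newer = val1[PIPE_DEPTH-2];		/* loaded next	*/
	//	tmp = (older >> rshift) | (newer << lshift);	/* shrp	*/
	//	*(u64 *)dst1 = tmp;  dst1 += 8;
	//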

#define EPI_1		p[PIPE_DEPTH-2]
#define SWITCH(pred, shift)	cmp.eq pred,p0=shift,rshift
#define CASE(pred, shift)	\
(pred)	br.cond.spnt .copy_user_bit##shift
#define BODY(rshift)						\
.copy_user_bit##rshift:						\
1:								\
	EX(.failure_out,(EPI) st8 [dst1]=tmp,8);		\
(EPI_1) shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift;	\
	EX(3f,(p16) ld8 val1[1]=[src1],8);			\
(p16)	mov val1[0]=r0;						\
	br.ctop.dptk 1b;					\
	;;							\
	br.cond.sptk.many .diff_align_do_tail;			\
2:								\
(EPI)	st8 [dst1]=tmp,8;					\
(EPI_1) shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift;	\
3:								\
(p16)	mov val1[1]=r0;						\
(p16)	mov val1[0]=r0;						\
	br.ctop.dptk 2b;					\
	;;							\
	br.cond.sptk.many .failure_in2

//
// Since the 'shrp' instruction shifts the 128-bit concatenation of two
// registers by a count that must be fixed at assembly time, we need to
// provide the 7 cases below.
//
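//
// (Sketch) In C this is simply a switch on the shift amount; each case
// runs the same BODY loop with its own immediate ("body" here just
// stands for an instance of the BODY macro above):
//
//	switch (rshift) {	/* always a multiple of 8, never 0 here	*/
//	case 8:  body(8);  break;
//	case 16: body(16); break;
//	case 24: body(24); break;
//	case 32: body(32); break;
//	case 40: body(40); break;
//	case 48: body(48); break;
//	case 56: body(56); break;
//	}
//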
	SWITCH(p6, 8)
	SWITCH(p7, 16)
	SWITCH(p8, 24)
	SWITCH(p9, 32)
	SWITCH(p10, 40)
	SWITCH(p11, 48)
	SWITCH(p12, 56)
	;;
	CASE(p6, 8)
	CASE(p7, 16)
	CASE(p8, 24)
	CASE(p9, 32)
	CASE(p10, 40)
	CASE(p11, 48)
	CASE(p12, 56)
	;;
	BODY(8)
	BODY(16)
	BODY(24)
	BODY(32)
	BODY(40)
	BODY(48)
	BODY(56)
	;;
.diff_align_do_tail:
	.pred.rel "mutex", p14, p15
(p14)	sub src1=src1,t1
(p14)	adds dst1=-8,dst1
(p15)	sub dst1=dst1,t1
	;;
4:
	// Tail correction.
	//
	// The problem with this pipelined loop is that the last word is not
	// loaded, and thus part of the last word written is not correct.
	// To fix that, we simply copy the tail byte by byte.
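	//
	// (Sketch) In C the fix-up is just a byte loop over whatever the
	// 8-byte loop could not finish (ar.lc below takes count-1 because
	// br.ctop is repeat/until):
	//
	//	size_t n = endsrc - src1;	/* bytes still to copy	*/
	//	while (n--)
	//		*dst1++ = *src1++;	/* the loop at 5: below	*/
	//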

	sub len1=endsrc,src1,1
	clrrrb
	;;
	mov ar.ec=PIPE_DEPTH
	mov pr.rot=1<<16	// p16=true, all others are false
	mov ar.lc=len1
	;;
5:
	EX(.failure_in_pipe1,(p16) ld1 val1[0]=[src1],1)
	EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
	br.ctop.dptk.few 5b
	;;
	mov ar.lc=saved_lc
	mov pr=saved_pr,0xffffffffffff0000
	mov ar.pfs=saved_pfs
	br.ret.sptk.many rp

//
// Beginning of long memcpy (i.e. > 16 bytes)
//
.long_copy_user:
	tbit.nz p6,p7=src1,0	// odd alignment
	and tmp=7,tmp
	;;
	cmp.eq p10,p8=r0,tmp
	mov len1=len		// copy because of rotation
(p8)	br.cond.dpnt .diff_align_copy_user
	;;
	// At this point we know we have more than 16 bytes to copy
	// and also that both src and dest have the same alignment,
	// which may not be the one we want. So for now we must move
	// forward slowly until we reach 16-byte alignment: no need to
	// worry about reaching the end of the buffer.
	//
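	//
	// (Sketch) The head below peels 1-, 2-, 4- and 8-byte chunks as
	// needed; each "copyN" stands for the paired ldN/stN with its own
	// EX() handler, not a real helper:
	//
	//	if (src1 & 1) copy1();	/* now 2-byte aligned	*/
	//	if (src1 & 2) copy2();	/* now 4-byte aligned	*/
	//	if (src1 & 4) copy4();	/* now 8-byte aligned	*/
	//	if (src1 & 8) copy8();	/* now 16-byte aligned	*/
	//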
	EX(.failure_in1,(p6) ld1 val1[0]=[src1],1)	// 1-byte aligned
(p6)	adds len1=-1,len1;;
	tbit.nz p7,p0=src1,1
	;;
	EX(.failure_in1,(p7) ld2 val1[1]=[src1],2)	// 2-byte aligned
(p7)	adds len1=-2,len1;;
	tbit.nz p8,p0=src1,2
	;;
	//
	// Stop bit not required after ld4 because if we fail on ld4
	// we have never executed the ld1, therefore st1 is not executed.
	//
	EX(.failure_in1,(p8) ld4 val2[0]=[src1],4)	// 4-byte aligned
	;;
	EX(.failure_out,(p6) st1 [dst1]=val1[0],1)
	tbit.nz p9,p0=src1,3
	;;
	//
	// Stop bit not required after ld8 because if we fail on ld8
	// we have never executed the ld2, therefore st2 is not executed.
	//
	EX(.failure_in1,(p9) ld8 val2[1]=[src1],8)	// 8-byte aligned
	EX(.failure_out,(p7) st2 [dst1]=val1[1],2)
(p8)	adds len1=-4,len1
	;;
	EX(.failure_out, (p8) st4 [dst1]=val2[0],4)
(p9)	adds len1=-8,len1;;
	shr.u cnt=len1,4	// number of 128-bit (2x64-bit) words
	;;
	EX(.failure_out, (p9) st8 [dst1]=val2[1],8)
	tbit.nz p6,p0=len1,3
	cmp.eq p7,p0=r0,cnt
	adds tmp=-1,cnt		// br.ctop is repeat/until
(p7)	br.cond.dpnt .dotail	// we have less than 16 bytes left
	;;
	adds src2=8,src1
	adds dst2=8,dst1
	mov ar.lc=tmp
	;;
	//
	// 16 bytes/iteration
	//
2:
	EX(.failure_in3,(p16) ld8 val1[0]=[src1],16)
(p16)	ld8 val2[0]=[src2],16

	EX(.failure_out, (EPI) st8 [dst1]=val1[PIPE_DEPTH-1],16)
(EPI)	st8 [dst2]=val2[PIPE_DEPTH-1],16
	br.ctop.dptk 2b
	;;			// RAW on src1 when we fall through from the loop
//
// Tail correction based on len only
//
// No matter where we come from (loop or test) the src1 pointer
// is 16-byte aligned AND we have less than 16 bytes to copy.
//
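//
// (Sketch) The low four bits of len1 say exactly which chunk sizes are
// still needed, largest first ("copyN" again stands for the paired
// ldN/stN below):
//
//	if (len1 & 8) copy8();
//	if (len1 & 4) copy4();
//	if (len1 & 2) copy2();
//	if (len1 & 1) copy1();
//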
.dotail:
	EX(.failure_in1,(p6) ld8 val1[0]=[src1],8)	// at least 8 bytes
	tbit.nz p7,p0=len1,2
	;;
	EX(.failure_in1,(p7) ld4 val1[1]=[src1],4)	// at least 4 bytes
	tbit.nz p8,p0=len1,1
	;;
	EX(.failure_in1,(p8) ld2 val2[0]=[src1],2)	// at least 2 bytes
	tbit.nz p9,p0=len1,0
	;;
	EX(.failure_out, (p6) st8 [dst1]=val1[0],8)
	;;
	EX(.failure_in1,(p9) ld1 val2[1]=[src1])	// only 1 byte left
	mov ar.lc=saved_lc
	;;
	EX(.failure_out,(p7) st4 [dst1]=val1[1],4)
	mov pr=saved_pr,0xffffffffffff0000
	;;
	EX(.failure_out, (p8) st2 [dst1]=val2[0],2)
	mov ar.pfs=saved_pfs
	;;
	EX(.failure_out, (p9) st1 [dst1]=val2[1])
	br.ret.sptk.many rp


//
// Here we handle the case where the byte by byte copy fails
// on the load.
// Several factors make the zeroing of the rest of the buffer kind of
// tricky:
// - the pipeline: loads and stores are not in sync
//
//   In the same loop iteration, the dst1 pointer does not directly
//   reflect where the faulty load was.
//
// - pipeline effect
//   When you get a fault on a load, you may have valid data from
//   previous loads still in transit, not yet stored. Such data must be
//   stored normally before moving on to zeroing the rest.
//
// - single/multi dispersal independence.
//
// solution:
// - we don't disrupt the pipeline, i.e. data in transit in
//   the software pipeline will eventually be moved to memory.
//   We simply replace the load with a simple mov and keep the
//   pipeline going. We can't really do this inline because
//   p16 is always reset to 1 when lc > 0.
//
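//
// (Sketch) The net effect of this handler: report what was never loaded,
// let the draining pipeline store the data still in flight, and store
// zeros for everything after it. "in_flight"/"in_flight_byte" are only
// placeholders for the values still travelling through the rotating
// registers:
//
//	ret0 = endsrc - src1;		/* bytes NOT copied		*/
//	for (; dst1 < enddst; dst1++)
//		*dst1 = in_flight(dst1) ? in_flight_byte(dst1) : 0;
//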
.failure_in_pipe1:
	sub ret0=endsrc,src1	// number of bytes to zero, i.e. not copied
1:
(p16)	mov val1[0]=r0
(EPI)	st1 [dst1]=val1[PIPE_DEPTH-1],1
	br.ctop.dptk 1b
	;;
	mov pr=saved_pr,0xffffffffffff0000
	mov ar.lc=saved_lc
	mov ar.pfs=saved_pfs
	br.ret.sptk.many rp

//
// This is the case where the byte by byte copy fails on the load
// when we copy the head. We need to finish the pipeline and copy
// zeros for the rest of the destination. Since this happens
// at the top, we still need to fill the body and tail.
.failure_in_pipe2:
	sub ret0=endsrc,src1	// number of bytes to zero, i.e. not copied
2:
(p16)	mov val1[0]=r0
(EPI)	st1 [dst1]=val1[PIPE_DEPTH-1],1
	br.ctop.dptk 2b
	;;
	sub len=enddst,dst1,1	// precompute len
	br.cond.dptk.many .failure_in1bis
	;;

//
// Here we handle the head & tail part when we check for alignment.
// The following code handles only the load failures. The
// main difficulty comes from the fact that loads/stores are
// scheduled. So when you fail on a load, the stores corresponding
// to previous successful loads must be executed.
//
// However, some simplifications are possible given the way
// things work.
//
// 1) HEAD
// Theory of operation:
//
//	 Page A   | Page B
//	----------|-----
//	         1|8 x
//	       1 2|8 x
//	         4|8 x
//	       1 4|8 x
//	       2 4|8 x
//	     1 2 4|8 x
//	          |1
//	          |2 x
//	          |4 x
//
//	page_size >= 4k (2^12).  (x means 4, 2, 1)
//	Here we suppose Page A exists and Page B does not.
//
// As we move towards eight byte alignment we may encounter faults.
// The numbers on each page show the size of the load (current alignment).
//
// Key point:
//	- if you fail on 1, 2, 4 then you have never executed any smaller
//	  size loads, e.g. failing ld4 means no ld1 nor ld2 executed
//	  before.
//
// This allows us to simplify the cleanup code, because basically you
// only have to worry about "pending" stores in the case of a failing
// ld8(). Given the way the code is written today, this means only
// worrying about st2 and st4. There we can use the information
// encapsulated in the predicates.
//
// Other key point:
//	- if you fail on the ld8 in the head, it means you went straight
//	  to it, i.e. you were 8-byte aligned to begin with, inside a
//	  nonexistent page.
// Again this comes from the fact that if you crossed just for the ld8 then
// you are 8-byte aligned but also 16-byte aligned, therefore you would
// either go for the 16-byte copy loop OR the ld8 in the tail part.
// The combination ld1, ld2, ld4, ld8 where you fail on ld8 is impossible,
// because it would mean you had 15 bytes to copy, in which case you
// would have defaulted to the byte by byte copy.
//
//
// 2) TAIL
// Here we know we have less than 16 bytes AND we are either 8- or 16-byte
// aligned.
//
// Key point:
// This means that we either:
//	- are right on a page boundary
//	OR
//	- are at more than 16 bytes from a page boundary with
//	  at most 15 bytes to copy: no chance of crossing.
//
// This allows us to assume that if we fail on a load we haven't possibly
// executed any of the previous (tail) ones, so we don't need to do
// any stores. For instance, if we fail on ld2, this means we had
// 2 or 3 bytes left to copy and we did not execute the ld8 nor ld4.
//
// This means that we are in a situation similar to a fault in the
// head part. That's nice!
//
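//
// (Sketch) What .failure_in1 / .failure_in1bis amount to in C terms.
// Note that both the return value and the zeroed length are computed
// from the source side:
//
//	ret0 = endsrc - src1;		/* bytes NOT copied		*/
//	memset(dst1, 0, endsrc - src1);	/* the byte loop at 5: below	*/
//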
.failure_in1:
	sub ret0=endsrc,src1	// number of bytes to zero, i.e. not copied
	sub len=endsrc,src1,1
	//
	// we know that ret0 can never be zero at this point
	// because we failed while trying to do a load, i.e. there is still
	// some work to do.
	// The failure_in1bis and length problem is taken care of at the
	// calling side.
	//
	;;
.failure_in1bis:		// from (.failure_in3)
	mov ar.lc=len		// Continue with a stupid byte store.
	;;
5:
	st1 [dst1]=r0,1
	br.cloop.dptk 5b
	;;
	mov pr=saved_pr,0xffffffffffff0000
	mov ar.lc=saved_lc
	mov ar.pfs=saved_pfs
	br.ret.sptk.many rp

//
// Here we simply restart the loop, but instead
// of doing loads we fill the pipeline with zeroes.
// We can't simply store r0 because we may have valid
// data in transit in the pipeline.
// ar.lc and ar.ec are set up correctly at this point.
//
// we MUST use src1/endsrc here and not dst1/enddst because
// of the pipeline effect.
//
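//
// (Sketch) C model of this handler: the 16-bytes-per-iteration stores are
// finished with the data still in flight, zeros are fed for the rest, and
// any destination bytes the drained loop did not reach are zeroed byte by
// byte:
//
//	ret0 = endsrc - src1;			/* bytes NOT copied	*/
//	/* loop 2: below drains the pipeline, feeding zeros */
//	if (dst1 != enddst)
//		memset(dst1, 0, enddst - dst1);	/* via .failure_in1bis	*/
//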
.failure_in3:
	sub ret0=endsrc,src1	// number of bytes to zero, i.e. not copied
	;;
2:
(p16)	mov val1[0]=r0
(p16)	mov val2[0]=r0
(EPI)	st8 [dst1]=val1[PIPE_DEPTH-1],16
(EPI)	st8 [dst2]=val2[PIPE_DEPTH-1],16
	br.ctop.dptk 2b
	;;
	cmp.ne p6,p0=dst1,enddst	// Do we need to finish the tail ?
	sub len=enddst,dst1,1		// precompute len
(p6)	br.cond.dptk .failure_in1bis
	;;
	mov pr=saved_pr,0xffffffffffff0000
	mov ar.lc=saved_lc
	mov ar.pfs=saved_pfs
	br.ret.sptk.many rp

.failure_in2:
	sub ret0=endsrc,src1
	cmp.ne p6,p0=dst1,enddst	// Do we need to finish the tail ?
	sub len=enddst,dst1,1		// precompute len
(p6)	br.cond.dptk .failure_in1bis
	;;
	mov pr=saved_pr,0xffffffffffff0000
	mov ar.lc=saved_lc
	mov ar.pfs=saved_pfs
	br.ret.sptk.many rp

//
// handling of failures on stores: that's the easy part
//
.failure_out:
	sub ret0=enddst,dst1
	mov pr=saved_pr,0xffffffffffff0000
	mov ar.lc=saved_lc

	mov ar.pfs=saved_pfs
	br.ret.sptk.many rp
END(__copy_user)
EXPORT_SYMBOL(__copy_user)