^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1) /* SPDX-License-Identifier: GPL-2.0 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3) * Itanium 2-optimized version of the memcpy and copy_user functions
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5) * Inputs:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6) * in0: destination address
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7) * in1: source address
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8) * in2: number of bytes to copy
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9) * Output:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 10) * for memcpy: return dest
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 11) * for copy_user: return 0 if success,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 12) * or the number of bytes NOT copied if an error occurred.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 13) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 14) * Copyright (C) 2002 Intel Corp.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 15) * Copyright (C) 2002 Ken Chen <kenneth.w.chen@intel.com>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 16) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 17) #include <asm/asmmacro.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 18) #include <asm/page.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 19) #include <asm/export.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 20)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 21) #define EK(y...) EX(y)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 22)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 23) /* McKinley specific optimization */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 24)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 25) #define retval r8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 26) #define saved_pfs r31
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 27) #define saved_lc r10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 28) #define saved_pr r11
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 29) #define saved_in0 r14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 30) #define saved_in1 r15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 31) #define saved_in2 r16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 32)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 33) #define src0 r2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 34) #define src1 r3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 35) #define dst0 r17
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 36) #define dst1 r18
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 37) #define cnt r9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 38)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 39) /* r19-r30 are temp for each code section */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 40) #define PREFETCH_DIST 8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 41) #define src_pre_mem r19
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 42) #define dst_pre_mem r20
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 43) #define src_pre_l2 r21
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 44) #define dst_pre_l2 r22
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 45) #define t1 r23
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 46) #define t2 r24
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 47) #define t3 r25
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 48) #define t4 r26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 49) #define t5 t1 // alias!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 50) #define t6 t2 // alias!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 51) #define t7 t3 // alias!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 52) #define n8 r27
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 53) #define t9 t5 // alias!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 54) #define t10 t4 // alias!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 55) #define t11 t7 // alias!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 56) #define t12 t6 // alias!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 57) #define t14 t10 // alias!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 58) #define t13 r28
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 59) #define t15 r29
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 60) #define tmp r30
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 61)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 62) /* defines for long_copy block */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 63) #define A 0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 64) #define B (PREFETCH_DIST)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 65) #define C (B + PREFETCH_DIST)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 66) #define D (C + 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 67) #define N (D + 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 68) #define Nrot ((N + 7) & ~7)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 69)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 70) /* alias */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 71) #define in0 r32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 72) #define in1 r33
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 73) #define in2 r34
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 74)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 75) GLOBAL_ENTRY(memcpy)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 76) 	and	r28=0x7,in0	// r28 = dest & 7 (dest misalignment, tested in .common_code)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 77) 	and	r29=0x7,in1	// r29 = src & 7 (src misalignment, tested in .common_code)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 78) 	mov	f6=f0		// f6=f0 tags this as the memcpy entry; __copy_user sets f6=f1 instead — presumably consumed by the fault handlers to tell the two entries apart (handlers not visible here)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 79) 	mov	retval=in0	// memcpy returns dest (r8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 80) 	br.cond.sptk .common_code	// join the copy path shared with __copy_user
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 81) 	;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 82) END(memcpy)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 83) EXPORT_SYMBOL(memcpy)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 84) GLOBAL_ENTRY(__copy_user)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 85) .prologue
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 86) // check dest alignment
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 87) and r28=0x7,in0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 88) and r29=0x7,in1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 89) mov f6=f1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 90) mov saved_in0=in0 // save dest pointer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 91) mov saved_in1=in1 // save src pointer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 92) mov retval=r0 // initialize return value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 93) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 94) .common_code:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 95) cmp.gt p15,p0=8,in2 // check for small size
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 96) cmp.ne p13,p0=0,r28 // check dest alignment
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 97) cmp.ne p14,p0=0,r29 // check src alignment
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 98) add src0=0,in1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 99) sub r30=8,r28 // for .align_dest
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 100) mov saved_in2=in2 // save len
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 101) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 102) add dst0=0,in0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 103) add dst1=1,in0 // dest odd index
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 104) cmp.le p6,p0 = 1,r30 // for .align_dest
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 105) (p15) br.cond.dpnt .memcpy_short
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 106) (p13) br.cond.dpnt .align_dest
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 107) (p14) br.cond.dpnt .unaligned_src
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 108) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 109)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 110) // both dest and src are aligned on 8-byte boundary
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 111) .aligned_src:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 112) .save ar.pfs, saved_pfs
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 113) alloc saved_pfs=ar.pfs,3,Nrot-3,0,Nrot
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 114) .save pr, saved_pr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 115) mov saved_pr=pr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 116)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 117) shr.u cnt=in2,7 // this much cache line
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 118) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 119) cmp.lt p6,p0=2*PREFETCH_DIST,cnt
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 120) cmp.lt p7,p8=1,cnt
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 121) .save ar.lc, saved_lc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 122) mov saved_lc=ar.lc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 123) .body
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 124) add cnt=-1,cnt
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 125) add src_pre_mem=0,in1 // prefetch src pointer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 126) add dst_pre_mem=0,in0 // prefetch dest pointer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 127) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 128) (p7) mov ar.lc=cnt // prefetch count
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 129) (p8) mov ar.lc=r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 130) (p6) br.cond.dpnt .long_copy
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 131) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 132)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 133) .prefetch:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 134) lfetch.fault [src_pre_mem], 128
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 135) lfetch.fault.excl [dst_pre_mem], 128
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 136) br.cloop.dptk.few .prefetch
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 137) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 138)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 139) .medium_copy:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 140) and tmp=31,in2 // copy length after iteration
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 141) shr.u r29=in2,5 // number of 32-byte iteration
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 142) add dst1=8,dst0 // 2nd dest pointer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 143) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 144) add cnt=-1,r29 // ctop iteration adjustment
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 145) cmp.eq p10,p0=r29,r0 // do we really need to loop?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 146) add src1=8,src0 // 2nd src pointer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 147) cmp.le p6,p0=8,tmp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 148) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 149) cmp.le p7,p0=16,tmp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 150) mov ar.lc=cnt // loop setup
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 151) cmp.eq p16,p17 = r0,r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 152) mov ar.ec=2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 153) (p10) br.dpnt.few .aligned_src_tail
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 154) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 155) TEXT_ALIGN(32)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 156) 1:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 157) EX(.ex_handler, (p16) ld8 r34=[src0],16)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 158) EK(.ex_handler, (p16) ld8 r38=[src1],16)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 159) EX(.ex_handler, (p17) st8 [dst0]=r33,16)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 160) EK(.ex_handler, (p17) st8 [dst1]=r37,16)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 161) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 162) EX(.ex_handler, (p16) ld8 r32=[src0],16)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 163) EK(.ex_handler, (p16) ld8 r36=[src1],16)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 164) EX(.ex_handler, (p16) st8 [dst0]=r34,16)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 165) EK(.ex_handler, (p16) st8 [dst1]=r38,16)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 166) br.ctop.dptk.few 1b
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 167) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 168)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 169) .aligned_src_tail:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 170) EX(.ex_handler, (p6) ld8 t1=[src0])
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 171) mov ar.lc=saved_lc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 172) mov ar.pfs=saved_pfs
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 173) EX(.ex_hndlr_s, (p7) ld8 t2=[src1],8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 174) cmp.le p8,p0=24,tmp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 175) and r21=-8,tmp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 176) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 177) EX(.ex_hndlr_s, (p8) ld8 t3=[src1])
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 178) EX(.ex_handler, (p6) st8 [dst0]=t1) // store byte 1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 179) and in2=7,tmp // remaining length
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 180) EX(.ex_hndlr_d, (p7) st8 [dst1]=t2,8) // store byte 2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 181) add src0=src0,r21 // setting up src pointer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 182) add dst0=dst0,r21 // setting up dest pointer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 183) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 184) EX(.ex_handler, (p8) st8 [dst1]=t3) // store byte 3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 185) mov pr=saved_pr,-1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 186) br.dptk.many .memcpy_short
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 187) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 188)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 189) /* code taken from copy_page_mck */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 190) .long_copy:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 191) .rotr v[2*PREFETCH_DIST]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 192) .rotp p[N]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 193)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 194) mov src_pre_mem = src0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 195) mov pr.rot = 0x10000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 196) mov ar.ec = 1 // special unrolled loop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 197)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 198) mov dst_pre_mem = dst0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 199)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 200) add src_pre_l2 = 8*8, src0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 201) add dst_pre_l2 = 8*8, dst0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 202) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 203) add src0 = 8, src_pre_mem // first t1 src
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 204) mov ar.lc = 2*PREFETCH_DIST - 1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 205) shr.u cnt=in2,7 // number of lines
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 206) add src1 = 3*8, src_pre_mem // first t3 src
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 207) add dst0 = 8, dst_pre_mem // first t1 dst
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 208) add dst1 = 3*8, dst_pre_mem // first t3 dst
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 209) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 210) and tmp=127,in2 // remaining bytes after this block
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 211) add cnt = -(2*PREFETCH_DIST) - 1, cnt
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 212) // same as .line_copy loop, but with all predicated-off instructions removed:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 213) .prefetch_loop:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 214) EX(.ex_hndlr_lcpy_1, (p[A]) ld8 v[A] = [src_pre_mem], 128) // M0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 215) EK(.ex_hndlr_lcpy_1, (p[B]) st8 [dst_pre_mem] = v[B], 128) // M2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 216) br.ctop.sptk .prefetch_loop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 217) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 218) cmp.eq p16, p0 = r0, r0 // reset p16 to 1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 219) mov ar.lc = cnt
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 220) mov ar.ec = N // # of stages in pipeline
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 221) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 222) .line_copy:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 223) EX(.ex_handler, (p[D]) ld8 t2 = [src0], 3*8) // M0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 224) EK(.ex_handler, (p[D]) ld8 t4 = [src1], 3*8) // M1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 225) EX(.ex_handler_lcpy, (p[B]) st8 [dst_pre_mem] = v[B], 128) // M2 prefetch dst from memory
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 226) EK(.ex_handler_lcpy, (p[D]) st8 [dst_pre_l2] = n8, 128) // M3 prefetch dst from L2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 227) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 228) EX(.ex_handler_lcpy, (p[A]) ld8 v[A] = [src_pre_mem], 128) // M0 prefetch src from memory
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 229) EK(.ex_handler_lcpy, (p[C]) ld8 n8 = [src_pre_l2], 128) // M1 prefetch src from L2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 230) EX(.ex_handler, (p[D]) st8 [dst0] = t1, 8) // M2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 231) EK(.ex_handler, (p[D]) st8 [dst1] = t3, 8) // M3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 232) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 233) EX(.ex_handler, (p[D]) ld8 t5 = [src0], 8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 234) EK(.ex_handler, (p[D]) ld8 t7 = [src1], 3*8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 235) EX(.ex_handler, (p[D]) st8 [dst0] = t2, 3*8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 236) EK(.ex_handler, (p[D]) st8 [dst1] = t4, 3*8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 237) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 238) EX(.ex_handler, (p[D]) ld8 t6 = [src0], 3*8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 239) EK(.ex_handler, (p[D]) ld8 t10 = [src1], 8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 240) EX(.ex_handler, (p[D]) st8 [dst0] = t5, 8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 241) EK(.ex_handler, (p[D]) st8 [dst1] = t7, 3*8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 242) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 243) EX(.ex_handler, (p[D]) ld8 t9 = [src0], 3*8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 244) EK(.ex_handler, (p[D]) ld8 t11 = [src1], 3*8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 245) EX(.ex_handler, (p[D]) st8 [dst0] = t6, 3*8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 246) EK(.ex_handler, (p[D]) st8 [dst1] = t10, 8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 247) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 248) EX(.ex_handler, (p[D]) ld8 t12 = [src0], 8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 249) EK(.ex_handler, (p[D]) ld8 t14 = [src1], 8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 250) EX(.ex_handler, (p[D]) st8 [dst0] = t9, 3*8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 251) EK(.ex_handler, (p[D]) st8 [dst1] = t11, 3*8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 252) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 253) EX(.ex_handler, (p[D]) ld8 t13 = [src0], 4*8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 254) EK(.ex_handler, (p[D]) ld8 t15 = [src1], 4*8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 255) EX(.ex_handler, (p[D]) st8 [dst0] = t12, 8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 256) EK(.ex_handler, (p[D]) st8 [dst1] = t14, 8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 257) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 258) EX(.ex_handler, (p[C]) ld8 t1 = [src0], 8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 259) EK(.ex_handler, (p[C]) ld8 t3 = [src1], 8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 260) EX(.ex_handler, (p[D]) st8 [dst0] = t13, 4*8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 261) EK(.ex_handler, (p[D]) st8 [dst1] = t15, 4*8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 262) br.ctop.sptk .line_copy
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 263) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 264)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 265) add dst0=-8,dst0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 266) add src0=-8,src0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 267) mov in2=tmp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 268) .restore sp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 269) br.sptk.many .medium_copy
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 270) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 271)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 272) #define BLOCK_SIZE 128*32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 273) #define blocksize r23
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 274) #define curlen r24
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 275)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 276) // dest is on 8-byte boundary, src is not. We need to do
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 277) // ld8-ld8, shrp, then st8. Max 8 byte copy per cycle.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 278) .unaligned_src:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 279) .prologue
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 280) .save ar.pfs, saved_pfs
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 281) alloc saved_pfs=ar.pfs,3,5,0,8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 282) .save ar.lc, saved_lc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 283) mov saved_lc=ar.lc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 284) .save pr, saved_pr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 285) mov saved_pr=pr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 286) .body
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 287) .4k_block:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 288) mov saved_in0=dst0 // need to save all input arguments
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 289) mov saved_in2=in2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 290) mov blocksize=BLOCK_SIZE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 291) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 292) cmp.lt p6,p7=blocksize,in2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 293) mov saved_in1=src0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 294) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 295) (p6) mov in2=blocksize
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 296) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 297) shr.u r21=in2,7 // this much cache line
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 298) shr.u r22=in2,4 // number of 16-byte iteration
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 299) and curlen=15,in2 // copy length after iteration
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 300) and r30=7,src0 // source alignment
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 301) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 302) cmp.lt p7,p8=1,r21
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 303) add cnt=-1,r21
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 304) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 305)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 306) add src_pre_mem=0,src0 // prefetch src pointer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 307) add dst_pre_mem=0,dst0 // prefetch dest pointer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 308) and src0=-8,src0 // 1st src pointer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 309) (p7) mov ar.lc = cnt
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 310) (p8) mov ar.lc = r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 311) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 312) TEXT_ALIGN(32)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 313) 1: lfetch.fault [src_pre_mem], 128
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 314) lfetch.fault.excl [dst_pre_mem], 128
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 315) br.cloop.dptk.few 1b
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 316) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 317)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 318) shladd dst1=r22,3,dst0 // 2nd dest pointer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 319) shladd src1=r22,3,src0 // 2nd src pointer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 320) cmp.eq p8,p9=r22,r0 // do we really need to loop?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 321) cmp.le p6,p7=8,curlen; // have at least 8 byte remaining?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 322) add cnt=-1,r22 // ctop iteration adjustment
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 323) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 324) EX(.ex_handler, (p9) ld8 r33=[src0],8) // loop primer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 325) EK(.ex_handler, (p9) ld8 r37=[src1],8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 326) (p8) br.dpnt.few .noloop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 327) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 328)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 329) // The jump address is calculated based on src alignment. The COPYU
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 330) // macro below needs to confine its size to a power of two, so an entry
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 331) // can be calculated using shl instead of an expensive multiply. The
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 332) // size is then hard coded by the following #define to match the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 333) // actual size. This makes it somewhat tedious when the COPYU macro gets
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 334) // changed and this needs to be adjusted to match.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 335) #define LOOP_SIZE 6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 336) 1:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 337) mov r29=ip // jmp_table thread
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 338) mov ar.lc=cnt
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 339) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 340) add r29=.jump_table - 1b - (.jmp1-.jump_table), r29
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 341) shl r28=r30, LOOP_SIZE // jmp_table thread
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 342) mov ar.ec=2 // loop setup
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 343) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 344) add r29=r29,r28 // jmp_table thread
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 345) cmp.eq p16,p17=r0,r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 346) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 347) mov b6=r29 // jmp_table thread
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 348) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 349) br.cond.sptk.few b6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 350)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 351) // for 8-15 byte case
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 352) // We will skip the loop, but need to replicate the side effect
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 353) // that the loop produces.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 354) .noloop:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 355) EX(.ex_handler, (p6) ld8 r37=[src1],8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 356) add src0=8,src0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 357) (p6) shl r25=r30,3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 358) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 359) EX(.ex_handler, (p6) ld8 r27=[src1])
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 360) (p6) shr.u r28=r37,r25
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 361) (p6) sub r26=64,r25
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 362) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 363) (p6) shl r27=r27,r26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 364) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 365) (p6) or r21=r28,r27
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 366)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 367) .unaligned_src_tail:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 368) /* check if we have more than blocksize to copy, if so go back */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 369) cmp.gt p8,p0=saved_in2,blocksize
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 370) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 371) (p8) add dst0=saved_in0,blocksize
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 372) (p8) add src0=saved_in1,blocksize
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 373) (p8) sub in2=saved_in2,blocksize
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 374) (p8) br.dpnt .4k_block
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 375) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 376)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 377) /* we have up to 15 bytes to copy in the tail.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 378) * part of work is already done in the jump table code
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 379) * we are at the following state.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 380) * src side:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 381) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 382) * xxxxxx xx <----- r21 has xxxxxxxx already
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 383) * -------- -------- --------
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 384) * 0 8 16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 385) * ^
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 386) * |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 387) * src1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 388) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 389) * dst
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 390) * -------- -------- --------
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 391) * ^
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 392) * |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 393) * dst1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 394) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 395) EX(.ex_handler, (p6) st8 [dst1]=r21,8) // more than 8 byte to copy
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 396) (p6) add curlen=-8,curlen // update length
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 397) mov ar.pfs=saved_pfs
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 398) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 399) mov ar.lc=saved_lc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 400) mov pr=saved_pr,-1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 401) mov in2=curlen // remaining length
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 402) mov dst0=dst1 // dest pointer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 403) add src0=src1,r30 // forward by src alignment
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 404) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 405)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 406) // 7 byte or smaller.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 407) .memcpy_short:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 408) cmp.le p8,p9 = 1,in2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 409) cmp.le p10,p11 = 2,in2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 410) cmp.le p12,p13 = 3,in2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 411) cmp.le p14,p15 = 4,in2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 412) add src1=1,src0 // second src pointer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 413) add dst1=1,dst0 // second dest pointer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 414) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 415)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 416) EX(.ex_handler_short, (p8) ld1 t1=[src0],2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 417) EK(.ex_handler_short, (p10) ld1 t2=[src1],2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 418) (p9) br.ret.dpnt rp // 0 byte copy
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 419) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 420)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 421) EX(.ex_handler_short, (p8) st1 [dst0]=t1,2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 422) EK(.ex_handler_short, (p10) st1 [dst1]=t2,2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 423) (p11) br.ret.dpnt rp // 1 byte copy
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 424)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 425) EX(.ex_handler_short, (p12) ld1 t3=[src0],2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 426) EK(.ex_handler_short, (p14) ld1 t4=[src1],2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 427) (p13) br.ret.dpnt rp // 2 byte copy
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 428) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 429)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 430) cmp.le p6,p7 = 5,in2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 431) cmp.le p8,p9 = 6,in2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 432) cmp.le p10,p11 = 7,in2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 433)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 434) EX(.ex_handler_short, (p12) st1 [dst0]=t3,2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 435) EK(.ex_handler_short, (p14) st1 [dst1]=t4,2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 436) (p15) br.ret.dpnt rp // 3 byte copy
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 437) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 438)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 439) EX(.ex_handler_short, (p6) ld1 t5=[src0],2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 440) EK(.ex_handler_short, (p8) ld1 t6=[src1],2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 441) (p7) br.ret.dpnt rp // 4 byte copy
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 442) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 443)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 444) EX(.ex_handler_short, (p6) st1 [dst0]=t5,2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 445) EK(.ex_handler_short, (p8) st1 [dst1]=t6,2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 446) (p9) br.ret.dptk rp // 5 byte copy
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 447)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 448) EX(.ex_handler_short, (p10) ld1 t7=[src0],2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 449) (p11) br.ret.dptk rp // 6 byte copy
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 450) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 451)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 452) EX(.ex_handler_short, (p10) st1 [dst0]=t7,2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 453) br.ret.dptk rp // done all cases
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 454)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 455)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 456) /* Align dest to nearest 8-byte boundary. We know we have at
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 457) * least 7 bytes to copy, enough to crawl to 8-byte boundary.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 458) * Actual number of byte to crawl depend on the dest alignment.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 459) * 7 byte or less is taken care at .memcpy_short
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 460)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 461) * src0 - source even index
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 462) * src1 - source odd index
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 463) * dst0 - dest even index
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 464) * dst1 - dest odd index
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 465) * r30 - distance to 8-byte boundary
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 466) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 467)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 468) .align_dest:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 469) add src1=1,in1 // source odd index
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 470) cmp.le p7,p0 = 2,r30 // for .align_dest
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 471) cmp.le p8,p0 = 3,r30 // for .align_dest
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 472) EX(.ex_handler_short, (p6) ld1 t1=[src0],2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 473) cmp.le p9,p0 = 4,r30 // for .align_dest
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 474) cmp.le p10,p0 = 5,r30
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 475) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 476) EX(.ex_handler_short, (p7) ld1 t2=[src1],2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 477) EK(.ex_handler_short, (p8) ld1 t3=[src0],2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 478) cmp.le p11,p0 = 6,r30
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 479) EX(.ex_handler_short, (p6) st1 [dst0] = t1,2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 480) cmp.le p12,p0 = 7,r30
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 481) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 482) EX(.ex_handler_short, (p9) ld1 t4=[src1],2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 483) EK(.ex_handler_short, (p10) ld1 t5=[src0],2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 484) EX(.ex_handler_short, (p7) st1 [dst1] = t2,2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 485) EK(.ex_handler_short, (p8) st1 [dst0] = t3,2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 486) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 487) EX(.ex_handler_short, (p11) ld1 t6=[src1],2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 488) EK(.ex_handler_short, (p12) ld1 t7=[src0],2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 489) cmp.eq p6,p7=r28,r29
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 490) EX(.ex_handler_short, (p9) st1 [dst1] = t4,2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 491) EK(.ex_handler_short, (p10) st1 [dst0] = t5,2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 492) sub in2=in2,r30
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 493) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 494) EX(.ex_handler_short, (p11) st1 [dst1] = t6,2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 495) EK(.ex_handler_short, (p12) st1 [dst0] = t7)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 496) add dst0=in0,r30 // setup arguments
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 497) add src0=in1,r30
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 498) (p6) br.cond.dptk .aligned_src
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 499) (p7) br.cond.dpnt .unaligned_src
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 500) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 501)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 502) /* main loop body in jump table format */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 503) #define COPYU(shift) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 504) 1: \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 505) EX(.ex_handler, (p16) ld8 r32=[src0],8); /* 1 */ \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 506) EK(.ex_handler, (p16) ld8 r36=[src1],8); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 507) (p17) shrp r35=r33,r34,shift;; /* 1 */ \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 508) EX(.ex_handler, (p6) ld8 r22=[src1]); /* common, prime for tail section */ \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 509) nop.m 0; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 510) (p16) shrp r38=r36,r37,shift; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 511) EX(.ex_handler, (p17) st8 [dst0]=r35,8); /* 1 */ \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 512) EK(.ex_handler, (p17) st8 [dst1]=r39,8); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 513) br.ctop.dptk.few 1b;; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 514) (p7) add src1=-8,src1; /* back out for <8 byte case */ \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 515) shrp r21=r22,r38,shift; /* speculative work */ \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 516) br.sptk.few .unaligned_src_tail /* branch out of jump table */ \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 517) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 518) TEXT_ALIGN(32)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 519) .jump_table:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 520) COPYU(8) // unaligned cases
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 521) .jmp1:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 522) COPYU(16)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 523) COPYU(24)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 524) COPYU(32)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 525) COPYU(40)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 526) COPYU(48)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 527) COPYU(56)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 528)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 529) #undef A
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 530) #undef B
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 531) #undef C
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 532) #undef D
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 533)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 534) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 535) * Due to lack of local tag support in gcc 2.x assembler, it is not clear which
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 536) * instruction failed in the bundle. The exception algorithm is that we
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 537) * first figure out the faulting address, then detect if there is any
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 538) * progress made on the copy, if so, redo the copy from last known copied
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 539) * location up to the faulting address (exclusive). In the copy_from_user
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 540) * case, remaining byte in kernel buffer will be zeroed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 541) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 542) * Take copy_from_user as an example, in the code there are multiple loads
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 543) * in a bundle and those multiple loads could span over two pages, the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 544) * faulting address is calculated as page_round_down(max(src0, src1)).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 545) * This is based on knowledge that if we can access one byte in a page, we
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 546) * can access any byte in that page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 547) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 548) * predicate used in the exception handler:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 549) * p6-p7: direction
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 550) * p10-p11: src faulting addr calculation
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 551) * p12-p13: dst faulting addr calculation
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 552) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 553)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 554) #define A r19
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 555) #define B r20
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 556) #define C r21
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 557) #define D r22
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 558) #define F r28
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 559)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 560) #define saved_retval loc0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 561) #define saved_rtlink loc1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 562) #define saved_pfs_stack loc2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 563)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 564) .ex_hndlr_s:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 565) add src0=8,src0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 566) br.sptk .ex_handler
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 567) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 568) .ex_hndlr_d:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 569) add dst0=8,dst0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 570) br.sptk .ex_handler
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 571) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 572) .ex_hndlr_lcpy_1:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 573) mov src1=src_pre_mem
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 574) mov dst1=dst_pre_mem
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 575) cmp.gtu p10,p11=src_pre_mem,saved_in1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 576) cmp.gtu p12,p13=dst_pre_mem,saved_in0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 577) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 578) (p10) add src0=8,saved_in1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 579) (p11) mov src0=saved_in1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 580) (p12) add dst0=8,saved_in0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 581) (p13) mov dst0=saved_in0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 582) br.sptk .ex_handler
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 583) .ex_handler_lcpy:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 584) // in line_copy block, the preload addresses should always ahead
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 585) // of the other two src/dst pointers. Furthermore, src1/dst1 should
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 586) // always ahead of src0/dst0.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 587) mov src1=src_pre_mem
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 588) mov dst1=dst_pre_mem
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 589) .ex_handler:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 590) mov pr=saved_pr,-1 // first restore pr, lc, and pfs
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 591) mov ar.lc=saved_lc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 592) mov ar.pfs=saved_pfs
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 593) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 594) .ex_handler_short: // fault occurred in these sections didn't change pr, lc, pfs
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 595) cmp.ltu p6,p7=saved_in0, saved_in1 // get the copy direction
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 596) cmp.ltu p10,p11=src0,src1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 597) cmp.ltu p12,p13=dst0,dst1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 598) fcmp.eq p8,p0=f6,f0 // is it memcpy?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 599) mov tmp = dst0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 600) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 601) (p11) mov src1 = src0 // pick the larger of the two
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 602) (p13) mov dst0 = dst1 // make dst0 the smaller one
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 603) (p13) mov dst1 = tmp // and dst1 the larger one
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 604) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 605) (p6) dep F = r0,dst1,0,PAGE_SHIFT // usr dst round down to page boundary
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 606) (p7) dep F = r0,src1,0,PAGE_SHIFT // usr src round down to page boundary
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 607) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 608) (p6) cmp.le p14,p0=dst0,saved_in0 // no progress has been made on store
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 609) (p7) cmp.le p14,p0=src0,saved_in1 // no progress has been made on load
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 610) mov retval=saved_in2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 611) (p8) ld1 tmp=[src1] // force an oops for memcpy call
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 612) (p8) st1 [dst1]=r0 // force an oops for memcpy call
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 613) (p14) br.ret.sptk.many rp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 614)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 615) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 616) * The remaining byte to copy is calculated as:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 617) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 618) * A = (faulting_addr - orig_src) -> len to faulting ld address
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 619) * or
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 620) * (faulting_addr - orig_dst) -> len to faulting st address
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 621) * B = (cur_dst - orig_dst) -> len copied so far
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 622) * C = A - B -> len need to be copied
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 623) * D = orig_len - A -> len need to be left along
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 624) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 625) (p6) sub A = F, saved_in0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 626) (p7) sub A = F, saved_in1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 627) clrrrb
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 628) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 629) alloc saved_pfs_stack=ar.pfs,3,3,3,0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 630) cmp.lt p8,p0=A,r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 631) sub B = dst0, saved_in0 // how many byte copied so far
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 632) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 633) (p8) mov A = 0; // A shouldn't be negative, cap it
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 634) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 635) sub C = A, B
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 636) sub D = saved_in2, A
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 637) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 638) cmp.gt p8,p0=C,r0 // more than 1 byte?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 639) mov r8=0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 640) mov saved_retval = D
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 641) mov saved_rtlink = b0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 642)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 643) add out0=saved_in0, B
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 644) add out1=saved_in1, B
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 645) mov out2=C
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 646) (p8) br.call.sptk.few b0=__copy_user // recursive call
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 647) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 648)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 649) add saved_retval=saved_retval,r8 // above might return non-zero value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 650) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 651)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 652) mov retval=saved_retval
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 653) mov ar.pfs=saved_pfs_stack
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 654) mov b0=saved_rtlink
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 655) br.ret.sptk.many rp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 656)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 657) /* end of McKinley specific optimization */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 658) END(__copy_user)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 659) EXPORT_SYMBOL(__copy_user)