^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1) /* SPDX-License-Identifier: GPL-2.0 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2) /* Optimized version of the standard memset() function.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4) Copyright (c) 2002 Hewlett-Packard Co/CERN
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5) Sverre Jarp <Sverre.Jarp@cern.ch>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7) Return: dest
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9) Inputs:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 10) in0: dest
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 11) in1: value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 12) in2: count
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 13)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 14) The algorithm is fairly straightforward: set byte by byte until we
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 15) get to a 16B-aligned address, then loop on 128 B chunks using an
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 16) early store as prefetching, then loop on 32B chunks, then clear remaining
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 17) words, finally clear remaining bytes.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 18) Since a stf.spill f0 can store 16B in one go, we use this instruction
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 19) to get peak speed when value = 0. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 20)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 21) #include <asm/asmmacro.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 22) #include <asm/export.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 23) #undef ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 24)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 25) #define dest in0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 26) #define value in1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 27) #define cnt in2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 28)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 29) #define tmp r31
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 30) #define save_lc r30
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 31) #define ptr0 r29
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 32) #define ptr1 r28
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 33) #define ptr2 r27
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 34) #define ptr3 r26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 35) #define ptr9 r24
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 36) #define loopcnt r23
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 37) #define linecnt r22
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 38) #define bytecnt r21
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 39)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 40) #define fvalue f6
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 41)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 42) // This routine uses only scratch predicate registers (p6 - p15)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 43) #define p_scr p6 // default register for same-cycle branches
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 44) #define p_nz p7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 45) #define p_zr p8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 46) #define p_unalgn p9
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 47) #define p_y p11
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 48) #define p_n p12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 49) #define p_yy p13
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 50) #define p_nn p14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 51)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 52) #define MIN1 15
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 53) #define MIN1P1HALF 8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 54) #define LINE_SIZE 128
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 55) #define LSIZE_SH 7 // shift amount
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 56) #define PREF_AHEAD 8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 57)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 58) GLOBAL_ENTRY(memset)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 59) { .mmi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 60) .prologue
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 61) alloc tmp = ar.pfs, 3, 0, 0, 0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 62) lfetch.nt1 [dest] // start prefetching the destination early
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 63) .save ar.lc, save_lc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 64) mov.i save_lc = ar.lc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 65) .body
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 66) } { .mmi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 67) mov ret0 = dest // return value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 68) cmp.ne p_nz, p_zr = value, r0 // use stf.spill if value is zero
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 69) cmp.eq p_scr, p0 = cnt, r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 70) ;; }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 71) { .mmi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 72) and ptr2 = -(MIN1+1), dest // aligned address
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 73) and tmp = MIN1, dest // prepare to check for correct alignment
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 74) tbit.nz p_y, p_n = dest, 0 // Do we have an odd address? (M_B_U)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 75) } { .mib
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 76) mov ptr1 = dest
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 77) mux1 value = value, @brcst // create 8 identical bytes in word
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 78) (p_scr) br.ret.dpnt.many rp // return immediately if count = 0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 79) ;; }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 80) { .mib
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 81) cmp.ne p_unalgn, p0 = tmp, r0 // p_unalgn <- dest is not 16B aligned
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 82) } { .mib
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 83) sub bytecnt = (MIN1+1), tmp // NB: # of bytes to move is 1 higher than loopcnt
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 84) cmp.gt p_scr, p0 = 16, cnt // is it a minimalistic task?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 85) (p_scr) br.cond.dptk.many .move_bytes_unaligned // go move just a few (M_B_U)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 86) ;; }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 87) { .mmi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 88) (p_unalgn) add ptr1 = (MIN1+1), ptr2 // after alignment
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 89) (p_unalgn) add ptr2 = MIN1P1HALF, ptr2 // after alignment
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 90) (p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 3 // should we do a st8 ?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 91) ;; }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 92) { .mib
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 93) (p_y) add cnt = -8, cnt // account for the 8 bytes the st8 stores
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 94) (p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 2 // should we do a st4 ?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 95) } { .mib
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 96) (p_y) st8 [ptr2] = value,-4 // fill 8B of the head, then step down to the 4B slot
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 97) (p_n) add ptr2 = 4, ptr2 // no st8: move up to the 4B slot instead
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 98) ;; }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 99) { .mib
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 100) (p_yy) add cnt = -4, cnt // account for the 4 bytes the st4 stores
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 101) (p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 1 // should we do a st2 ?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 102) } { .mib
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 103) (p_yy) st4 [ptr2] = value,-2 // fill 4B, then step down to the 2B slot
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 104) (p_nn) add ptr2 = 2, ptr2 // no st4: move up to the 2B slot instead
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 105) ;; }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 106) { .mmi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 107) mov tmp = LINE_SIZE+1 // for compare
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 108) (p_y) add cnt = -2, cnt // account for the 2 bytes the st2 stores
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 109) (p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 0 // should we do a st1 ?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 110) } { .mmi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 111) setf.sig fvalue=value // transfer value to FLP side
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 112) (p_y) st2 [ptr2] = value,-1 // fill 2B, then step down to the 1B slot
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 113) (p_n) add ptr2 = 1, ptr2 // no st2: move up to the 1B slot instead
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 114) ;; }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 115)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 116) { .mmi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 117) (p_yy) st1 [ptr2] = value // last byte of the unaligned head
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 118) cmp.gt p_scr, p0 = tmp, cnt // is it a minimalistic task?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 119) } { .mbb
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 120) (p_yy) add cnt = -1, cnt // account for the st1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 121) (p_scr) br.cond.dpnt.many .fraction_of_line // go move just a few
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 122) ;; }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 123)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 124) { .mib
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 125) nop.m 0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 126) shr.u linecnt = cnt, LSIZE_SH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 127) (p_zr) br.cond.dptk.many .l1b // Jump to use stf.spill
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 128) ;; }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 129)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 130) TEXT_ALIGN(32) // --------------------- // L1A: store ahead into cache lines; fill later
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 131) { .mmi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 132) and tmp = -(LINE_SIZE), cnt // compute end of range
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 133) mov ptr9 = ptr1 // used for prefetching
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 134) and cnt = (LINE_SIZE-1), cnt // remainder
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 135) } { .mmi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 136) mov loopcnt = PREF_AHEAD-1 // default prefetch loop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 137) cmp.gt p_scr, p0 = PREF_AHEAD, linecnt // check against actual value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 138) ;; }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 139) { .mmi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 140) (p_scr) add loopcnt = -1, linecnt // fewer than PREF_AHEAD lines: prefetch only linecnt of them
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 141) add ptr2 = 8, ptr1 // start of stores (beyond prefetch stores)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 142) add ptr1 = tmp, ptr1 // first address beyond total range
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 143) ;; }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 144) { .mmi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 145) add tmp = -1, linecnt // next loop count
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 146) mov.i ar.lc = loopcnt // trip count for the prefetch-store loop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 147) ;; }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 148) .pref_l1a:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 149) { .mib
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 150) stf8 [ptr9] = fvalue, 128 // Do stores one cache line apart
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 151) nop.i 0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 152) br.cloop.dptk.few .pref_l1a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 153) ;; }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 154) { .mmi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 155) add ptr0 = 16, ptr2 // Two stores in parallel
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 156) mov.i ar.lc = tmp // trip count for the main per-line loop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 157) ;; }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 158) .l1ax:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 159) { .mmi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 160) stf8 [ptr2] = fvalue, 8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 161) stf8 [ptr0] = fvalue, 8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 162) ;; }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 163) { .mmi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 164) stf8 [ptr2] = fvalue, 24
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 165) stf8 [ptr0] = fvalue, 24
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 166) ;; }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 167) { .mmi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 168) stf8 [ptr2] = fvalue, 8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 169) stf8 [ptr0] = fvalue, 8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 170) ;; }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 171) { .mmi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 172) stf8 [ptr2] = fvalue, 24
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 173) stf8 [ptr0] = fvalue, 24
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 174) ;; }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 175) { .mmi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 176) stf8 [ptr2] = fvalue, 8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 177) stf8 [ptr0] = fvalue, 8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 178) ;; }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 179) { .mmi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 180) stf8 [ptr2] = fvalue, 24
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 181) stf8 [ptr0] = fvalue, 24
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 182) ;; }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 183) { .mmi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 184) stf8 [ptr2] = fvalue, 8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 185) stf8 [ptr0] = fvalue, 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 186) cmp.lt p_scr, p0 = ptr9, ptr1 // do we need more prefetching?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 187) ;; }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 188) { .mmb
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 189) stf8 [ptr2] = fvalue, 24
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 190) (p_scr) stf8 [ptr9] = fvalue, 128
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 191) br.cloop.dptk.few .l1ax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 192) ;; }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 193) { .mbb
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 194) cmp.le p_scr, p0 = 8, cnt // just a few bytes left ?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 195) (p_scr) br.cond.dpnt.many .fraction_of_line // Branch no. 2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 196) br.cond.dpnt.many .move_bytes_from_alignment // Branch no. 3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 197) ;; }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 198)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 199) TEXT_ALIGN(32)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 200) .l1b: // ------------------------------------ // L1B: store ahead into cache lines; fill later
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 201) { .mmi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 202) and tmp = -(LINE_SIZE), cnt // compute end of range
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 203) mov ptr9 = ptr1 // used for prefetching
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 204) and cnt = (LINE_SIZE-1), cnt // remainder
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 205) } { .mmi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 206) mov loopcnt = PREF_AHEAD-1 // default prefetch loop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 207) cmp.gt p_scr, p0 = PREF_AHEAD, linecnt // check against actual value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 208) ;; }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 209) { .mmi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 210) (p_scr) add loopcnt = -1, linecnt
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 211) add ptr2 = 16, ptr1 // start of stores (beyond prefetch stores)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 212) add ptr1 = tmp, ptr1 // first address beyond total range
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 213) ;; }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 214) { .mmi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 215) add tmp = -1, linecnt // next loop count
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 216) mov.i ar.lc = loopcnt
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 217) ;; }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 218) .pref_l1b:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 219) { .mib
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 220) stf.spill [ptr9] = f0, 128 // Do stores one cache line apart
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 221) nop.i 0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 222) br.cloop.dptk.few .pref_l1b
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 223) ;; }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 224) { .mmi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 225) add ptr0 = 16, ptr2 // Two stores in parallel
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 226) mov.i ar.lc = tmp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 227) ;; }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 228) .l1bx:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 229) { .mmi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 230) stf.spill [ptr2] = f0, 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 231) stf.spill [ptr0] = f0, 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 232) ;; }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 233) { .mmi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 234) stf.spill [ptr2] = f0, 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 235) stf.spill [ptr0] = f0, 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 236) ;; }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 237) { .mmi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 238) stf.spill [ptr2] = f0, 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 239) stf.spill [ptr0] = f0, 64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 240) cmp.lt p_scr, p0 = ptr9, ptr1 // do we need more prefetching?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 241) ;; }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 242) { .mmb
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 243) stf.spill [ptr2] = f0, 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 244) (p_scr) stf.spill [ptr9] = f0, 128
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 245) br.cloop.dptk.few .l1bx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 246) ;; }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 247) { .mib
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 248) cmp.gt p_scr, p0 = 8, cnt // just a few bytes left ?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 249) (p_scr) br.cond.dpnt.many .move_bytes_from_alignment // less than one word left
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 250) ;; }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 251)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 252) .fraction_of_line:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 253) { .mib
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 254) add ptr2 = 16, ptr1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 255) shr.u loopcnt = cnt, 5 // loopcnt = cnt / 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 256) ;; }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 257) { .mib
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 258) cmp.eq p_scr, p0 = loopcnt, r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 259) add loopcnt = -1, loopcnt
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 260) (p_scr) br.cond.dpnt.many .store_words
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 261) ;; }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 262) { .mib
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 263) and cnt = 0x1f, cnt // compute the remaining cnt
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 264) mov.i ar.lc = loopcnt
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 265) ;; }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 266) TEXT_ALIGN(32)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 267) .l2: // ------------------------------------ // L2A: store 32B in 2 cycles
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 268) { .mmb
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 269) stf8 [ptr1] = fvalue, 8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 270) stf8 [ptr2] = fvalue, 8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 271) ;; } { .mmb
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 272) stf8 [ptr1] = fvalue, 24
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 273) stf8 [ptr2] = fvalue, 24
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 274) br.cloop.dptk.many .l2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 275) ;; }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 276) .store_words:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 277) { .mib
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 278) cmp.gt p_scr, p0 = 8, cnt // just a few bytes left ?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 279) (p_scr) br.cond.dpnt.many .move_bytes_from_alignment // Branch
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 280) ;; }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 281)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 282) { .mmi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 283) stf8 [ptr1] = fvalue, 8 // store
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 284) cmp.le p_y, p_n = 16, cnt
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 285) add cnt = -8, cnt // subtract
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 286) ;; }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 287) { .mmi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 288) (p_y) stf8 [ptr1] = fvalue, 8 // store
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 289) (p_y) cmp.le.unc p_yy, p_nn = 16, cnt
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 290) (p_y) add cnt = -8, cnt // subtract
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 291) ;; }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 292) { .mmi // store
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 293) (p_yy) stf8 [ptr1] = fvalue, 8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 294) (p_yy) add cnt = -8, cnt // subtract
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 295) ;; }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 296)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 297) .move_bytes_from_alignment:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 298) { .mib
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 299) cmp.eq p_scr, p0 = cnt, r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 300) tbit.nz.unc p_y, p0 = cnt, 2 // should we terminate with a st4 ?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 301) (p_scr) br.cond.dpnt.few .restore_and_exit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 302) ;; }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 303) { .mib
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 304) (p_y) st4 [ptr1] = value,4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 305) tbit.nz.unc p_yy, p0 = cnt, 1 // should we terminate with a st2 ?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 306) ;; }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 307) { .mib
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 308) (p_yy) st2 [ptr1] = value,2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 309) tbit.nz.unc p_y, p0 = cnt, 0 // should we terminate with a st1 ?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 310) ;; }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 311)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 312) { .mib
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 313) (p_y) st1 [ptr1] = value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 314) ;; }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 315) .restore_and_exit:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 316) { .mib
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 317) nop.m 0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 318) mov.i ar.lc = save_lc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 319) br.ret.sptk.many rp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 320) ;; }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 321)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 322) .move_bytes_unaligned:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 323) { .mmi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 324) .pred.rel "mutex",p_y, p_n
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 325) .pred.rel "mutex",p_yy, p_nn
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 326) (p_n) cmp.le p_yy, p_nn = 4, cnt
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 327) (p_y) cmp.le p_yy, p_nn = 5, cnt
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 328) (p_n) add ptr2 = 2, ptr1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 329) } { .mmi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 330) (p_y) add ptr2 = 3, ptr1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 331) (p_y) st1 [ptr1] = value, 1 // fill 1 (odd-aligned) byte [15, 14 (or less) left]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 332) (p_y) add cnt = -1, cnt
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 333) ;; }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 334) { .mmi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 335) (p_yy) cmp.le.unc p_y, p0 = 8, cnt
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 336) add ptr3 = ptr1, cnt // prepare last store
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 337) mov.i ar.lc = save_lc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 338) } { .mmi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 339) (p_yy) st2 [ptr1] = value, 4 // fill 2 (aligned) bytes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 340) (p_yy) st2 [ptr2] = value, 4 // fill 2 (aligned) bytes [11, 10 (o less) left]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 341) (p_yy) add cnt = -4, cnt
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 342) ;; }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 343) { .mmi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 344) (p_y) cmp.le.unc p_yy, p0 = 8, cnt
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 345) add ptr3 = -1, ptr3 // last store
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 346) tbit.nz p_scr, p0 = cnt, 1 // will there be a st2 at the end ?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 347) } { .mmi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 348) (p_y) st2 [ptr1] = value, 4 // fill 2 (aligned) bytes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 349) (p_y) st2 [ptr2] = value, 4 // fill 2 (aligned) bytes [7, 6 (or less) left]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 350) (p_y) add cnt = -4, cnt
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 351) ;; }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 352) { .mmi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 353) (p_yy) st2 [ptr1] = value, 4 // fill 2 (aligned) bytes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 354) (p_yy) st2 [ptr2] = value, 4 // fill 2 (aligned) bytes [3, 2 (or less) left]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 355) tbit.nz p_y, p0 = cnt, 0 // will there be a st1 at the end ?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 356) } { .mmi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 357) (p_yy) add cnt = -4, cnt
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 358) ;; }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 359) { .mmb
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 360) (p_scr) st2 [ptr1] = value // fill 2 (aligned) bytes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 361) (p_y) st1 [ptr3] = value // fill last byte (using ptr3)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 362) br.ret.sptk.many rp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 363) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 364) END(memset)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 365) EXPORT_SYMBOL(memset)