^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1) /* SPDX-License-Identifier: GPL-2.0 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4) * Optimized version of the standard do_csum() function
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6) * Return: a 64bit quantity containing the 16bit Internet checksum
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8) * Inputs:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9) * in0: address of buffer to checksum (char *)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 10) * in1: length of the buffer (int)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 11) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 12) * Copyright (C) 1999, 2001-2002 Hewlett-Packard Co
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 13) * Stephane Eranian <eranian@hpl.hp.com>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 14) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 15) * 02/04/22 Ken Chen <kenneth.w.chen@intel.com>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 16) * Data locality study on the checksum buffer.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 17) * More optimization cleanup - remove excessive stop bits.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 18) * 02/04/08 David Mosberger <davidm@hpl.hp.com>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 19) * More cleanup and tuning.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 20) * 01/04/18 Jun Nakajima <jun.nakajima@intel.com>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 21) * Clean up and optimize the software pipeline, loading two
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 22) * back-to-back 8-byte words per loop. Clean up the initialization
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 23) * for the loop. Support the cases where load latency = 1 or 2.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 24) * Set CONFIG_IA64_LOAD_LATENCY to 1 or 2 (default).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 25) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 26)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 27) #include <asm/asmmacro.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 28)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 29) //
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 30) // Theory of operations:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 31) // The goal is to go as quickly as possible to the point where
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 32) // we can checksum 16 bytes/loop. Before reaching that point we must
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 33) // take care of incorrect alignment of first byte.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 34) //
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 35) // The code hereafter also takes care of the "tail" part of the buffer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 36) // before entering the core loop, if any. The checksum is a sum so it
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 37) // allows us to commute operations. So we do the "head" and "tail"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 38) // first to finish at full speed in the body. Once we get the head and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 39) // tail values, we feed them into the pipeline, very handy initialization.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 40) //
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 41) // Of course we deal with the special case where the whole buffer fits
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 42) // into one 8 byte word. In this case we have only one entry in the pipeline.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 43) //
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 44) // We use a (LOAD_LATENCY+2)-stage pipeline in the loop to account for
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 45) // possible load latency and also to accommodate for head and tail.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 46) //
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 47) // The end of the function deals with folding the checksum from 64bits
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 48) // down to 16bits taking care of the carry.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 49) //
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 50) // This version avoids synchronization in the core loop by also using a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 51) // pipeline for the accumulation of the checksum in resultx[] (x=1,2).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 52) //
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 53) // wordx[] (x=1,2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 54) // |---|
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 55) // | | 0 : new value loaded in pipeline
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 56) // |---|
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 57) // | | - : in transit data
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 58) // |---|
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 59) // | | LOAD_LATENCY : current value to add to checksum
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 60) // |---|
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 61) // | | LOAD_LATENCY+1 : previous value added to checksum
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 62) // |---| (previous iteration)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 63) //
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 64) // resultx[] (x=1,2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 65) // |---|
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 66) // | | 0 : initial value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 67) // |---|
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 68) // | | LOAD_LATENCY-1 : new checksum
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 69) // |---|
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 70) // | | LOAD_LATENCY : previous value of checksum
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 71) // |---|
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 72) // | | LOAD_LATENCY+1 : final checksum when out of the loop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 73) // |---|
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 74) //
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 75) //
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 76) // See RFC1071 "Computing the Internet Checksum" for various techniques for
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 77) // calculating the Internet checksum.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 78) //
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 79) // NOT YET DONE:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 80) // - Maybe another algorithm which would take care of the folding at the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 81) // end in a different manner
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 82) // - Work with people more knowledgeable than me on the network stack
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 83) // to figure out if we could not split the function depending on the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 84) // type of packet or alignment we get. Like the ip_fast_csum() routine
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 85) // where we know we have at least 20bytes worth of data to checksum.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 86) // - Do a better job of handling small packets.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 87) // - Note on prefetching: it was found that under various load, i.e. ftp read/write,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 88) // nfs read/write, the L1 cache hit rate is at 60% and L2 cache hit rate is at 99.8%
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 89) // on the data that buffer points to (partly because the checksum is often preceded by
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 90) // a copy_from_user()). This finding indicates that lfetch will not be beneficial since
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 91) // the data is already in the cache.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 92) //
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 93)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 94) #define saved_pfs r11 // caller's ar.pfs (restored on exit)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 95) #define hmask r16 // head mask: clears bytes before first byte
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 96) #define tmask r17 // tail mask: clears bytes after last byte
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 97) #define first1 r18 // 8-byte-aligned address of first word
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 98) #define firstval r19 // contents of first word
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 99) #define firstoff r20 // byte offset of buf within first word
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 100) #define last r21 // 8-byte-aligned address of last word
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 101) #define lastval r22 // contents of last word
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 102) #define lastoff r23 // byte offset of end of buffer within last word
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 103) #define saved_lc r24 // caller's ar.lc (loop count register)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 104) #define saved_pr r25 // caller's predicate registers
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 105) #define tmp1 r26
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 106) #define tmp2 r27
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 107) #define tmp3 r28
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 108) #define carry1 r29 // accumulated carries for the word1 stream
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 109) #define carry2 r30 // accumulated carries for the word2 stream
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 110) #define first2 r31 // second load pointer (first1+8), stride 16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 111)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 112) #define buf in0 // arg 0: buffer address
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 113) #define len in1 // arg 1: buffer length in bytes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 114)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 115) #define LOAD_LATENCY 2 // XXX fix me
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 116)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 117) #if (LOAD_LATENCY != 1) && (LOAD_LATENCY != 2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 118) # error "Only 1 or 2 is supported/tested for LOAD_LATENCY."
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 119) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 120)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 121) #define PIPE_DEPTH (LOAD_LATENCY+2) // pipeline stages: latency + head/tail
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 122) #define ELD p[LOAD_LATENCY] // end of load
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 123) #define ELD_1 p[LOAD_LATENCY+1] // and next stage
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 124)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 125) // unsigned long do_csum(unsigned char *buf,long len)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 126)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 127) GLOBAL_ENTRY(do_csum)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 128) .prologue
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 129) .save ar.pfs, saved_pfs
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 130) alloc saved_pfs=ar.pfs,2,16,0,16 // 2 in, 16 local, 0 out, 16 rotating
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 131) .rotr word1[4], word2[4],result1[LOAD_LATENCY+2],result2[LOAD_LATENCY+2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 132) .rotp p[PIPE_DEPTH], pC1[2], pC2[2]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 133) mov ret0=r0 // in case we have zero length
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 134) cmp.lt p0,p6=r0,len // check for zero length or negative (32bit len)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 135) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 136) add tmp1=buf,len // last byte's address
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 137) .save pr, saved_pr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 138) mov saved_pr=pr // preserve predicates (rotation)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 139) (p6) br.ret.spnt.many rp // return if zero or negative length
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 140)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 141) mov hmask=-1 // initialize head mask
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 142) tbit.nz p15,p0=buf,0 // is buf an odd address?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 143) and first1=-8,buf // 8-byte align down address of first1 element
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 144)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 145) and firstoff=7,buf // how many bytes off for first1 element
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 146) mov tmask=-1 // initialize tail mask
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 147)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 148) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 149) adds tmp2=-1,tmp1 // last-1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 150) and lastoff=7,tmp1 // how many bytes off for last element
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 151) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 152) sub tmp1=8,lastoff // complement to lastoff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 153) and last=-8,tmp2 // address of word containing last byte
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 154) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 155) sub tmp3=last,first1 // tmp3=distance from first1 to last
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 156) .save ar.lc, saved_lc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 157) mov saved_lc=ar.lc // save lc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 158) cmp.eq p8,p9=last,first1 // everything fits in one word ?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 159)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 160) ld8 firstval=[first1],8 // load, ahead of time, "first1" word
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 161) and tmp1=7, tmp1 // make sure that if tmp1==8 -> tmp1=0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 162) shl tmp2=firstoff,3 // number of bits
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 163) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 164) (p9) ld8 lastval=[last] // load, ahead of time, "last" word, if needed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 165) shl tmp1=tmp1,3 // number of bits
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 166) (p9) adds tmp3=-8,tmp3 // effectively loaded
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 167) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 168) (p8) mov lastval=r0 // we don't need lastval if first1==last
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 169) shl hmask=hmask,tmp2 // build head mask, mask off [0,first1off[
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 170) shr.u tmask=tmask,tmp1 // build tail mask, mask off ]8,lastoff]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 171) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 172) .body
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 173) #define count tmp3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 174)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 175) (p8) and hmask=hmask,tmask // apply tail mask to head mask if 1 word only
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 176) (p9) and word2[0]=lastval,tmask // mask last word as appropriate
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 177) shr.u count=count,3 // how many 8-byte?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 178) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 179) // If count is odd, finish this 8-byte word so that we can
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 180) // load two back-to-back 8-byte words per loop thereafter.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 181) and word1[0]=firstval,hmask // and mask it as appropriate
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 182) tbit.nz p10,p11=count,0 // if (count is odd)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 183) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 184) (p8) mov result1[0]=word1[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 185) (p9) add result1[0]=word1[0],word2[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 186) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 187) cmp.ltu p6,p0=result1[0],word1[0] // check the carry
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 188) cmp.eq.or.andcm p8,p0=0,count // exit if zero 8-byte
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 189) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 190) (p6) adds result1[0]=1,result1[0] // fold carry back in (end-around carry)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 191) (p8) br.cond.dptk .do_csum_exit // if (within an 8-byte word)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 192) (p11) br.cond.dptk .do_csum16 // if (count is even)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 193)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 194) // Here count is odd.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 195) ld8 word1[1]=[first1],8 // load an 8-byte word
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 196) cmp.eq p9,p10=1,count // if (count == 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 197) adds count=-1,count // loaded an 8-byte word
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 198) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 199) add result1[0]=result1[0],word1[1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 200) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 201) cmp.ltu p6,p0=result1[0],word1[1] // carry out of the add?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 202) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 203) (p6) adds result1[0]=1,result1[0] // fold carry back in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 204) (p9) br.cond.sptk .do_csum_exit // if (count == 1) exit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 205) // Fall through to calculate the checksum, feeding result1[0] as
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 206) // the initial value in result1[0].
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 207) //
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 208) // Calculate the checksum loading two 8-byte words per loop.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 209) //
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 210) .do_csum16:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 211) add first2=8,first1 // second stream starts one word later
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 212) shr.u count=count,1 // we do 16 bytes per loop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 213) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 214) adds count=-1,count // ar.lc counts iterations-1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 215) mov carry1=r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 216) mov carry2=r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 217) brp.loop.imp 1f,2f // branch-prediction hint for the loop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 218) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 219) mov ar.ec=PIPE_DEPTH // epilog count: stages to drain the pipeline
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 220) mov ar.lc=count // set lc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 221) mov pr.rot=1<<16 // p16=1: prime the rotating predicates
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 222) // result1[0] must be initialized in advance.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 223) mov result2[0]=r0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 224) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 225) .align 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 226) 1:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 227) (ELD_1) cmp.ltu pC1[0],p0=result1[LOAD_LATENCY],word1[LOAD_LATENCY+1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 228) (pC1[1])adds carry1=1,carry1 // count carry from previous iteration
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 229) (ELD_1) cmp.ltu pC2[0],p0=result2[LOAD_LATENCY],word2[LOAD_LATENCY+1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 230) (pC2[1])adds carry2=1,carry2 // count carry from previous iteration
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 231) (ELD) add result1[LOAD_LATENCY-1]=result1[LOAD_LATENCY],word1[LOAD_LATENCY]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 232) (ELD) add result2[LOAD_LATENCY-1]=result2[LOAD_LATENCY],word2[LOAD_LATENCY]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 233) 2:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 234) (p[0]) ld8 word1[0]=[first1],16 // even words, stride 16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 235) (p[0]) ld8 word2[0]=[first2],16 // odd words, stride 16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 236) br.ctop.sptk 1b // rotate regs/predicates and loop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 237) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 238) // Since len is a 32-bit value, carry cannot be larger than a 64-bit value.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 239) (pC1[1])adds carry1=1,carry1 // since we miss the last one
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 240) (pC2[1])adds carry2=1,carry2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 241) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 242) add result1[LOAD_LATENCY+1]=result1[LOAD_LATENCY+1],carry1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 243) add result2[LOAD_LATENCY+1]=result2[LOAD_LATENCY+1],carry2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 244) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 245) cmp.ltu p6,p0=result1[LOAD_LATENCY+1],carry1 // carry from adding carries?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 246) cmp.ltu p7,p0=result2[LOAD_LATENCY+1],carry2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 247) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 248) (p6) adds result1[LOAD_LATENCY+1]=1,result1[LOAD_LATENCY+1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 249) (p7) adds result2[LOAD_LATENCY+1]=1,result2[LOAD_LATENCY+1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 250) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 251) add result1[0]=result1[LOAD_LATENCY+1],result2[LOAD_LATENCY+1] // merge both streams
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 252) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 253) cmp.ltu p6,p0=result1[0],result2[LOAD_LATENCY+1]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 254) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 255) (p6) adds result1[0]=1,result1[0] // fold final merge carry
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 256) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 257) .do_csum_exit:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 258) //
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 259) // now fold 64 into 16 bits taking care of carry
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 260) // that's not very good because it has lots of sequentiality
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 261) //
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 262) mov tmp3=0xffff
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 263) zxt4 tmp1=result1[0] // low 32 bits
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 264) shr.u tmp2=result1[0],32 // high 32 bits
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 265) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 266) add result1[0]=tmp1,tmp2 // 64 -> 33 bits
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 267) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 268) and tmp1=result1[0],tmp3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 269) shr.u tmp2=result1[0],16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 270) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 271) add result1[0]=tmp1,tmp2 // 33 -> 18 bits
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 272) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 273) and tmp1=result1[0],tmp3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 274) shr.u tmp2=result1[0],16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 275) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 276) add result1[0]=tmp1,tmp2 // 18 -> 17 bits
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 277) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 278) and tmp1=result1[0],tmp3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 279) shr.u tmp2=result1[0],16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 280) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 281) add ret0=tmp1,tmp2 // final 16-bit value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 282) mov pr=saved_pr,0xffffffffffff0000 // restore rotating predicates
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 283) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 284) // if buf was odd then swap bytes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 285) mov ar.pfs=saved_pfs // restore ar.ec
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 286) (p15) mux1 ret0=ret0,@rev // reverse word
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 287) ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 288) mov ar.lc=saved_lc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 289) (p15) shr.u ret0=ret0,64-16 // + shift back to position = swap bytes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 290) br.ret.sptk.many rp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 291)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 292) // I (Jun Nakajima) wrote an equivalent code (see below), but it was
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 293) // not much better than the original. So keep the original there so that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 294) // someone else can challenge.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 295) //
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 296) // shr.u word1[0]=result1[0],32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 297) // zxt4 result1[0]=result1[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 298) // ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 299) // add result1[0]=result1[0],word1[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 300) // ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 301) // zxt2 result2[0]=result1[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 302) // extr.u word1[0]=result1[0],16,16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 303) // shr.u carry1=result1[0],32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 304) // ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 305) // add result2[0]=result2[0],word1[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 306) // ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 307) // add result2[0]=result2[0],carry1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 308) // ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 309) // extr.u ret0=result2[0],16,16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 310) // ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 311) // add ret0=ret0,result2[0]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 312) // ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 313) // zxt2 ret0=ret0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 314) // mov ar.pfs=saved_pfs // restore ar.ec
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 315) // mov pr=saved_pr,0xffffffffffff0000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 316) // ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 317) // // if buf was odd then swap bytes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 318) // mov ar.lc=saved_lc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 319) //(p15) mux1 ret0=ret0,@rev // reverse word
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 320) // ;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 321) //(p15) shr.u ret0=ret0,64-16 // + shift back to position = swap bytes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 322) // br.ret.sptk.many rp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 323)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 324) END(do_csum)