/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Author: Anton Blanchard <anton@au.ibm.com>
 * Copyright 2015 IBM Corporation.
 */
#include <asm/ppc_asm.h>
#include <asm/export.h>
#include <asm/ppc-opcode.h>

#define off8	r6
#define off16	r7
#define off24	r8

#define rA	r9
#define rB	r10
#define rC	r11
#define rD	r27
#define rE	r28
#define rF	r29
#define rG	r30
#define rH	r31

#ifdef __LITTLE_ENDIAN__
#define LH	lhbrx
#define LW	lwbrx
#define LD	ldbrx
#define LVS	lvsr
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
	vperm _VRT,_VRB,_VRA,_VRC
#else
#define LH	lhzx
#define LW	lwzx
#define LD	ldx
#define LVS	lvsl
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
	vperm _VRT,_VRA,_VRB,_VRC
#endif
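
/*
 * Note on the endian selection above: the compare loops below treat each
 * 8-byte chunk as one unsigned integer, so the byte that comes first in
 * memory must land in the most significant position of the register.
 * That is what ldx gives naturally on big-endian; on little-endian the
 * byte-reversing forms (lhbrx/lwbrx/ldbrx) are used to get the same
 * ordering.  Roughly, for bytes b0..b7 starting at p, both variants yield
 *	reg = b0<<56 | b1<<48 | ... | b7
 * so a cmpld of two chunks orders them the same way a byte-by-byte
 * memcmp would.  Likewise, lvsr plus swapped vperm operands on LE mirror
 * lvsl plus vperm on BE for the unaligned 16-byte loads further down.
 */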

#define VMX_THRESH 4096
#define ENTER_VMX_OPS	\
	mflr	r0; \
	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
	std	r0,16(r1); \
	stdu	r1,-STACKFRAMESIZE(r1); \
	bl	enter_vmx_ops; \
	cmpwi	cr1,r3,0; \
	ld	r0,STACKFRAMESIZE+16(r1); \
	ld	r3,STK_REG(R31)(r1); \
	ld	r4,STK_REG(R30)(r1); \
	ld	r5,STK_REG(R29)(r1); \
	addi	r1,r1,STACKFRAMESIZE; \
	mtlr	r0

#define EXIT_VMX_OPS \
	mflr	r0; \
	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
	std	r0,16(r1); \
	stdu	r1,-STACKFRAMESIZE(r1); \
	bl	exit_vmx_ops; \
	ld	r0,STACKFRAMESIZE+16(r1); \
	ld	r3,STK_REG(R31)(r1); \
	ld	r4,STK_REG(R30)(r1); \
	ld	r5,STK_REG(R29)(r1); \
	addi	r1,r1,STACKFRAMESIZE; \
	mtlr	r0
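
/*
 * ENTER_VMX_OPS/EXIT_VMX_OPS preserve the memcmp arguments (r3/r4/r5) and
 * the link register across the calls to enter_vmx_ops()/exit_vmx_ops(),
 * which decide whether VMX can be used here and save/restore the vector
 * state as needed.  ENTER_VMX_OPS also leaves the enter_vmx_ops() return
 * value reflected in cr1: a "beq cr1" after it means VMX could not be used
 * and the caller must take the non-VMX path.
 */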

/*
 * LD_VSR_CROSS16B loads the second 16 bytes for _vaddr, which is not
 * aligned to a 16-byte boundary, and permutes the result with the first
 * 16 bytes.
 *
 * | y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z |
 *   ^                               ^                                 ^
 * 0xbbbb10                      0xbbbb20                        0xbbbb30
 *                           ^
 *                           _vaddr
 *
 *
 * _vmask is the mask generated by LVS
 * _v1st_qw is the 1st aligned QW of the current addr which is already loaded.
 *   for example: 0xyyyyyyyyyyyyy012 for big endian
 * _v2nd_qw is the 2nd aligned QW of the current _vaddr to be loaded.
 *   for example: 0x3456789abcdefzzz for big endian
 * The permute result is saved in _v_res.
 *   for example: 0x0123456789abcdef for big endian.
 */
#define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \
	lvx	_v2nd_qw,_vaddr,off16; \
	VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask)

/*
 * There are 2 categories for memcmp:
 * 1) src/dst has the same offset to the 8 bytes boundary. The handlers
 *    are named like .Lsameoffset_xxxx
 * 2) src/dst has different offset to the 8 bytes boundary. The handlers
 *    are named like .Ldiffoffset_xxxx
 */
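/*
 * Rough C model of the dispatch (illustrative only; the helper names used
 * here are made up and do not exist anywhere):
 *
 *	int memcmp(const void *s1, const void *s2, size_t n)
 *	{
 *		if (n == 0)
 *			return 0;
 *		if ((((unsigned long)s1 ^ (unsigned long)s2) & 7) == 0)
 *			return sameoffset_cmp(s1, s2, n);	// .Lsameoffset_*
 *		else
 *			return diffoffset_cmp(s1, s2, n);	// .Ldiffoffset_*
 *	}
 *
 * Short lengths (n <= 7) are handled byte by byte in .Lshort regardless of
 * the offsets.
 */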
_GLOBAL_TOC(memcmp)
	cmpdi	cr1,r5,0

	/* Record whether the src/dst addresses have the same offset from an
	 * 8-byte alignment boundary (the result is tested at .Lno_short).
	 */
	xor	r6,r3,r4
	andi.	r6,r6,7

	/* Fall back to the short loop if fewer than 8 bytes are to be
	 * compared.
	 */
	cmpdi	cr6,r5,7

	beq	cr1,.Lzero
	bgt	cr6,.Lno_short

.Lshort:
	mtctr	r5
1:	lbz	rA,0(r3)
	lbz	rB,0(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,1(r3)
	lbz	rB,1(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,2(r3)
	lbz	rB,2(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,3(r3)
	lbz	rB,3(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero

	addi	r3,r3,4
	addi	r4,r4,4

	bdnz	1b

.Lzero:
	li	r3,0
	blr

.Lno_short:
	dcbt	0,r3
	dcbt	0,r4
	bne	.Ldiffoffset_8bytes_make_align_start


.Lsameoffset_8bytes_make_align_start:
	/* Compare the leading bytes that are not 8-byte aligned so that the
	 * rest of the comparison can run on 8-byte aligned addresses.
	 */
	andi.	r6,r3,7

	/* Try to compare the first double word, which is not 8-byte aligned:
	 * load the first double word at (src & ~7UL) and shift left the
	 * appropriate number of bits before the comparison.
	 */
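	/*
	 * Illustrative C for this step (assumptions: off = src & 7, and
	 * load64() is a stand-in for what the LD macro produces, i.e. the
	 * 8 bytes viewed in big-endian order in a register):
	 *
	 *	a = load64(s1 & ~7UL) << (off * 8);
	 *	b = load64(s2 & ~7UL) << (off * 8);
	 *	if (a != b)
	 *		return a > b ? 1 : -1;
	 *	n -= 8 - off;  s1 += 8 - off;  s2 += 8 - off;
	 *
	 * The left shift throws away the bytes that precede src, so only the
	 * 8 - off bytes the two buffers actually share are compared.
	 */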
	rlwinm	r6,r3,3,26,28
	beq	.Lsameoffset_8bytes_aligned
	clrrdi	r3,r3,3
	clrrdi	r4,r4,3
	LD	rA,0,r3
	LD	rB,0,r4
	sld	rA,rA,r6
	sld	rB,rB,r6
	cmpld	cr0,rA,rB
	srwi	r6,r6,3
	bne	cr0,.LcmpAB_lightweight
	subfic	r6,r6,8
	subf.	r5,r6,r5
	addi	r3,r3,8
	addi	r4,r4,8
	beq	.Lzero

.Lsameoffset_8bytes_aligned:
	/* Now we are 8-byte aligned.  Use the .Llong loop if 32 or more
	 * bytes remain to be compared.
	 */
	cmpdi	cr6,r5,31
	bgt	cr6,.Llong

.Lcmp_lt32bytes:
	/* compare 1 ~ 31 bytes; at least the r3 addr is 8-byte aligned now */
	cmpdi	cr5,r5,7
	srdi	r0,r5,3
	ble	cr5,.Lcmp_rest_lt8bytes

	/* handle 8 ~ 31 bytes */
	clrldi	r5,r5,61
	mtctr	r0
2:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	bdnz	2b

	cmpwi	r5,0
	beq	.Lzero

.Lcmp_rest_lt8bytes:
	/*
	 * Here we have less than 8 bytes to compare. At least s1 is aligned to
	 * 8 bytes, but s2 may not be. We must make sure s2 + 7 doesn't cross a
	 * page boundary, otherwise we might read past the end of the buffer and
	 * trigger a page fault. We use 4K as the conservative minimum page
	 * size. If we detect that case we go to the byte-by-byte loop.
	 *
	 * Otherwise the next double word is loaded from s1 and s2, and shifted
	 * right to compare the appropriate bits.
	 */
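	/*
	 * Illustrative C (assumptions: load64() is a stand-in for the LD
	 * macro giving the big-endian view of 8 bytes, byte_by_byte stands
	 * for the .Lshort loop, 4K minimum page size as described above):
	 *
	 *	if (((unsigned long)s2 & 0xfff) > 0xff8)
	 *		goto byte_by_byte;		// s2 + 7 may cross a page
	 *	shift = (8 - n) * 8;
	 *	a = load64(s1) >> shift;		// keep only the first n bytes
	 *	b = load64(s2) >> shift;
	 *	if (a != b)
	 *		return a > b ? 1 : -1;
	 *	return 0;
	 */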
	clrldi	r6,r4,(64-12)	// r6 = r4 & 0xfff
	cmpdi	r6,0xff8
	bgt	.Lshort

	subfic	r6,r5,8
	slwi	r6,r6,3
	LD	rA,0,r3
	LD	rB,0,r4
	srd	rA,rA,r6
	srd	rB,rB,r6
	cmpld	cr0,rA,rB
	bne	cr0,.LcmpAB_lightweight
	b	.Lzero

.Lnon_zero:
	mr	r3,rC
	blr

.Llong:
#ifdef CONFIG_ALTIVEC
BEGIN_FTR_SECTION
	/* Try to use the vmx loop if the length is 4K or more */
	cmpldi	cr6,r5,VMX_THRESH
	bge	cr6,.Lsameoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

.Llong_novmx_cmp:
#endif
	/* At least the s1 addr is 8-byte aligned */
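	/*
	 * The loop below compares 32 bytes per iteration and is software
	 * pipelined: the loads for the next iteration are issued while the
	 * four cmpld results of the previous one are still pending in
	 * cr0/cr1/cr6/cr7 (rA/rB, rC/rD, rE/rF, rG/rH respectively), and
	 * each result is only tested one iteration later.  .Lfirst32 and
	 * .Lsecond32 drain the comparisons still in flight when the counter
	 * runs out early.
	 */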
	li	off8,8
	li	off16,16
	li	off24,24

	std	r31,-8(r1)
	std	r30,-16(r1)
	std	r29,-24(r1)
	std	r28,-32(r1)
	std	r27,-40(r1)

	srdi	r0,r5,5
	mtctr	r0
	andi.	r5,r5,31

	LD	rA,0,r3
	LD	rB,0,r4

	LD	rC,off8,r3
	LD	rD,off8,r4

	LD	rE,off16,r3
	LD	rF,off16,r4

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB

	addi	r3,r3,32
	addi	r4,r4,32

	bdz	.Lfirst32

	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr1,rC,rD

	LD	rC,off8,r3
	LD	rD,off8,r4
	cmpld	cr6,rE,rF

	LD	rE,off16,r3
	LD	rF,off16,r4
	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB
	bne	cr1,.LcmpCD

	addi	r3,r3,32
	addi	r4,r4,32

	bdz	.Lsecond32

	.balign	16

1:	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr1,rC,rD
	bne	cr6,.LcmpEF

	LD	rC,off8,r3
	LD	rD,off8,r4
	cmpld	cr6,rE,rF
	bne	cr7,.LcmpGH

	LD	rE,off16,r3
	LD	rF,off16,r4
	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB
	bne	cr1,.LcmpCD

	addi	r3,r3,32
	addi	r4,r4,32

	bdnz	1b

.Lsecond32:
	cmpld	cr1,rC,rD
	bne	cr6,.LcmpEF

	cmpld	cr6,rE,rF
	bne	cr7,.LcmpGH

	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	bne	cr1,.LcmpCD
	bne	cr6,.LcmpEF
	bne	cr7,.LcmpGH

.Ltail:
	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)

	cmpdi	r5,0
	beq	.Lzero
	b	.Lshort

.Lfirst32:
	cmpld	cr1,rC,rD
	cmpld	cr6,rE,rF
	cmpld	cr7,rG,rH

	bne	cr0,.LcmpAB
	bne	cr1,.LcmpCD
	bne	cr6,.LcmpEF
	bne	cr7,.LcmpGH

	b	.Ltail

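/*
 * Exit helpers for the .Llong path: the CR field set for the pair that
 * differed (cr0 for rA/rB, cr1 for rC/rD, cr6 for rE/rF, cr7 for rG/rH)
 * still holds the unsigned compare result, so return 1 if the s1 chunk was
 * greater, -1 otherwise.  .Lout restores the non-volatile GPRs saved at the
 * top of .Llong; .LcmpAB_lightweight is the variant used on paths that
 * never saved them.
 */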
.LcmpAB:
	li	r3,1
	bgt	cr0,.Lout
	li	r3,-1
	b	.Lout

.LcmpCD:
	li	r3,1
	bgt	cr1,.Lout
	li	r3,-1
	b	.Lout

.LcmpEF:
	li	r3,1
	bgt	cr6,.Lout
	li	r3,-1
	b	.Lout

.LcmpGH:
	li	r3,1
	bgt	cr7,.Lout
	li	r3,-1

.Lout:
	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)
	blr

.LcmpAB_lightweight:	/* skip NV GPRS restore */
	li	r3,1
	bgtlr
	li	r3,-1
	blr

#ifdef CONFIG_ALTIVEC
.Lsameoffset_vmx_cmp:
	/* Enter with the src/dst addrs having the same offset from an 8-byte
	 * alignment boundary.
	 *
	 * There is an optimization based on the following fact: memcmp()
	 * tends to fail early, within the first 32 bytes.
	 * Before using VMX instructions (which incur the load/restore penalty
	 * for the 32 x 128-bit VMX registers), compare the first 32 bytes so
	 * that the ~80% of calls that fail early are caught cheaply.
	 */
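	/*
	 * Illustrative C for the pre-check (load64() is a stand-in for the
	 * LD macro, i.e. the 8 bytes viewed big-endian):
	 *
	 *	for (i = 0; i < 4; i++) {
	 *		a = load64(s1); b = load64(s2);
	 *		s1 += 8; s2 += 8; n -= 8;
	 *		if (a != b)
	 *			return a > b ? 1 : -1;
	 *	}
	 *	// only now pay the cost of enabling VMX
	 */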

	li	r0,4
	mtctr	r0
.Lsameoffset_prechk_32B_loop:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	addi	r5,r5,-8
	bdnz	.Lsameoffset_prechk_32B_loop

	ENTER_VMX_OPS
	beq	cr1,.Llong_novmx_cmp

3:
	/* Need to check whether r4 has the same offset as r3 from a 16-byte
	 * boundary.
	 */
	xor	r0,r3,r4
	andi.	r0,r0,0xf
	bne	.Ldiffoffset_vmx_cmp_start

	/* len is at least 4KB. Align further to a 16-byte boundary.
	 */
	andi.	rA,r3,8
	LD	rA,0,r3
	beq	4f
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	addi	r5,r5,-8

	beq	cr0,4f
	/* save and restore cr0 across EXIT_VMX_OPS */
	mfocrf	r5,128
	EXIT_VMX_OPS
	mtocrf	128,r5
	b	.LcmpAB_lightweight

4:
	/* compare 32 bytes per loop iteration */
	srdi	r0,r5,5
	mtctr	r0
	clrldi	r5,r5,59
	li	off16,16

	.balign	16
5:
	lvx	v0,0,r3
	lvx	v1,0,r4
	VCMPEQUD_RC(v0,v0,v1)
	bnl	cr6,7f
	lvx	v0,off16,r3
	lvx	v1,off16,r4
	VCMPEQUD_RC(v0,v0,v1)
	bnl	cr6,6f
	addi	r3,r3,32
	addi	r4,r4,32
	bdnz	5b

	EXIT_VMX_OPS
	cmpdi	r5,0
	beq	.Lzero
	b	.Lcmp_lt32bytes

6:
	addi	r3,r3,16
	addi	r4,r4,16

7:
	/* find the difference within the last 16 bytes */
	EXIT_VMX_OPS
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	li	off8,8
	bne	cr0,.LcmpAB_lightweight

	LD	rA,off8,r3
	LD	rB,off8,r4
	cmpld	cr0,rA,rB
	bne	cr0,.LcmpAB_lightweight
	b	.Lzero
#endif

.Ldiffoffset_8bytes_make_align_start:
	/* now try to align s1 to an 8-byte boundary */
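	/*
	 * Illustrative C (assumptions: off = s1 & 7, load64() stands in for
	 * the LD macro, load64u() for its unaligned use on s2):
	 *
	 *	a = load64(s1 & ~7UL);
	 *	a = (a << (off * 8)) >> (off * 8);	// clear the bytes before s1
	 *	b = load64u(s2) >> (off * 8);		// first 8 - off bytes of s2
	 *	if (a != b)
	 *		return a > b ? 1 : -1;
	 *	n -= 8 - off;  s1 += 8 - off;  s2 += 8 - off;
	 */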
	rlwinm	r6,r3,3,26,28
	beq	.Ldiffoffset_align_s1_8bytes

	clrrdi	r3,r3,3
	LD	rA,0,r3
	LD	rB,0,r4	/* unaligned load */
	sld	rA,rA,r6
	srd	rA,rA,r6
	srd	rB,rB,r6
	cmpld	cr0,rA,rB
	srwi	r6,r6,3
	bne	cr0,.LcmpAB_lightweight

	subfic	r6,r6,8
	subf.	r5,r6,r5
	addi	r3,r3,8
	add	r4,r4,r6

	beq	.Lzero

.Ldiffoffset_align_s1_8bytes:
	/* now s1 is aligned to an 8-byte boundary. */
#ifdef CONFIG_ALTIVEC
BEGIN_FTR_SECTION
	/* only do vmx ops when the size is 4K bytes or more */
	cmpdi	cr5,r5,VMX_THRESH
	bge	cr5,.Ldiffoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

.Ldiffoffset_novmx_cmp:
#endif


	cmpdi	cr5,r5,31
	ble	cr5,.Lcmp_lt32bytes

#ifdef CONFIG_ALTIVEC
	b	.Llong_novmx_cmp
#else
	b	.Llong
#endif

#ifdef CONFIG_ALTIVEC
.Ldiffoffset_vmx_cmp:
	/* Perform a 32-byte pre-check before enabling VMX operations, as in
	 * the sameoffset path above.
	 */
	li	r0,4
	mtctr	r0
.Ldiffoffset_prechk_32B_loop:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	addi	r5,r5,-8
	bdnz	.Ldiffoffset_prechk_32B_loop

	ENTER_VMX_OPS
	beq	cr1,.Ldiffoffset_novmx_cmp

.Ldiffoffset_vmx_cmp_start:
	/* First try to align r3 to a 16-byte boundary */
	andi.	r6,r3,0xf
	li	off16,16
	beq	.Ldiffoffset_vmx_s1_16bytes_align
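	/*
	 * s1 (and s2) are not 16-byte aligned here.  LVS builds a permute
	 * control vector from the low bits of each address, each lvx fetches
	 * the aligned quadword containing the current position of its buffer,
	 * and LD_VSR_CROSS16B loads the following quadword and combines the
	 * pair into the 16 bytes that actually start at the unaligned address
	 * (see the diagram above the macro definition).  Those 16 bytes are
	 * compared, then both pointers advance so that s1 reaches its next
	 * 16-byte boundary.
	 */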

	LVS	v3,0,r3
	LVS	v4,0,r4

	lvx	v5,0,r3
	lvx	v6,0,r4
	LD_VSR_CROSS16B(r3,v3,v5,v7,v9)
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)

	VCMPEQUB_RC(v7,v9,v10)
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	subfic	r6,r6,16
	subf	r5,r6,r5
	add	r3,r3,r6
	add	r4,r4,r6

.Ldiffoffset_vmx_s1_16bytes_align:
	/* now s1 is aligned to a 16-byte boundary */
	lvx	v6,0,r4
	LVS	v4,0,r4
	srdi	r6,r5,5	/* loop for 32 bytes each */
	clrldi	r5,r5,59
	mtctr	r6

	.balign	16
.Ldiffoffset_vmx_32bytesloop:
	/* the first qw of r4 was saved in v6 */
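	/* Each step permutes the 16 bytes starting at the unaligned r4 out of
	 * two aligned quadwords.  The second quadword (v8) is copied into v6
	 * after each LD_VSR_CROSS16B so it can serve as the first quadword of
	 * the next step, i.e. every quadword of r4 is loaded only once.
	 */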
	lvx	v9,0,r3
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
	VCMPEQUB_RC(v7,v9,v10)
	vor	v6,v8,v8
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	addi	r3,r3,16
	addi	r4,r4,16

	lvx	v9,0,r3
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
	VCMPEQUB_RC(v7,v9,v10)
	vor	v6,v8,v8
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	addi	r3,r3,16
	addi	r4,r4,16

	bdnz	.Ldiffoffset_vmx_32bytesloop

	EXIT_VMX_OPS

	cmpdi	r5,0
	beq	.Lzero
	b	.Lcmp_lt32bytes

.Ldiffoffset_vmx_diff_found:
	EXIT_VMX_OPS
	/* either way, the difference lies within the next 16 bytes */
	li	r5,16
	b	.Lcmp_lt32bytes

#endif
EXPORT_SYMBOL(memcmp)