/* SPDX-License-Identifier: GPL-2.0-only */
/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>
#include <asm/errno.h>
#include <asm/cpufeatures.h>
#include <asm/alternative-asm.h>
#include <asm/export.h>

.pushsection .noinstr.text, "ax"

/*
 * We build a jump to memcpy_orig by default, which gets NOPped out on
 * the majority of x86 CPUs (those that set REP_GOOD). On CPUs which
 * also have the enhanced REP MOVSB/STOSB feature (ERMS), those NOPs
 * are patched into a jmp to memcpy_erms, which does the copy with
 * REP; MOVSB.
 */
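
/*
 * Dispatch summary (a rough sketch of the three outcomes, not the
 * literal patched bytes):
 *   default              -> jmp memcpy_orig
 *   X86_FEATURE_REP_GOOD -> fall through to the REP MOVSQ path below
 *   X86_FEATURE_ERMS     -> jmp memcpy_erms
 */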

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 * rax original destination
 */
SYM_FUNC_START_ALIAS(__memcpy)
SYM_FUNC_START_WEAK(memcpy)
	ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
		      "jmp memcpy_erms", X86_FEATURE_ERMS
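
	/*
	 * REP_GOOD path (the empty alternative above falls through to here):
	 * copy the bulk in 8-byte words with REP MOVSQ, then the remaining
	 * 0-7 bytes with REP MOVSB. %rax is loaded first because memcpy
	 * returns the original destination.
	 */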
	movq %rdi, %rax
	movq %rdx, %rcx
	shrq $3, %rcx
	andl $7, %edx
	rep movsq
	movl %edx, %ecx
	rep movsb
	ret
SYM_FUNC_END(memcpy)
SYM_FUNC_END_ALIAS(__memcpy)
EXPORT_SYMBOL(memcpy)
EXPORT_SYMBOL(__memcpy)

/*
 * memcpy_erms() - enhanced fast string memcpy. This is faster and
 * simpler than memcpy. Use memcpy_erms when possible.
 */
SYM_FUNC_START_LOCAL(memcpy_erms)
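	/*
	 * On ERMS hardware the microcode handles alignment and chunking for
	 * REP MOVSB internally, so a bare REP MOVSB is the entire copy; only
	 * the return value (the original destination) needs to be set up.
	 */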
	movq %rdi, %rax
	movq %rdx, %rcx
	rep movsb
	ret
SYM_FUNC_END(memcpy_erms)

SYM_FUNC_START_LOCAL(memcpy_orig)
	movq %rdi, %rax

	cmpq $0x20, %rdx
	jb .Lhandle_tail

	/*
	 * Check whether a memory false dependence could occur: compare the
	 * low bytes of the destination (%dil) and source (%sil) addresses.
	 * If the source lies below the destination in those low bits, a
	 * forward copy risks loads falsely depending on earlier stores that
	 * alias them, so take the backward copy path instead.
	 */
	cmp %dil, %sil
	jl .Lcopy_backward
	subq $0x20, %rdx
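	/*
	 * The count is now biased down by 0x20: together with the subq at
	 * the top of the loop, the jae at the bottom keeps iterating exactly
	 * while another full 32-byte block remains, and the addl after the
	 * loop restores the 0-31 byte remainder into %edx.
	 */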
.Lcopy_forward_loop:
	subq $0x20, %rdx

	/*
	 * Move in blocks of 4x8 bytes:
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq 2*8(%rsi), %r10
	movq 3*8(%rsi), %r11
	leaq 4*8(%rsi), %rsi

	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, 2*8(%rdi)
	movq %r11, 3*8(%rdi)
	leaq 4*8(%rdi), %rdi
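	/* Flags still reflect the subq above: mov and lea do not modify them. */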
	jae .Lcopy_forward_loop
	addl $0x20, %edx
	jmp .Lhandle_tail

.Lcopy_backward:
	/*
	 * Calculate copy position to tail.
	 */
	addq %rdx, %rsi
	addq %rdx, %rdi
	subq $0x20, %rdx
	/*
	 * At most 3 of these ALU operations execute in one cycle, so pad
	 * with NOPs to keep the loop start within the same 16-byte chunk.
	 */
	.p2align 4
.Lcopy_backward_loop:
	subq $0x20, %rdx
	movq -1*8(%rsi), %r8
	movq -2*8(%rsi), %r9
	movq -3*8(%rsi), %r10
	movq -4*8(%rsi), %r11
	leaq -4*8(%rsi), %rsi
	movq %r8, -1*8(%rdi)
	movq %r9, -2*8(%rdi)
	movq %r10, -3*8(%rdi)
	movq %r11, -4*8(%rdi)
	leaq -4*8(%rdi), %rdi
	jae .Lcopy_backward_loop

	/*
	 * Calculate copy position to head.
	 */
	addl $0x20, %edx
	subq %rdx, %rsi
	subq %rdx, %rdi
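	/*
	 * At this point 0 to 31 bytes remain. Each case below finishes with
	 * a few (possibly overlapping) loads and stores, so no loop is
	 * needed for the tail.
	 */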
.Lhandle_tail:
	cmpl $16, %edx
	jb .Lless_16bytes

	/*
	 * Move data from 16 bytes to 31 bytes.
	 */
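	/*
	 * Copy the first 16 bytes from the head and the last 16 bytes from
	 * the tail. All loads happen before any store, and where the two
	 * ranges overlap they carry identical source bytes, so the double
	 * store is harmless.
	 */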
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq -2*8(%rsi, %rdx), %r10
	movq -1*8(%rsi, %rdx), %r11
	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, -2*8(%rdi, %rdx)
	movq %r11, -1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_16bytes:
	cmpl $8, %edx
	jb .Lless_8bytes
	/*
	 * Move data from 8 bytes to 15 bytes.
	 */
	movq 0*8(%rsi), %r8
	movq -1*8(%rsi, %rdx), %r9
	movq %r8, 0*8(%rdi)
	movq %r9, -1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_8bytes:
	cmpl $4, %edx
	jb .Lless_3bytes

	/*
	 * Move data from 4 bytes to 7 bytes.
	 */
	movl (%rsi), %ecx
	movl -4(%rsi, %rdx), %r8d
	movl %ecx, (%rdi)
	movl %r8d, -4(%rdi, %rdx)
	retq
	.p2align 4
.Lless_3bytes:
	subl $1, %edx
	jb .Lend
	/*
	 * Move data from 1 byte to 3 bytes.
	 */
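	/*
	 * The flags from the subl above survive the movzbl below (movzx does
	 * not modify them), so the jz distinguishes an original length of 1
	 * from 2 or 3.
	 */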
	movzbl (%rsi), %ecx
	jz .Lstore_1byte
	movzbq 1(%rsi), %r8
	movzbq (%rsi, %rdx), %r9
	movb %r8b, 1(%rdi)
	movb %r9b, (%rdi, %rdx)
.Lstore_1byte:
	movb %cl, (%rdi)

.Lend:
	retq
SYM_FUNC_END(memcpy_orig)

.popsection