/*
* Copyright (c) 2012 Apple Computer, Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
* This file contains Original Code and/or Modifications of Original Code
* as defined in and that are subject to the Apple Public Source License
* Version 2.0 (the 'License'). You may not use this file except in
* compliance with the License. The rights granted to you under the License
* may not be used to create, or enable the creation or redistribution of,
* unlawful or unlicensed copies of an Apple operating system, or to
* circumvent, violate, or enable the circumvention or violation of, any
* terms of an Apple operating system software license agreement.
*
* Please obtain a copy of the License at
* http://www.opensource.apple.com/apsl/ and read it before using this file.
*
* The Original Code and all software distributed under the License are
* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
* Please see the License for the specific language governing rights and
* limitations under the License.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
*
* This file implements strcpy( ) for the x86_64 architecture.
*/
// Give _strcpy external linkage so that other object files can reference it.
.globl _strcpy
/*****************************************************************************
* Macros *
*****************************************************************************/
// EstablishFrame: function prologue.
//   Saves the caller's frame pointer on the stack and makes rbp point at the
//   saved value. Pushes exactly 8 bytes; modifies rsp and rbp only.
.macro EstablishFrame
    pushq     %rbp                      // save the caller's frame pointer
    movq      %rsp, %rbp                // rbp = address of the saved rbp
.endm
// ClearFrameAndReturn: function epilogue matching EstablishFrame.
//   Restores the caller's rbp and returns. Only valid while rsp still points
//   at the rbp value that EstablishFrame pushed (i.e. the function has not
//   left anything else on the stack).
.macro ClearFrameAndReturn
    popq      %rbp                      // restore the caller's frame pointer
    retq
.endm
/*****************************************************************************
* Entrypoint *
*****************************************************************************/
// char *strcpy(char * restrict d, const char * restrict s)
//
// Copies the NUL-terminated string s, terminator included, to d and
// returns d.
//
//   On entry:  rdi = d, rsi = s
//   On exit:   rax = d
//   Modifies:  rax, rcx, rdx, rsi, rdi, xmm0, xmm1, flags, and (short-string
//              path only) whatever _memcpy modifies.
//
// The source is scanned for NUL with pcmpeqb, one 16-byte aligned block at a
// time. An aligned block may extend past the terminator, but because it is
// aligned it never straddles a page boundary (or even a cacheline), so the
// over-read cannot fault on a page the string does not already occupy.
    .text
    .p2align 5                          // 32-byte alignment (same as Mach-O ".align 5")
_strcpy:
    EstablishFrame
    movq      %rsi, %rcx                // keep a copy of s; reduced to s & 15 below

    // Examine the aligned 16-byte block that contains the first byte of s.
    // After pmovmskb, bit i of eax is 1 iff byte i of that block is NUL.
    andq      $-16, %rsi                // rsi = s rounded down to 16 bytes
    pxor      %xmm0, %xmm0
    pcmpeqb   (%rsi), %xmm0
    pmovmskb  %xmm0, %eax

    // If s is not 16-byte aligned, the low (s & 15) bits of eax describe
    // bytes that come before the string. Mask them off so that only NULs at
    // or after s are considered. Any surviving bit means the terminator is
    // in this first block, which is too short for the vector copy.
    andq      $0xf, %rcx                // rcx = s & 15 (cl is the shift count)
    orq       $-1, %rdx                 // rdx = all ones
    shlq      %cl, %rdx                 // zero the bits for bytes preceding s
    andl      %edx, %eax
    jnz       L_strcpyGPR

    // Examine the next aligned block. If it holds no NUL either, the string
    // is more than 16 bytes long, so a single unaligned 16-byte copy can
    // take care of the unaligned head and the vector path is safe to use.
    // xmm1 keeps the raw block so the vector loop can store it later.
    movdqa    16(%rsi), %xmm1
    pxor      %xmm0, %xmm0
    pcmpeqb   %xmm1, %xmm0
    pmovmskb  %xmm0, %edx
    testl     %edx, %edx
    jz        L_strcpySSE

/*****************************************************************************
 *  Short-string path                                                        *
 *****************************************************************************/

    // The terminator is somewhere in the 32 aligned bytes that contain the
    // start of s. On the fall-through path eax is zero here (the masked
    // first-block test found nothing), so this builds a 32-bit NUL bitmap
    // whose upper half describes the second block.
    shll      $16, %edx
    orl       %edx, %eax
L_strcpyGPR:
    // Shift out the bits for bytes preceding s; the lowest set bit is then
    // the index of the terminator relative to s.
    shrl      %cl, %eax
    bsfl      %eax, %edx                // edx = strlen(s)

    // Rebuild the original source pointer and hand the copy, terminator
    // included, to memcpy(d, s, strlen(s) + 1). rax is loaded with d for the
    // return value; memcpy also returns its destination argument, so rax
    // holds d after the call either way. The push in EstablishFrame is the
    // only stack adjustment made before this call.
    addq      %rcx, %rsi                // rsi = s
    movq      %rdi, %rax                // rax = d
    addq      $1, %rdx                  // count the trailing NUL
    call      _memcpy
    ClearFrameAndReturn

/*
//  Disabled (not assembled): an open-coded copy loop kept for reference. It
//  appears to be an in-line replacement for the add/call sequence above and
//  expects rdx to hold the index of the NUL, i.e. without the +1.
//
//  At this point we simply need to copy rdx + 1 bytes from rsi to rdi.  If
//  the length is >= 8, start by doing a word-by-word copy
    sub       $7, %rdx                  // 7 instead of 8 to account for NUL
    jb        1f
0:  mov       (%rsi,%rdx), %rcx
    mov       %rcx, (%rdi,%rdx)
    sub       $8, %rdx
    jae       0b
1:  add       $8, %rdx
    jz        3f
2:  movzb     -1(%rsi,%rdx), %rcx
    movb      %cl, -1(%rdi,%rdx)
    sub       $1, %rdx
    jnz       2b
3:  ClearFrameAndReturn
*/

/*****************************************************************************
 *  Vector path                                                              *
 *****************************************************************************/

L_strcpySSE:
    // Head: one unaligned 16-byte copy starting at the true start of the
    // string. rsi was rounded down earlier, so s is rebuilt as rsi + rcx.
    movdqu    (%rsi,%rcx), %xmm0
    movdqu    %xmm0, (%rdi)

    // Save d as the return value, then bias rdi by the source misalignment
    // so that rdi + k is the destination of source byte rsi + k. rcx is then
    // reused as the running byte offset.
    movq      %rdi, %rax                // rax = d
    subq      %rcx, %rdi                // rdi = d - (s & 15)
    xorq      %rcx, %rcx                // offset = 0

    // Main loop. Invariant at the top: xmm1 holds the aligned source block
    // at 16(rsi,rcx) and is known to contain no NUL. Store it, advance, then
    // load the next block and test it for NUL.
L_strcpySSE_loop:
    movdqu    %xmm1, 16(%rdi,%rcx)
    addq      $16, %rcx
    movdqa    16(%rsi,%rcx), %xmm1
    pxor      %xmm0, %xmm0
    pcmpeqb   %xmm1, %xmm0
    pmovmskb  %xmm0, %edx
    testl     %edx, %edx
    jz        L_strcpySSE_loop

    // Tail: the block just loaded contains the terminator; the set bits of
    // edx mark its NUL bytes. bsf gives the index of the first one, making
    // 16(rsi,rcx) the address of the terminator once that index is added to
    // rcx. Finish with one unaligned 16-byte copy that ends exactly on the
    // terminator; it may rewrite bytes that were already stored.
    bsfl      %edx, %edx
    addq      %rdx, %rcx
    movdqu    1(%rsi,%rcx), %xmm0       // offset 1: the 16 bytes end on the NUL,
    movdqu    %xmm0, 1(%rdi,%rcx)       // so the terminator is copied as well
    ClearFrameAndReturn