// strcpy.s


/*
 * Copyright (c) 2012 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 * 
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 * 
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 * 
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 * 
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 *
 * This file implements strcpy( ) for the x86_64 architecture.
 */

.globl _strcpy

/*****************************************************************************
 *  Macros                                                                   *
 *****************************************************************************/

//	EstablishFrame: save the caller's rbp on the stack, then point rbp at the
//	slot that was just pushed.  Pair with ClearFrameAndReturn.
.macro EstablishFrame
	pushq     %rbp
	movq      %rsp,      %rbp
.endm

//	ClearFrameAndReturn: reload rbp from the top of the stack and return.
//	rsp must be back at the value it had right after EstablishFrame.
.macro ClearFrameAndReturn
	popq      %rbp
	ret
.endm

/*****************************************************************************
 *  Entrypoint                                                               *
 *****************************************************************************/

.text
.align 5
_strcpy:
//	char *strcpy(char * restrict d, const char * restrict s);
//
//	Copies the string s to d, and returns d.  We look for NUL bytes using
//	pcmpeqb on 16-byte aligned blocks.  Although this may read past the
//	end of the string, because every load used for scanning is aligned, it
//	will never read past the end of the string across a page boundary, or
//	even across a cacheline.
//
//	Register use, as established by the code below:
//	  on entry   rdi = d, rsi = s
//	  rsi        s rounded down to a 16-byte boundary
//	  rcx        s & 15 during setup and on the short-string path, then the
//	             running block offset inside the SSE loop
//	  eax, edx   NUL bitmaps produced by pmovmskb
//	  xmm0       scratch for comparisons and for the unaligned edge copies
//	  xmm1       most recently loaded aligned source block
//	  on exit    rax = d (see the note at the _memcpy call for the
//	             short-string path)
	EstablishFrame
//	Keep a copy of s in rcx, since rsi is about to be rounded down.
	mov       %rsi,	     %rcx

//	Load the 16-byte block containing the first byte of the string, and
//	compare each byte to zero.  If any NUL bytes are present in this
//	block, the corresponding *bit* in eax will be set to 1.
	and       $-16,      %rsi
	pxor      %xmm0,     %xmm0
	pcmpeqb  (%rsi),     %xmm0
	pmovmskb  %xmm0,     %eax

//	The 16 bytes that we checked for NUL included some bytes preceding
//	the start of the string, if s is not 16-byte aligned.  We create a
//	mask based on the alignment of s which covers only those bits
//	corresponding to bytes that do not precede s, and check for NULs
//	only in those bits.  If we find one, the string is too small to use
//	a vector copy, so jump to the dedicated short-string path.
//
//	rcx becomes s & 15, the offset of s within its block.  rdx = -1 << cl has
//	its low cl bits clear, and those are exactly the bits for bytes before s.
	and       $0xf,      %rcx
	or        $-1,       %rdx
	shl       %cl,       %rdx
	and       %edx,      %eax
	jnz       L_strcpyGPR

//	Check the next 16-byte block for NUL.  If none are found, that guarantees
//	that the string is at least 16 bytes long, which means that we can use a
//	single unaligned vector copy to handle any edging at the start of the
//	string.  If instead a NUL is found, fall through to the short-string
//	path below.  The block stays in xmm1 because the SSE loop stores it on
//	its first iteration.
	movdqa 16(%rsi),     %xmm1
	pxor      %xmm0,     %xmm0
	pcmpeqb   %xmm1,     %xmm0
	pmovmskb  %xmm0,     %edx
	test      %edx,      %edx
	jz        L_strcpySSE

/*****************************************************************************
 *  GPR copy implementation                                                  *
 *****************************************************************************/

//	There is at least one NUL in the 32 aligned bytes containing the start
//	of the string being copied.  We assemble a bitmap for those 32 bytes from
//	eax and edx, then shift it right by cl to throw out any bits preceding
//	the start of the string.  We can then identify the position of the
//	first NUL byte using BSF.
//
//	On this fall-through route eax is zero (the jnz above was not taken), so
//	after the or the bitmap holds only the second block, in bits 16 to 31.
	shl       $16,       %edx
	or        %edx,      %eax
L_strcpyGPR:
//	Also entered directly from the first-block check, with eax holding the
//	masked first-block bitmap.  eax is nonzero on both routes.  After the
//	shift, bit i of eax corresponds to byte i of s, so BSF leaves the index
//	of the terminating NUL (the string length) in edx.
	shr       %cl,       %eax
	bsf       %eax,      %edx
//	Restore the original source pointer, and copy the destination pointer
//	to rax.
	add       %rcx,      %rsi
	mov       %rdi,      %rax
//	rdx = length + 1 so that the NUL is copied as well.  The call below is
//	then _memcpy(rdi = d, rsi = s, rdx = length + 1).
	add       $1,        %rdx
//	NOTE(review): rax is not preserved across calls in the SysV ABI, so the
//	value returned from this path is whatever _memcpy leaves in rax.  That is
//	d provided _memcpy returns its destination argument, as C memcpy does.
//	Confirm for the _memcpy this file is linked against.
	call      _memcpy
	ClearFrameAndReturn
//	The block comment below preserves a disabled inline copy loop.  The live
//	code above calls _memcpy instead.
/*
//	At this point we simply need to copy rdx + 1 bytes from rsi to rdi.  If
//	the length is >= 8, start by doing a word-by-word copy; otherwise, use
//	a byte-by-byte copy loop.
	sub       $7,        %rdx // 7 instead of 8 to account for NUL
	jb        1f
0:	mov      (%rsi,%rdx),%rcx
	mov       %rcx,     (%rdi,%rdx)
	sub       $8,        %rdx
	jae       0b
1:	add       $8,        %rdx
	jz        3f
2:	movzb  -1(%rsi,%rdx),%rcx
	movb      %cl,    -1(%rdi,%rdx)
	sub       $1,        %rdx
	jnz       2b
3:	ClearFrameAndReturn
 */

/*****************************************************************************
 *  SSE copy implementation                                                  *
 *****************************************************************************/

L_strcpySSE:
//	Begin by doing a single unaligned vector copy for edging.  We no longer
//	have the original source pointer, but we can reconstruct it as rsi + rcx.
	movdqu   (%rsi,%rcx),%xmm0
	movdqu    %xmm0,    (%rdi)
//	Next copy the original destination pointer to rax so that it is returned
//	on exit, and adjust the destination pointer to correspond to rsi.
//	Subtracting s & 15 from rdi makes equal offsets from rsi and rdi address
//	corresponding source and destination bytes.  rcx is then reused as the
//	loop offset, starting at zero.
	mov       %rdi,      %rax
	sub       %rcx,      %rdi
	xor       %rcx,      %rcx
//	Main copy loop: store the 16 bytes loaded in the previous iteration of the
//	loop, as they are already known to not contain a NUL.  Then load the next
//	16 bytes and check for NUL.  On the first pass xmm1 holds the block that
//	was loaded from 16(rsi) before branching here.  Loads are aligned
//	(movdqa), stores are unaligned (movdqu).
0:	movdqu    %xmm1,  16(%rdi,%rcx)
	add       $16,       %rcx
	movdqa 16(%rsi,%rcx),%xmm1
	pxor      %xmm0,     %xmm0
	pcmpeqb   %xmm1,     %xmm0
	pmovmskb  %xmm0,     %edx
	test      %edx,      %edx
	jz        0b

//	Cleanup: at least one of the bytes in the last 16 that were loaded was
//	NUL.  The corresponding bits of edx are set, and all other bits are zero.
//	Thus, we can use BSF to find the position of the first NUL.  Once we have
//	this information, we use an unaligned copy that runs precisely up to this
//	position to handle edging.
//
//	After the add, the first NUL is at 16(rsi,rcx), so the 16 bytes starting
//	at 1(rsi,rcx) end exactly on that NUL.  This window may overlap bytes
//	that were already copied, which are simply rewritten with the same values.
	bsf       %edx,      %edx
	add       %rdx,      %rcx
	movdqu  1(%rsi,%rcx),%xmm0      // offset is 1 so that we copy the trailing
	movdqu    %xmm0,   1(%rdi,%rcx) // NUL byte as well.
	ClearFrameAndReturn