/*
* Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
*
* @APPLE_LICENSE_HEADER_START@
*
* This file contains Original Code and/or Modifications of Original Code
* as defined in and that are subject to the Apple Public Source License
* Version 2.0 (the 'License'). You may not use this file except in
* compliance with the License. Please obtain a copy of the License at
* http://www.opensource.apple.com/apsl/ and read it before using this
* file.
*
* The Original Code and all software distributed under the License are
* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
* Please see the License for the specific language governing rights and
* limitations under the License.
*
* @APPLE_LICENSE_HEADER_END@
*/
#include <machine/cpu_capabilities.h>
// *****************
// * S T R N C P Y *
// *****************
//
// char *strncpy(char *dst, const char *src, size_t n)
//
// We optimize the move by doing it vector parallel. This introduces
// a complication: if we blindly did vector load/stores until finding
// a 0, we might get a spurious page fault by touching bytes past it.
// To avoid this, we never do a load that crosses a page boundary,
// and never store a byte we don't have to.
//
// We align the destination, because unaligned vector stores are slow.
//
// Recall that strncpy() zero fills the remainder of the dest buffer,
// and does not terminate the string if its length is greater than or
// equal to n.
#define kShort  31                      // too short to bother with vector loop

        .text
        .globl  _strncpy
        .align  4

//-----------------------------------------------------------------------
// char *strncpy(char *dst, const char *src, size_t n)
//
// Copies at most n bytes of the 0-terminated string src into dst, then
// zero fills whatever is left of the n-byte buffer.  If no 0 is found in
// the first n source bytes, dst is left unterminated.
//
//   In:    %rdi = dst, %rsi = src, %rdx = n
//   Out:   %rax = dst (as passed in)
//   Uses:  %rax, %rcx, %rdx, %rsi, %rdi, %r8, %xmm0, %xmm1, flags,
//          plus anything bzero() clobbers
//   Stack: 8 bytes, only around the call to bzero()
//
// Register roles, valid at every label below:
//   %rsi = source ptr (unaligned)
//   %rdi = dest ptr
//   %rdx = buffer length remaining
//   %r8  = original dest ptr (the return value)
//-----------------------------------------------------------------------
_strncpy:
        movq    %rdi,%r8                // preserve destination pointer so we can return it
        movl    %edi,%ecx               // copy low 4 bytes of dest ptr
        negl    %ecx
        andl    $15,%ecx                // how many bytes to align dest ptr?
        jnz     LCheckShortCopy         // align destination first

// In order to avoid spurious page faults, we run the vector loop only over
// the 16-byte chunks that lie entirely within the current source page (and
// within the buffer).  When fewer than 16 bytes remain before the page end,
// we revert to the byte loop for 16 bytes so the page is crossed one byte
// at a time, then resume the vector loop.
//      %rsi = source ptr (unaligned)
//      %rdi = dest ptr (aligned)
//      %rdx = buffer length remaining

LNextChunk:                             // NB: can drop down to here
        movl    %esi,%eax               // copy the low 4 bytes of the source ptr
        movl    $4096,%ecx
        andl    $4095,%eax              // get offset into source page
        subl    %eax,%ecx               // get #bytes remaining in source page
        cmpq    %rdx,%rcx               // will buffer run out before the page end?
        cmova   %rdx,%rcx               // get min(length remaining, bytes to page end)
        shrl    $4,%ecx                 // get #chunks till end of page
        jnz     LLoopOverChunks         // enter vector loop

// We can't use the chunk loop yet.  Check for short and empty buffers, then use byte loop.

LCrossPage:                             // if buffer is large enough, cross source page
        movl    $16,%ecx                // move 16 bytes to cross page but keep dest aligned
LCheckShortCopy:                        // we propose to copy %ecx bytes in byte loop
        cmpq    $(kShort),%rdx          // much left?
        ja      LLoopOverBytes          // yes, loop over bytes then more chunks
        movl    %edx,%ecx               // no, use the byte loop for everything
        testl   %edx,%edx               // have we filled buffer?
        jnz     LLoopOverBytes          // no
        jmp     LDone

// Loop over bytes.  The terminating 0, if found, is stored and counted
// before we branch to LZeroBuffer, so %rdi/%rdx already exclude it there.
//      %rsi = source ptr
//      %rdi = dest ptr
//      %rdx = buffer length remaining
//      %rcx = count of bytes to loop over (<= buffer length)

        .align  4,0x90                  // align inner loops to optimize I-fetch
LLoopOverBytes:
        movzb   (%rsi),%eax             // get source byte
        addq    $1,%rsi
        subq    $1,%rdx                 // decrement length
        movb    %al,(%rdi)              // pack into dest
        addq    $1,%rdi
        testl   %eax,%eax               // 0?
        jz      LZeroBuffer             // yes, we're done copying string
        subq    $1,%rcx                 // more to go?
        jnz     LLoopOverBytes
        testq   %rdx,%rdx               // at end of buffer?
        jnz     LNextChunk              // no, xfer chunks
        jmp     LDone                   // yes

// Loop over 16-byte chunks.  A chunk containing a 0 is NOT stored here and
// does not decrement %rdx; LFound0 stores only the bytes preceding the 0.
//      %rsi = source ptr (unaligned)
//      %rdi = dest ptr (aligned)
//      %rdx = buffer length remaining
//      %ecx = chunk count

        .align  4,0x90                  // align inner loops to optimize I-fetch
LLoopOverChunks:
        movdqu  (%rsi),%xmm1            // get source
        pxor    %xmm0,%xmm0             // get some 0s
        addq    $16,%rsi
        pcmpeqb %xmm1,%xmm0             // compare source to 0s
        pmovmskb %xmm0,%eax             // get result mask for 0 check
        testl   %eax,%eax               // any 0s?
        jnz     LFound0                 // yes, exit loop
        movdqa  %xmm1,(%rdi)            // no 0s so do aligned store into destination
        addq    $16,%rdi
        subq    $16,%rdx                // decrement length remaining
        subl    $1,%ecx                 // more to go?
        jnz     LLoopOverChunks
        jmp     LCrossPage              // cross page but keep dest aligned

// Found a zero in the vector.  Figure out where it is, and store the bytes
// up to it.  The byte count (0..15) is decomposed into an optional 8-byte
// store, an optional 4-byte store, and up to three single-byte stores.
// It is possible that we should check to be sure (%rdx >= 16), and
// just do an aligned store of %xmm1 if so.  But if we did, we'd be doing byte
// stores into the same double quadword in bzero(), which might hit a hazard.
// Experimentation needed.
//      %rdi = dest ptr (aligned)
//      %eax = result mask
//      %rdx = buffer length remaining
//      %xmm1 = source vector

LFound0:
        bsf     %eax,%ecx               // find first 0
        subq    %rcx,%rdx               // decrement remaining buffer length
        test    $8,%cl                  // 8-byte store required?
        jz      4f                      // no
        movq    %xmm1,(%rdi)            // pack in 8 low bytes
        psrldq  $8,%xmm1                // then shift vector down 8 bytes
        addq    $8,%rdi
4:
        test    $4,%cl                  // 4-byte store required?
        jz      3f                      // no
        movd    %xmm1,(%rdi)            // pack in 4 low bytes
        psrldq  $4,%xmm1                // then shift vector down 4 bytes
        addq    $4,%rdi
3:
        andl    $3,%ecx                 // more to go?
        jz      LZeroBuffer             // no
        movd    %xmm1,%eax              // move remainders out of vector into %eax
1:                                      // loop on up to three bytes
        movb    %al,(%rdi)              // pack in next byte
        shrl    $8,%eax                 // shift next byte into position
        addq    $1,%rdi
        subl    $1,%ecx
        jnz     1b

// We've copied the string.  Now zero the rest of the buffer, using bzero().
//      %rdi = dest ptr (1st argument)
//      %rdx = buffer length remaining (may be 0)
//      %r8  = original dest ptr

LZeroBuffer:
        movq    %rdx,%rsi               // remaining buffer size (2nd argument)
        pushq   %r8                     // %r8 is caller-saved: keep dst safe across the call;
                                        // the 8-byte push also aligns the stack to 16B
        call    _bzero
        popq    %r8                     // recover dst and restore stack
LDone:
        movq    %r8,%rax                // original dest ptr is return value
        ret