strnlen.s [plain text]

/*
 * Copyright (c) 2012 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 * 
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 * 
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 * 
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 * 
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 *
 * This file implements the following function for the arm64 architecture:
 *
 *  size_t strnlen(const char *string, size_t maxlen);
 *
 * The strnlen function returns either strlen(string) or maxlen, whichever
 * is amller, without reading beyond the first maxlen characters of string.
 */

.globl _strlen
.globl _strnlen

/*****************************************************************************
 *  Macros                                                                   *
 *****************************************************************************/

.macro EstablishFrame
	stp       fp, lr, [sp, #-16]!
	mov       fp,      sp
.endm

.macro ClearFrameAndReturn
	ldp       fp, lr, [sp], #16
	ret
.endm

/*****************************************************************************
 *  Constants                                                                *
 *****************************************************************************/

.text
.align 5
L_masks:
.quad 0x0706050403020100, 0x0f0e0d0c0b0a0908
.quad 0x0000000000000000, 0x0000000000000000

/*****************************************************************************
 *  strnlen entrypoint                                                       *
 *****************************************************************************/

_strnlen:
//	If n == 0, return NULL without loading any data from s.  If n is so large
//	that it exceeds the size of any buffer that can be allocted, jump into a
//	simpler implementation that omits all length checks.  This is both faster
//	and lets us avoid some messy edgecases in the mainline.
	tst       x1,      x1
	b.mi      _strlen
	b.eq      L_maxlenIsZero
	EstablishFrame
//	Load the 16-byte aligned vector containing the start of the string.
	and       x2,      x0, #-16
	ldr       q0,     [x2]
//	Load a vector {0,1,2, ... ,15} for use in finding the index of the NUL
//	byte once we identify one.  We don't use this vector until the very end
//	of the routine; it simply falls out naturally to load it now.
	adr       x3,          L_masks
	ldr       q2,     [x3],#16
//	The aligned vector that we loaded to q0 contains the start of the string,
//	but if the string was not originally aligned, it also contains bytes
//	which preceed the start of the string, and which may cause false positives
//	when we search for the terminating NUL.  We generate a mask to OR into the
//	vector using an unaligned load to prevent this.  The mask has non-zero
//	values only in those bytes which correspond to bytes preceeding the start
//	of the string in the aligned vector load.
	and       x4,      x0, #0xf
	sub       x3,      x3, x4
	ldr       q1,     [x3]
	orr.16b   v0,      v0, v1
//	Adjust maxlen to account for bytes which preceed the start of the string,
//	and jump into the main scanning loop.
	add       x1,      x1, x4
	b         1f

//	Main loop.  Identical to strlen, except that we also need to check that we
//	don't read more than maxlen bytes.  To that end, we decrement maxlen by 16
//	on each iteration, and exit the loop if the result is zero or negative.
.align 4
0:	ldr       q0,     [x2, #16]!
1:  uminv.16b b1,      v0
	fmov      w3,      s1
	cbz       w3,      L_foundNUL
	subs      x1,      x1, #16
	b.hi      0b

//	We exhausted maxlen bytes without finding a terminating NUL character, so
//  we need to return maxlen.
	sub       x0,      x2, x0
	add       x1,      x1, #16
	add       x0,      x0, x1
	ClearFrameAndReturn

L_maxlenIsZero:
	mov       x0,      xzr
	ret                         // No stack frame, so don't clear it.

L_foundNUL:
//	Compute the index of the NUL byte, and check if it occurs before maxlen
//	bytes into the vector.  If not, return maxlen.  Otherwise, return the
//	length of the string.
	eor.16b   v1,      v1, v1
	cmhi.16b  v0,      v0, v1
	orr.16b   v0,      v0, v2
	uminv.16b b1,      v0
	fmov      w3,      s1      // index of NUL byte in vector
	sub       x0,      x2, x0  // index of vector in string
	cmp       x1,      x3      // if NUL occurs before maxlen bytes
	csel      x1,      x1, x3, cc // return strlen, else maxlen
	add       x0,      x0, x1
	ClearFrameAndReturn

/*****************************************************************************
 *  strlen entrypoint                                                        *
 *****************************************************************************/

.align 4
_strlen:
	EstablishFrame
//	Load the 16-byte aligned vector containing the start of the string.
	and       x1,      x0, #-16
	ldr       q0,     [x1]
//	Load a vector {0,1,2, ... ,15} for use in finding the index of the NUL
//	byte once we identify one.  We don't use this vector until the very end
//	of the routine; it simply falls out naturally to load it now.
	adr       x3,          L_masks
	ldr       q2,     [x3],#16
//	The aligned vector that we loaded to q0 contains the start of the string,
//	but if the string was not originally aligned, it also contains bytes
//	which preceed the start of the string, and which may cause false positives
//	when we search for the terminating NUL.  We generate a mask to OR into the
//	vector using an unaligned load to prevent this.  The mask has non-zero
//	values only in those bytes which correspond to bytes preceeding the start
//	of the string in the aligned vector load.
	and       x2,      x0, #0xf
	sub       x3,      x3, x2
	ldr       q1,     [x3]
	orr.16b   v0,      v0, v1
	b         1f

//	Main loop.  On each iteration we do the following:
//
//		q0 <-- next 16 aligned bytes of string
//		b1 <-- unsigned minimum byte in q0
//      if (b1 != 0) continue
//
//	Thus, we continue the loop until the 16 bytes we load contain a zero byte.
.align 4
0:	ldr       q0,     [x1, #16]!
1:	uminv.16b b1,      v0
	fmov      w2,      s1 // umov.b would be more natural, but requries 2 µops.
	cbnz      w2,      0b

//	A zero byte has been found.  The following registers contain values that
//	we need to compute the string's length:
//
//		x0		pointer to start of string
//		x1		pointer to vector containing terminating NUL byte
//		v0		vector containing terminating NUL byte
//		v2      {0, 1, 2, ... , 15}
//
//	We compute the index of the terminating NUL byte in the string (which is
//	precisely the length of the string) as follows:
//
//		vec <-- mask(v0 != 0) | v2
//		index <-- x1 - x0 + unsignedMinimum(vec)
	eor.16b   v1,      v1, v1
	cmhi.16b  v0,      v0, v1
	orr.16b   v0,      v0, v2
	uminv.16b b1,      v0
	fmov      w2,      s1
	sub       x0,      x1, x0
	add       x0,      x0, x2
	ClearFrameAndReturn