/*
* Copyright (c) 2011 Apple Inc. All rights reserved.
*
* @APPLE_LICENSE_HEADER_START@
*
* This file contains Original Code and/or Modifications of Original Code
* as defined in and that are subject to the Apple Public Source License
* Version 2.0 (the 'License'). You may not use this file except in
* compliance with the License. Please obtain a copy of the License at
* http://www.opensource.apple.com/apsl/ and read it before using this
* file.
*
* The Original Code and all software distributed under the License are
* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
* Please see the License for the specific language governing rights and
* limitations under the License.
*
* @APPLE_LICENSE_HEADER_END@
*
* This file implements the following functions for the Swift micro-arch:
*
* void memset_pattern4(void *b, const void *pattern4, size_t len);
* void memset_pattern8(void *b, const void *pattern8, size_t len);
* void memset_pattern16(void *b, const void *pattern16, size_t len);
*
 * The implementation of all three functions is fundamentally the same.
 * memset_pattern4 is extensively commented to explain the algorithm;
 * refer to it if you have any questions about the other two.
*/
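/*
 * For reference, the documented behavior of memset_pattern4 can be
 * sketched in C roughly as follows (an illustration of the semantics,
 * not of the implementation below):
 *
 *     void memset_pattern4(void *b, const void *pattern4, size_t len) {
 *         char *dst = (char *)b;
 *         while (len >= 4) { memcpy(dst, pattern4, 4); dst += 4; len -= 4; }
 *         memcpy(dst, pattern4, len);    // trailing partial pattern, if any
 *     }
 */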
#include <arm/arch.h>
#if defined _ARM_ARCH_7 && !defined VARIANT_DYLD
.syntax unified
.code 32
.text
.globl _memset_pattern4$VARIANT$Swift
.globl _memset_pattern8$VARIANT$Swift
.globl _memset_pattern16$VARIANT$Swift
/******************************************************************************/
.align 4
_memset_pattern4$VARIANT$Swift:
push {r7,lr}
mov r7, sp
// Load the pattern and splat it to q0, then check if the buffer is at least
// 64 bytes long. If not, branch to a short-buffer implementation.
ldr r1, [r1]
vdup.32 q0, r1
subs r3, r2, #64
blo L_short4
// We want to use aligned vector stores to fill the bulk of the buffer. In
// order to make that work, we need to rotate the pattern as necessary to
// match up with aligned locations, and we also need to extract the alignment
// of the destination pointer mod 16.
lsl ip, r0, #3
and lr, r0, #0xf // alignment of destination pointer mod 16
rsb ip, ip, #32 // low five bits contain 32 - 8*(address%4).
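// In C terms, the LSL/RSB pair above computes (an illustrative sketch;
// ror32 is a hypothetical 32-bit rotate-right helper):
//
//     rot = (32 - 8 * ((uintptr_t)dst & 3)) & 31;
//     aligned_pattern = ror32(pattern, rot);   // the ROR further below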
// Before we start the aligned stores, we do a single unaligned store of
// 16 bytes of the pattern to the start of the buffer. Since the buffer is
// at least 64 bytes long, this store is known to lie entirely inside the
// buffer:
// first aligned address in buffer
// v
// ---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---
// ... | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a | b | c | d | e | f | 0 | 1 | 2 | ...
// ---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---
// ^
// unaligned store starts here:
// [ 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 ]
vst1.8 {q0}, [r0]!
// Subsequent stores will be aligned, and will start at the first aligned
// address in the buffer. We apply the rotation that we calculated before
// the vector store (in the low five bits of ip) to get the pattern that
// is to be stored starting at the aligned location. For example, in the
// picture above, the buffer had alignment of 3 mod 4, so the rotation to
// be applied is 32 - 8*3 = 8. Rotating the pattern right by 8 bits gives
// us [ 1 2 3 0 ] (remember, we're little-endian), which we see is what
// needs to be stored starting at the first aligned location.
//
// Besides rotating the pattern, we also need to adjust the length (by
// subtracting 16 - alignment mod 16), and to advance the pointer to the
// first aligned location.
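// A note on the bookkeeping here (derived, not in the original comments):
// r3 held len - 64 on entry; after the ADD and SUB below it holds
// len + (addr%16) - 80, which is exactly "bytes from the first aligned
// store address to the end of the buffer, minus 64".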
ror r1, ip // Pattern to use for aligned memory
add r3, lr
bic r0, #0xf // destination for first aligned store
subs r3, #16 // updated length
blo 1f
// Splat the rotated value across q1 and q2
vdup.32 q1, r1
vmov q2, q1
// Main store loop. We write the splatted aligned pattern across 64 bytes
// per iteration, terminating the loop when the remaining length of the
// buffer is 64 bytes or less.
0: subs r3, #64
vst1.32 {q1,q2}, [r0,:128]!
vst1.32 {q1,q2}, [r0,:128]!
bhi 0b
// The remaining length of the buffer is 64 bytes or less (but the total
// length of the buffer is at least 64 bytes; otherwise we would have
// branched to the "short" path). Thus, we can handle the entirety of the
// remaining buffer with two 32-byte unaligned stores.
//
// Again, we need to rotate the pattern to match the alignment, this time
// of the final store address: the low bits of r3 match that address mod 4,
// so we rotate by 8*(r3%4). We also need to back up the destination
// pointer so that it points to precisely 64 bytes before the end of the
// buffer. We accomplish this by adding r3, which contains the remaining
// length of the buffer minus 64.
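// Whether we arrive here from the loop above or from the earlier blo,
// r3 lies in [-64, 0], so between 0 and 64 bytes remain. As a worked
// example (not from the original notes): if 58 bytes remain, r3 = -6;
// adding r3 backs the pointer up 6 bytes so that the two 32-byte stores
// below end exactly at the end of the buffer, and the low two bits of r3
// (-6 & 3 = 2) give the new pointer's alignment mod 4, so the pattern is
// rotated right by 8*2 = 16 bits.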
1: lsl ip, r3, #3
ror r1, ip
vdup.32 q8, r1
vmov q9, q8
add r0, r3
vst1.32 {q8,q9}, [r0]!
vst1.32 {q8,q9}, [r0]
pop {r7,pc}
L_short4:
// If we branch here, the buffer is less than 64 bytes long. At this point,
// register contents are as follows:
//
// r0 pointer to the buffer
// r1 pattern
// r2 buffer length
// q0 splatted pattern
//
// To begin, we store eight bytes at a time until the remaining length is
// less than eight bytes.
subs r3, r2, #8
blo 1f
0: subs r3, #8
vst1.32 {d0}, [r0]!
bhs 0b
// Then we store one byte at a time, rotating the pattern to get the next
// byte, until we reach the end of the buffer.
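// In C terms this tail is roughly (illustration only, using the ror32
// helper sketched earlier):
//
//     while (len--) { *dst++ = (uint8_t)pattern; pattern = ror32(pattern, 8); }
//
// i.e. emit the low byte, then rotate the next pattern byte into place.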
add r2, r3, #8
1: subs r2, #1
strbhs r1, [r0],#1
ror r1, #8
bhi 1b
pop {r7,pc}
/******************************************************************************/
.align 4
_memset_pattern8$VARIANT$Swift:
// The implementation of this function is substantially identical to that of
// memset_pattern4. The only differences are in how we rotate the pattern for
// the purposes of extracting the bytes to store. For clarity, only those
// differences are commented here; consult memset_pattern4 (above) for
// a detailed description of the algorithm used.
push {r7,lr}
mov r7, sp
vld1.8 {d0}, [r1]
vmov d1, d0
subs r3, r2, #64
blo L_short8
bic sp, #0xf // Align stack to 16 bytes and write 32 bytes
sub sp, #16 // of pattern to the stack. We will use
vst1.8 {q0}, [sp,:128] // unaligned loads from this scratch buffer
sub sp, #16 // to get rotated forms of the pattern.
vst1.8 {q0}, [sp,:128]
and ip, r0, #0x7 // Now generate an unaligned pointer to the
rsb ip, ip, #8 // rotated pattern that we need to use for
add ip, sp // aligned stores in the main loop.
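// At this point the scratch area holds four back-to-back copies of the
// 8-byte pattern (a sketch of the layout, for illustration):
//
//     sp                                  sp+32
//     v                                     v
//     [ p0..p7 | p0..p7 | p0..p7 | p0..p7 ]
//           ^
//           ip = sp + (8 - addr%8), offset in [1,8]
//
// A 16-byte load from ip yields the pattern pre-rotated for 16-byte-
// aligned stores; the cleanup code below adds at most 7 to ip, so every
// such load stays within the 32-byte scratch area.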
and lr, r0, #0xf
vst1.8 {q0}, [r0]!
add r3, lr
bic r0, #0xf
subs r3, #16
blo 1f
vld1.8 {q1}, [ip]
vmov q2, q1
0: subs r3, #64
vst1.32 {q1,q2}, [r0,:128]!
vst1.32 {q1,q2}, [r0,:128]!
bhi 0b
1: and lr, r3, #0x7 // Generate an unaligned pointer to the
add ip, lr // rotated pattern to use for cleanup.
vld1.8 {q8}, [ip]
vmov q9, q8
add r0, r3
vst1.32 {q8,q9}, [r0]!
vst1.32 {q8,q9}, [r0]
mov sp, r7 // Restore stack pointer
pop {r7,pc}
L_short8:
subs r2, #8
blo 1f
0: subs r2, #8
vst1.32 {d0}, [r0]!
bhs 0b
1: adds r2, #8
beq 3f
2: vst1.8 {d0[0]}, [r0]! // Store one byte from NEON
vext.8 d0, d0, d0, #1 // Use VEXT to rotate pattern
subs r2, #1
bhi 2b
3: pop {r7,pc}
/******************************************************************************/
.align 4
_memset_pattern16$VARIANT$Swift:
// The implementation of this function is substantially identical to that of
// memset_pattern4. The only differences are in how we rotate the pattern for
// the purposes of extracting the bytes to store. For clarity, only those
// differences are commented here; consult memset_pattern4 (above) for
// a detailed description of the algorithm used.
push {r7,lr}
mov r7, sp
vld1.8 {q0}, [r1]
subs r3, r2, #64
blo L_short16
bic sp, #0xf // Align stack to 16 bytes and write 48 bytes
sub sp, #16 // of pattern to the stack. We will use
vst1.8 {q0}, [sp,:128] // unaligned loads from this scratch buffer
sub sp, #16 // to get rotated forms of the pattern.
vst1.8 {q0}, [sp,:128]
sub sp, #16
vst1.8 {q0}, [sp,:128]
and lr, r0, #0xf // Now generate an unaligned pointer to the
rsb ip, lr, #16 // rotated pattern that we need to use for
add ip, sp // aligned stores in the main loop.
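// As in memset_pattern8, the scratch area holds back-to-back copies of
// the pattern, three of them here (48 bytes). The load offset below is
// in [1,16] and the cleanup adds at most 15 more, so a 16-byte load from
// at most sp+31 still ends inside the 48-byte scratch area.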
vst1.8 {q0}, [r0]!
add r3, lr
bic r0, #0xf
subs r3, #16
blo 1f
vld1.8 {q1}, [ip]
vmov q2, q1
0: subs r3, #64
vst1.32 {q1,q2}, [r0,:128]!
vst1.32 {q1,q2}, [r0,:128]!
bhi 0b
1: and lr, r3, #0xf // Generate an unaligned pointer to the
add ip, lr // rotated pattern to use for cleanup.
vld1.8 {q8}, [ip]
vmov q9, q8
add r0, r3
vst1.32 {q8,q9}, [r0]!
vst1.32 {q8,q9}, [r0]
mov sp, r7 // Restore stack pointer
pop {r7,pc}
L_short16:
subs r2, #16
blo 1f
0: subs r2, #16
vst1.32 {q0}, [r0]!
bhs 0b
1: adds r2, #16
beq 3f
2: vst1.8 {d0[0]}, [r0]! // Store one byte from NEON
vext.8 q0, q0, q0, #1 // Use VEXT to rotate pattern
subs r2, #1
bhi 2b
3: pop {r7,pc}
#endif // defined _ARM_ARCH_7 && !defined VARIANT_DYLD