/*
 * Copyright (c) 2011 Apple Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 * 
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 * 
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 * 
 * @APPLE_LICENSE_HEADER_END@
 *
 *  This file implements the following functions for the Swift micro-arch:
 *
 *  void memset_pattern4(void *b, const void *pattern4, size_t len);
 *  void memset_pattern8(void *b, const void *pattern8, size_t len);
 *  void memset_pattern16(void *b, const void *pattern16, size_t len);
 *
 *  The implementation of all three functions is fundamentally the same.
 *  memset_pattern4 is extensively commented to explain the algorithm;
 *  refer to it if you have any questions about the other two.
 */

#include <arm/arch.h>
#if defined _ARM_ARCH_7 && !defined VARIANT_DYLD

//  Assemble using the unified ARM/Thumb syntax, as 32-bit ARM instructions,
//  into the text section.
.syntax unified
.code 32
.text
//  Export the three entry points defined in this file.
.globl _memset_pattern4$VARIANT$Swift
.globl _memset_pattern8$VARIANT$Swift
.globl _memset_pattern16$VARIANT$Swift

/******************************************************************************/

.align 4
//  void memset_pattern4(void *b, const void *pattern4, size_t len)
//
//  Fills the len bytes starting at b with repeated copies of the four-byte
//  pattern read from pattern4.
//
//  On entry:   r0 = b, r1 = pattern4, r2 = len
//  Modifies:   r0-r3, ip, lr, q0-q2, q8, q9.  r7 is saved on entry and restored
//              on return; the saved lr is popped directly into pc.
_memset_pattern4$VARIANT$Swift:
    push    {r7,lr}
    mov     r7,         sp

//  Load the pattern and splat it to q0, then check if the buffer is at least
//  64 bytes long.  If not, branch to a short-buffer implementation.  r3 is set
//  to len - 64; the unsigned "lower" branch is taken when that subtraction
//  borrows, i.e. when len < 64.
    ldr     r1,        [r1]
    vdup.32 q0,         r1
    subs    r3,     r2, #64
    blo     L_short4

//  We want to use aligned vector stores to fill the bulk of the buffer.  In
//  order to make that work, we need to rotate the pattern as necessary to
//  match up with aligned locations, and we also need to extract the alignment
//  of the destination pointer mod 16.
    lsl     ip,     r0, #3
    and     lr,     r0, #0xf    //  alignment of destination pointer mod 16
    rsb     ip,     ip, #32     //  low five bits contain 32 - 8*(address%4).

//  Before we start the aligned stores, we do a single unaligned store of
//  16 bytes of the pattern to the start of the buffer.  Since the buffer is
//  at least 64 bytes long, this store is known to lie entirely inside the
//  buffer:
//                              first aligned address in buffer
//                                                            v
//   ---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---
//  ... | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a | b | c | d | e | f | 0 | 1 | 2 | ...
//   ---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---
//        ^
//        unaligned store starts here:
//      [ 0   1   2   3   0   1   2   3   0   1   2   3   0   1   2   3 ]
//
//  The post-increment leaves r0 pointing 16 bytes past the start of the buffer.
    vst1.8  {q0},      [r0]!

//  Subsequent stores will be aligned, and will start at the first aligned
//  address in the buffer.  We apply the rotation that we calculated before
//  the vector store (in the low five bits of ip) to get the pattern that
//  is to be stored starting at the aligned location.  For example, in the
//  picture above, the buffer had alignment of 3 mod 4, so the rotation to
//  be applied is 32 - 8*3 = 8.  Rotating the pattern right by 8 bits gives
//  us [ 1 2 3 0 ] (remember, we're little-endian), which we see is what
//  needs to be stored starting at the first aligned location.
//
//  Besides rotating the pattern, we also need to adjust the length (by
//  subtracting 16 - alignment mod 16), and to advance the pointer to the
//  first aligned location.
//
//  Since r0 was already advanced by 16, clearing its low four bits gives the
//  first 16-byte-aligned address after the start of the buffer.  After the
//  adjustment, r3 holds (bytes remaining from that aligned address) - 64.  If
//  the final subtraction borrows, fewer than 64 bytes remain, so we skip the
//  main loop and go straight to the cleanup code at 1:.
    ror     r1,         ip      //  Pattern to use for aligned memory
    add     r3,         lr
    bic     r0,         #0xf    //  destination for first aligned store
    subs    r3,         #16     //  updated length
    blo     1f

//  Splat the rotated value across q1 and q2
    vdup.32 q1,         r1
    vmov    q2,         q1

//  Main store loop.  We write the splatted aligned pattern across 64 bytes
//  per iteration, terminating the loop when the remaining length of the
//  buffer is 64 bytes or less.  The bhi at the bottom tests the flags set by
//  the subs at the top of the loop; r3 continues to hold the remaining length
//  minus 64.
0:  subs    r3,         #64
    vst1.32 {q1,q2}, [r0,:128]!
    vst1.32 {q1,q2}, [r0,:128]!
    bhi     0b

//  The remaining length of the buffer is 64 bytes or less (but the total
//  length of the buffer is at least 64 bytes; otherwise we would have
//  branched to the "short" path).  Thus, we can handle the entirety of the
//  remaining buffer with two 32-byte unaligned stores.
//
//  Again, we need to rotate the pattern to match the alignment, this time
//  by 8*(length%4), and we also need to back up the destination pointer
//  so that it points to precisely 64 bytes before the end of the buffer.
//  We accomplish this by adding r3, which contains the remaining length of
//  the buffer minus 64.  That value is zero or negative here, so the add
//  moves r0 backwards (or leaves it in place); the two stores may rewrite
//  bytes that were already filled, using the same values.
1:  lsl     ip,     r3, #3
    ror     r1,         ip
    vdup.32 q8,         r1
    vmov    q9,         q8
    add     r0,         r3
    vst1.32 {q8,q9},   [r0]!
    vst1.32 {q8,q9},   [r0]
    pop     {r7,pc}

L_short4:
//  If we branch here, the buffer is less than 64 bytes long.  At this point,
//  register contents are as follows:
//
//      r0      pointer to the buffer
//      r1      pattern
//      r2      buffer length
//      q0      splatted pattern
//
//  To begin, we store eight bytes at a time until the remaining length is
//  less than eight bytes.  During this loop r3 holds the remaining length
//  minus eight; if the buffer is shorter than eight bytes to begin with, we
//  skip directly to the byte loop with the length still in r2.
    subs    r3,     r2, #8
    blo     1f
0:  subs    r3,         #8
    vst1.32 {d0},      [r0]!
    bhs     0b

//  Then we store one byte at a time, rotating the pattern to get the next
//  byte, until we reach the end of the buffer.  The add recovers the number
//  of bytes still to be stored (0 through 7) from r3.  In the loop, the
//  conditional strbhs only stores if the subs did not borrow (that is, if at
//  least one byte was left), which makes a zero-length tail a no-op.  The ror
//  has no S suffix, so the bhi still tests the flags from the subs and loops
//  while more bytes remain.
    add     r2,     r3, #8
1:  subs    r2,         #1
    strbhs  r1,    [r0],#1
    ror     r1,         #8
    bhi     1b
    pop     {r7,pc}

/******************************************************************************/

.align 4
//  void memset_pattern8(void *b, const void *pattern8, size_t len)
//
//  Fills the len bytes starting at b with repeated copies of the eight-byte
//  pattern read from pattern8.
//
//  On entry:   r0 = b, r1 = pattern8, r2 = len
//  Modifies:   r0, r2, r3, ip, lr, q0-q2, q8, q9.  The long-buffer path also
//              writes 32 bytes of 16-byte-aligned scratch data below the
//              incoming stack pointer; sp is restored from r7 before returning.
_memset_pattern8$VARIANT$Swift:
//  The implementation of this function is substantially identical to that of
//  memset_pattern4.  The only differences are in how we rotate the pattern for
//  the purposes of extracting the bytes to store.  For clarity, only those
//  differences are commented here; consult memset_pattern4 (above) for
//  a detailed description of the algorithm used.
    push    {r7,lr}
    mov     r7,         sp
//  Load the eight-byte pattern into d0 and duplicate it into d1, so that q0
//  holds two copies.  Buffers shorter than 64 bytes take the short path.
    vld1.8  {d0},      [r1]
    vmov    d1,         d0
    subs    r3,     r2, #64
    blo     L_short8

//  The scratch buffer holds four back-to-back copies of the pattern, so a
//  16-byte load starting k bytes into it yields the pattern rotated by k
//  bytes.  For the main loop the offset is 8 - (b mod 8), which is the
//  rotation needed at the first 16-byte-aligned address after the start of
//  the buffer.
    bic     sp,         #0xf    //  Align stack to 16 bytes and write 32 bytes
    sub     sp,         #16     //  of pattern to the stack.  We will use
    vst1.8  {q0},    [sp,:128]  //  unaligned loads from this scratch buffer
    sub     sp,         #16     //  to get rotated forms of the pattern.
    vst1.8  {q0},    [sp,:128]
    and     ip,     r0, #0x7    //  Now generate an unaligned pointer to the
    rsb     ip,     ip, #8      //  rotated pattern that we need to use for
    add     ip,         sp      //  aligned stores in the main loop.
//  As in memset_pattern4: one unaligned 16-byte store at the start of the
//  buffer, then round r0 to the next aligned address and set r3 to the number
//  of bytes remaining from there, minus 64.
    and     lr,     r0, #0xf
    vst1.8  {q0},      [r0]!
    add     r3,         lr
    bic     r0,         #0xf
    subs    r3,         #16
    blo     1f
    vld1.8  {q1},      [ip]
    vmov    q2,         q1
//  Main loop: 64 bytes of aligned stores per iteration, while more than 64
//  bytes remain.
0:  subs    r3,         #64
    vst1.32 {q1,q2}, [r0,:128]!
    vst1.32 {q1,q2}, [r0,:128]!
    bhi     0b
//  Cleanup: r3 (remaining length minus 64) is zero or negative here.  Adding it
//  to r0 backs the pointer up to 64 bytes before the end of the buffer, and its
//  low three bits give the extra offset into the scratch buffer needed to
//  obtain the matching rotation of the pattern.
1:  and     lr,     r3, #0x7    //  Generate an unaligned pointer to the
    add     ip,         lr      //  rotated pattern to use for cleanup.
    vld1.8  {q8},      [ip]
    vmov    q9,         q8
    add     r0,         r3
    vst1.32 {q8,q9},   [r0]!
    vst1.32 {q8,q9},   [r0]
    mov     sp,         r7      //  Restore stack pointer
    pop     {r7,pc}

L_short8:
//  Short path (len < 64): r0 = destination, r2 = len, d0 = pattern.  Store eight
//  bytes at a time while at least eight bytes remain; inside that loop r2 holds
//  the remaining length minus eight.
    subs    r2,         #8
    blo     1f
0:  subs    r2,         #8
    vst1.32 {d0},      [r0]!
    bhs     0b
//  Recover the number of leftover bytes (0 through 7) and return if there are
//  none.  Otherwise store them one at a time, rotating d0 by one byte after
//  each store so that lane 0 always holds the next byte of the pattern.
1:  adds    r2,         #8
    beq     3f
2:  vst1.8  {d0[0]},   [r0]!    //  Store one byte from NEON
    vext.8  d0,     d0, d0, #1  //  Use VEXT to rotate pattern
    subs    r2,         #1
    bhi     2b
3:  pop     {r7,pc}

/******************************************************************************/

.align 4
//  void memset_pattern16(void *b, const void *pattern16, size_t len)
//
//  Fills the len bytes starting at b with repeated copies of the sixteen-byte
//  pattern read from pattern16.
//
//  On entry:   r0 = b, r1 = pattern16, r2 = len
//  Modifies:   r0, r2, r3, ip, lr, q0-q2, q8, q9.  The long-buffer path also
//              writes 48 bytes of 16-byte-aligned scratch data below the
//              incoming stack pointer; sp is restored from r7 before returning.
_memset_pattern16$VARIANT$Swift:
//  The implementation of this function is substantially identical to that of
//  memset_pattern4.  The only differences are in how we rotate the pattern for
//  the purposes of extracting the bytes to store.  For clarity, only those
//  differences are commented here; consult memset_pattern4 (above) for
//  a detailed description of the algorithm used.
    push    {r7,lr}
    mov     r7,         sp
//  Load the sixteen-byte pattern into q0.  Buffers shorter than 64 bytes take
//  the short path.
    vld1.8  {q0},      [r1]
    subs    r3,     r2, #64
    blo     L_short16

//  The scratch buffer holds three back-to-back copies of the pattern, so a
//  16-byte load starting k bytes into it yields the pattern rotated by k
//  bytes.  For the main loop the offset is 16 - (b mod 16), which is the
//  rotation needed at the first 16-byte-aligned address after the start of
//  the buffer.
    bic     sp,         #0xf    //  Align stack to 16 bytes and write 48 bytes
    sub     sp,         #16     //  of pattern to the stack.  We will use
    vst1.8  {q0},    [sp,:128]  //  unaligned loads from this scratch buffer
    sub     sp,         #16     //  to get rotated forms of the pattern.
    vst1.8  {q0},    [sp,:128]
    sub     sp,         #16   
    vst1.8  {q0},    [sp,:128]
    and     lr,     r0, #0xf    //  Now generate an unaligned pointer to the
    rsb     ip,     lr, #16     //  rotated pattern that we need to use for
    add     ip,         sp      //  aligned stores in the main loop.
//  As in memset_pattern4: one unaligned 16-byte store at the start of the
//  buffer, then round r0 to the next aligned address and set r3 to the number
//  of bytes remaining from there, minus 64.
    vst1.8  {q0},      [r0]!
    add     r3,         lr
    bic     r0,         #0xf
    subs    r3,         #16
    blo     1f
    vld1.8  {q1},      [ip]
    vmov    q2,         q1
//  Main loop: 64 bytes of aligned stores per iteration, while more than 64
//  bytes remain.
0:  subs    r3,         #64
    vst1.32 {q1,q2}, [r0,:128]!
    vst1.32 {q1,q2}, [r0,:128]!
    bhi     0b
//  Cleanup: r3 (remaining length minus 64) is zero or negative here.  Adding it
//  to r0 backs the pointer up to 64 bytes before the end of the buffer, and its
//  low four bits give the extra offset into the scratch buffer needed to
//  obtain the matching rotation of the pattern.
1:  and     lr,     r3, #0xf    //  Generate an unaligned pointer to the
    add     ip,         lr      //  rotated pattern to use for cleanup.
    vld1.8  {q8},      [ip]
    vmov    q9,         q8
    add     r0,         r3
    vst1.32 {q8,q9},   [r0]!
    vst1.32 {q8,q9},   [r0]
    mov     sp,         r7      //  Restore stack pointer
    pop     {r7,pc}

L_short16:
//  Short path (len < 64): r0 = destination, r2 = len, q0 = pattern.  Store
//  sixteen bytes at a time while at least sixteen bytes remain; inside that
//  loop r2 holds the remaining length minus sixteen.
    subs    r2,         #16
    blo     1f
0:  subs    r2,         #16
    vst1.32 {q0},      [r0]!
    bhs     0b
//  Recover the number of leftover bytes (0 through 15) and return if there are
//  none.  Otherwise store them one at a time, rotating q0 by one byte after
//  each store so that lane 0 of d0 always holds the next byte of the pattern.
1:  adds    r2,         #16
    beq     3f
2:  vst1.8  {d0[0]},   [r0]!    //  Store one byte from NEON
    vext.8  q0,     q0, q0, #1  //  Use VEXT to rotate pattern
    subs    r2,         #1
    bhi     2b
3:  pop     {r7,pc}

#endif // defined _ARM_ARCH_7 && !defined VARIANT_DYLD