bzero_Swift.s   [plain text]


/*
 * Copyright (c) 2011 Apple Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 * 
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 * 
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 * 
 * @APPLE_LICENSE_HEADER_END@
 *
 *  This file implements the following functions for the Swift micro-arch:
 *
 *  void bzero(void * destination,
 *             size_t length);
 *
 *  void __bzero(void * destination,
 *               size_t length);
 *
 * zeros out a buffer length bytes long, beginning at the address destination.
 *
 *  void *memset(void * destination,
 *               int value,
 *               size_t n);
 *
 * writes value converted to an unsigned char to n successive bytes, beginning
 * at destination.
 */

#include <arm/arch.h>
#if defined _ARM_ARCH_7

//  Assembler setup: unified (UAL) syntax, 32-bit ARM encoding.
.syntax unified
.code 32
//  Export the three entry points defined below.  The two bzero names label
//  the same address; memset is a second entry a few instructions later.
.globl ___bzero$VARIANT$Swift
.globl _bzero$VARIANT$Swift
.globl _memset$VARIANT$Swift

//  Place the routine in the text section.
//  NOTE(review): .align 4 is presumably a power-of-two exponent here (i.e.
//  16-byte alignment), as is usual for ARM assemblers; confirm for the
//  toolchain in use.
.text
.align 4
/*
 *  bzero / __bzero / memset for Swift.
 *
 *  Entry points:
 *    ___bzero, _bzero : r0 = destination, r1 = length
 *    _memset          : r0 = destination, r1 = value, r2 = length
 *
 *  The bzero entries rearrange their arguments into the memset layout
 *  (length in r2, value 0 in r1) and fall through into memset.
 *
 *  Register roles after the prologue:
 *    r0      original destination; never written, so it is still the
 *            destination pointer when the routine returns
 *    r1      fill value (only the low byte is used)
 *    r2      length in bytes (counted down only in the byte loop)
 *    r3      biased count: bytes remaining from ip, minus the number of
 *            bytes stored per loop iteration on the current path (64 or 8)
 *    ip      running store pointer
 *    lr      scratch once saved by the prologue
 *    q0, q1  fill byte replicated into every lane (d0 is the low half of q0;
 *            q1 is only initialized on the length >= 64 paths)
 *
 *  May modify r1 (bzero entries), r2 (byte loop), r3, ip, lr, q0, q1 and
 *  the condition flags.  r7 is saved and restored.
 *
 *  Common technique on every vector path: loop while MORE than one full
 *  store-width remains, then step ip backwards so that one final full-width
 *  store ends exactly at the end of the buffer.  The final store may overlap
 *  bytes that were already written, which is harmless because every byte
 *  gets the same value.
 */
___bzero$VARIANT$Swift:
_bzero$VARIANT$Swift:
    mov     r2,         r1      //  r2 = length (memset argument layout).
    eor     r1,         r1      //  r1 = 0, the fill value for bzero.
_memset$VARIANT$Swift:
    push    {r7,lr}             //  Save r7 and the return address.
    mov     r7,         sp      //  Establish a frame.
    mov     ip,         r0      //  Work on a copy so r0 stays untouched.

    vdup.8  q0,         r1      //  Splat the low byte of value across q0.
    subs    r3,     r2, #64     //  r3 = length - 64; borrow means length < 64,
    blo     L_lengthLessThan64  //  which is handled by the small-buffer path.

    vmov    q1,         q0      //  q0:q1 now hold 32 bytes of fill.
    orr     lr,     r2, r0      //  Low four bits of (length | pointer) are
    ands    lr,         #0xf    //  zero only if BOTH are multiples of 16.
    bne     L_edgingNeeded      //  Otherwise take the edging path.

//  Aligned path: pointer and length are both multiples of 16.
//  At the top of each iteration r3 = (bytes remaining from ip) - 64 >= 0.
//  The stores do not touch the flags, so bhi tests the result of the subs.
0:  subs    r3,         #64     //  Account for the 64 bytes stored below.
    vst1.8  {q0,q1}, [ip,:128]! //  Store 32 bytes, advance ip.
    vst1.8  {q0,q1}, [ip,:128]! //  Store 32 more bytes, advance ip.
    bhi     0b                  //  Loop while more than 64 bytes remain.

//  Here -64 <= r3 <= 0 and between 0 and 64 bytes remain.  Length is a
//  multiple of 16, so ip + r3 is still 16-byte aligned.
    add     ip,         r3      //  Step back by 64 - remaining bytes.
    vst1.8  {q0,q1}, [ip,:128]! //  Final 64 bytes, ending exactly at the
    vst1.8  {q0,q1}, [ip,:128]  //  end of the buffer.
    pop     {r7,pc}             //  Restore r7 and return.

//  Edging path: length >= 64, but pointer or length is not a multiple of 16.
//  One 16-byte store covers the head of the buffer, then ip is moved up to
//  the next 16-byte boundary strictly above the original pointer.  The first
//  aligned store may overlap the head store; the original author notes this
//  is the fastest way to get alignment on Swift.
L_edgingNeeded:
    vst1.8  {q0},      [ip]     //  16 bytes at the possibly unaligned start.
    and     lr,     ip, #0xf    //  lr = misalignment of the pointer (0..15).
    bic     ip,         #0xf    //  Round ip down to a 16-byte boundary ...
    add     r3,         lr      //  (r3 adjustment, part 1: + misalignment)
    add     ip,         #16     //  ... then up to the next boundary.
    subs    r3,         #16     //  r3 = (remaining from new ip) - 64.
    blo     1f                  //  Fewer than 64 left: go straight to tail.
    nop                         //  Padding.  NOTE(review): with 4-byte ARM
                                //  instructions and a 16-byte aligned entry
                                //  this puts the loop head below on a 16-byte
                                //  boundary; presumably intentional - confirm.

//  Same loop as the aligned path; ip is 16-byte aligned here.
0:  subs    r3,         #64     //  Account for the 64 bytes stored below.
    vst1.8  {q0,q1}, [ip,:128]! //  Store 32 bytes, advance ip.
    vst1.8  {q0,q1}, [ip,:128]! //  Store 32 more bytes, advance ip.
    bhi     0b                  //  Loop while more than 64 bytes remain.

//  Tail: ip + r3 is (end of buffer - 64), which need not be aligned, so
//  these stores carry no alignment hint.
1:  add     ip,         r3      //  Step back by 64 - remaining bytes.
    vst1.8  {q0,q1},   [ip]!    //  Final 64 bytes, ending exactly at the
    vst1.8  {q0,q1},   [ip]     //  end of the buffer.
    pop     {r7,pc}             //  Restore r7 and return.

//  Small-buffer path: length < 64.  ip still equals r0, q0 holds the fill.
L_lengthLessThan64:
    subs    r3,     r2, #8      //  r3 = length - 8; borrow means length < 8,
    blo     1f                  //  which is handled by the byte loop.

//  At the top of each iteration r3 = (bytes remaining from ip) - 8 >= 0.
0:  subs    r3,         #8      //  Account for the 8 bytes stored below.
    vst1.8  {d0},      [ip]!    //  Store 8 bytes, advance ip.
    bhi     0b                  //  Loop while more than 8 bytes remain.

    add     ip,         r3      //  Step back by 8 - remaining bytes.
    vst1.8  {d0},      [ip]     //  Final 8 bytes, ending at the buffer end.
    pop     {r7,pc}             //  Restore r7 and return.

//  Byte loop: length < 8 (including 0).  r2 counts down the bytes left.
1:  subs    r2,         #1      //  Borrow here means no bytes were left.
    strbhs  r1,        [ip], #1 //  No borrow: store one byte, advance ip.
    bhi     1b                  //  Loop while bytes remain after this one.
    pop     {r7,pc}             //  Restore r7 and return.

#endif // defined _ARM_ARCH_7